├── .gitignore ├── Cargo.toml ├── .github └── workflows │ ├── rust.yml │ └── greetings.yml ├── little_raft ├── Cargo.toml ├── src │ ├── timer.rs │ ├── lib.rs │ ├── cluster.rs │ ├── message.rs │ ├── state_machine.rs │ └── replica.rs └── tests │ ├── raft_stable.rs │ └── raft_unstable.rs ├── LICENSE ├── README.md └── Cargo.lock /.gitignore: -------------------------------------------------------------------------------- 1 | /target 2 | -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [workspace] 2 | members = [ 3 | "little_raft" 4 | ] -------------------------------------------------------------------------------- /.github/workflows/rust.yml: -------------------------------------------------------------------------------- 1 | name: Rust 2 | 3 | on: 4 | push: 5 | branches: [ master ] 6 | pull_request: 7 | branches: [ master ] 8 | 9 | env: 10 | CARGO_TERM_COLOR: always 11 | 12 | jobs: 13 | build: 14 | 15 | runs-on: ubuntu-latest 16 | 17 | steps: 18 | - uses: actions/checkout@v2 19 | - name: Build 20 | run: cargo build --verbose 21 | - name: Run tests 22 | run: cargo test --verbose 23 | 24 | -------------------------------------------------------------------------------- /.github/workflows/greetings.yml: -------------------------------------------------------------------------------- 1 | name: Greetings 2 | 3 | on: [pull_request, issues] 4 | 5 | jobs: 6 | greeting: 7 | runs-on: ubuntu-latest 8 | permissions: 9 | issues: write 10 | pull-requests: write 11 | steps: 12 | - uses: actions/first-interaction@v1 13 | with: 14 | repo-token: ${{ secrets.GITHUB_TOKEN }} 15 | issue-message: 'Welcome to Little Raft! Thanks for contributing.' 16 | pr-message: 'Congratulations on opening your first PR to Little Raft! Welcome.' 
17 | -------------------------------------------------------------------------------- /little_raft/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | description = "The lightest distributed consensus library. Run your own replicated state machine!" 3 | name = "little_raft" 4 | version = "0.2.0" 5 | authors = ["Ilya Andreev "] 6 | edition = "2018" 7 | license = "MIT" 8 | homepage = "https://github.com/andreev-io/little-raft" 9 | repository = "https://github.com/andreev-io/little-raft" 10 | readme = "../README.md" 11 | keywords = ["distributed-systems", "raft", "consensus"] 12 | categories = ["concurrency", "database", "database-implementations"] 13 | 14 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html 15 | 16 | [dependencies] 17 | rand = "0.8.3" 18 | crossbeam-channel = "0.5.1" 19 | crossbeam = "0.8.0" 20 | timer = "0.1.3" 21 | time = "0.1.39" 22 | bytes = "0.4.7" 23 | -------------------------------------------------------------------------------- /little_raft/src/timer.rs: -------------------------------------------------------------------------------- 1 | use crossbeam::channel::{bounded, Receiver}; 2 | use std::{thread, time::Duration}; 3 | 4 | pub struct Timer { 5 | rx: Receiver<()>, 6 | timeout: Duration, 7 | } 8 | 9 | // Timer fires after the specified duration. The timer can be renewed. 
10 | impl Timer { 11 | pub fn new(timeout: Duration) -> Timer { 12 | Timer { 13 | timeout, 14 | rx: Timer::get_timeout_channel(timeout), 15 | } 16 | } 17 | 18 | pub fn renew(&mut self) { 19 | self.rx = Timer::get_timeout_channel(self.timeout); 20 | } 21 | 22 | pub fn get_rx(&self) -> &Receiver<()> { 23 | &self.rx 24 | } 25 | 26 | fn get_timeout_channel(timeout: Duration) -> Receiver<()> { 27 | let (tx, rx) = bounded(1); 28 | thread::spawn(move || { 29 | thread::sleep(timeout); 30 | let _ = tx.send(()); 31 | }); 32 | 33 | rx 34 | } 35 | } 36 | -------------------------------------------------------------------------------- /little_raft/src/lib.rs: -------------------------------------------------------------------------------- 1 | //! This crate is a small but full-featured implementation of the Raft 2 | //! distributed consensus protocol. By using this library, you can run a 3 | //! replicated state machine in your own cluster. The cluster could be comprised 4 | //! of dozens of physical servers in different parts of the world or of two 5 | //! threads on a single CPU. 6 | //! 7 | //! The goal of this library is to provide a generic implementation of the 8 | //! algorithm that the library user can leverage in their own way. It is 9 | //! entirely up to the user how to configure the Raft cluster, how to ensure 10 | //! communication between the nodes, how to process client's messages, how to do 11 | //! service discovery, and what kind of state machine to replicate. 12 | //! 13 | //! The implementation is kept as simple as possible on purpose, with the entire 14 | //! library code base fitting in under 1,000 lines of code. 
15 | pub mod cluster; 16 | pub mod message; 17 | pub mod replica; 18 | pub mod state_machine; 19 | mod timer; 20 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 Ilia (Ilya) Andreev 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /little_raft/src/cluster.rs: -------------------------------------------------------------------------------- 1 | use crate::{message::Message, replica::ReplicaID, state_machine::{StateMachineTransition}}; 2 | 3 | /// Cluster is used for the local Raft Replica to communicate with the rest of 4 | /// the Raft cluster. It is up to the user how to abstract that communication. 
5 | /// The Cluster trait also contains hooks which the Replica will use to inform 6 | /// the crate user of state changes. 7 | pub trait Cluster 8 | where 9 | T: StateMachineTransition, 10 | D: Clone, 11 | { 12 | /// This function is used to deliver messages to target Replicas. The 13 | /// Replica will provide the to_id of the other Replica it's trying to send 14 | /// its message to and provide the message itself. The send_message 15 | /// implementation must not block but is allowed to silently fail -- Raft 16 | /// exists to achieve consensus in spite of failures, after all. 17 | fn send_message(&mut self, to_id: usize, message: Message); 18 | 19 | /// This function is used by the Replica to receive pending messages from 20 | /// the cluster. The receive_messages implementation must not block and must 21 | /// not return the same message more than once. Note that receive_messages 22 | /// is only called when the Replica is notified via the recv_msg channel. 23 | fn receive_messages(&mut self) -> Vec>; 24 | 25 | /// By returning true from halt you can signal to the Replica that it should 26 | /// stop running. 27 | fn halt(&self) -> bool; 28 | 29 | /// This function is a hook that the Replica uses to inform the user of the 30 | /// Leader change. The leader_id is an Option because the Leader 31 | /// might be unknown for a period of time. Remember that only Leaders can 32 | /// process transitions submitted by the Raft users, so the leader_id can be 33 | /// used to redirect the requests from non-Leader nodes to the Leader node. 
34 | fn register_leader(&mut self, leader_id: Option); 35 | } 36 | -------------------------------------------------------------------------------- /little_raft/src/message.rs: -------------------------------------------------------------------------------- 1 | use crate::replica::ReplicaID; 2 | use crate::state_machine::{StateMachineTransition}; 3 | 4 | /// LogEntry is a state machine transition along with some metadata needed for 5 | /// Raft. 6 | #[derive(Clone, Debug, PartialEq, Eq, PartialOrd)] 7 | pub struct LogEntry 8 | where 9 | T: StateMachineTransition, 10 | { 11 | pub transition: T, 12 | pub index: usize, 13 | pub term: usize, 14 | } 15 | 16 | /// Message describes messages that the replicas pass between each other to 17 | /// achieve consensus on the distributed state machine. 18 | #[derive(Debug, Clone, PartialEq, Eq, PartialOrd)] 19 | pub enum Message 20 | where 21 | T: StateMachineTransition, 22 | D: Clone, 23 | { 24 | /// AppendEntryRequest is used by the Leader to send out logs for other 25 | /// replicas to append to their log. It also has information on what logs 26 | /// are ready to be applied to the state machine. AppendEntryRequest is also 27 | /// used as a heartbeat message by the Leader even when no new logs need to 28 | /// be processed. 29 | AppendEntryRequest { 30 | from_id: ReplicaID, 31 | term: usize, 32 | prev_log_index: usize, 33 | prev_log_term: usize, 34 | entries: Vec>, 35 | commit_index: usize, 36 | }, 37 | 38 | /// AppendEntryResponse is used by replicas to respond to AppendEntryRequest 39 | /// messages. 40 | AppendEntryResponse { 41 | from_id: ReplicaID, 42 | term: usize, 43 | success: bool, 44 | last_index: usize, 45 | mismatch_index: Option, 46 | }, 47 | 48 | /// VoteRequest is used by Candidates to solicit votes for themselves. 
49 | VoteRequest { 50 | from_id: ReplicaID, 51 | term: usize, 52 | last_log_index: usize, 53 | last_log_term: usize, 54 | }, 55 | 56 | /// VoteResponse is used by replicas to respond to VoteRequest messages. 57 | VoteResponse { 58 | from_id: ReplicaID, 59 | term: usize, 60 | vote_granted: bool, 61 | }, 62 | 63 | InstallSnapshotRequest { 64 | from_id: ReplicaID, 65 | term: usize, 66 | last_included_index: usize, 67 | last_included_term: usize, 68 | offset: usize, 69 | data: D, 70 | done: bool, 71 | }, 72 | 73 | InstallSnapshotResponse { 74 | from_id: ReplicaID, 75 | term: usize, 76 | last_included_index: usize, 77 | }, 78 | } 79 | -------------------------------------------------------------------------------- /little_raft/src/state_machine.rs: -------------------------------------------------------------------------------- 1 | use std::fmt::Debug; 2 | 3 | /// TransitionState describes the state of a particular transition. 4 | #[derive(Clone, Debug, PartialEq)] 5 | pub enum TransitionState { 6 | /// Queued transitions have been received from the user but have not been 7 | /// processed yet. They are in the queue. 8 | Queued, 9 | 10 | /// Committed transitions have not yet been applied to the state machine but 11 | /// have already been replicated across the cluster such that they are 12 | /// guaranteed to be present in the log of all future cluster leaders. 13 | Committed, 14 | 15 | /// Applied transitions have been replicated across the cluster and have 16 | /// been applied to the local state machine. 17 | Applied, 18 | 19 | /// Abandoned transitions have been ignored by the replica. 20 | Abandoned(TransitionAbandonedReason), 21 | } 22 | 23 | /// TransitionAbandonedReason explains why a particular transition has been 24 | /// abandoned by the replica. 25 | #[derive(Clone, Debug, PartialEq)] 26 | pub enum TransitionAbandonedReason { 27 | /// NotLeader transitions have been abandoned because the replica is not the 28 | /// cluster leader. 
29 | NotLeader, 30 | 31 | // ConflictWithLeader uncommitted transitions are abandoned because they 32 | // don't match the consensus achieved by the majority of the cluster. 33 | ConflictWithLeader, 34 | } 35 | 36 | /// StateMachineTransition describes a user-defined transition that can be 37 | /// applied to the state machine replicated by Raft. 38 | pub trait StateMachineTransition: Clone + Debug { 39 | /// TransitionID is used to identify the transition. 40 | type TransitionID: Eq; 41 | 42 | /// get_id is used by the Replica to identify the transition to be able to 43 | /// call register_transition_state. 44 | fn get_id(&self) -> Self::TransitionID; 45 | } 46 | 47 | /// Snapshot is an object used for log compaction. The user can use snapshots to 48 | /// represent StateMachine state at a particular point. This will let the 49 | /// Replica start from a saved state or perform log compaction before the log 50 | /// sequence starts taking up too much memory. 51 | #[derive(Clone)] 52 | pub struct Snapshot where D: Clone { 53 | pub last_included_index: usize, 54 | pub last_included_term: usize, 55 | pub data: D, 56 | } 57 | 58 | /// StateMachine describes a user-defined state machine that is replicated 59 | /// across the cluster. Raft can replicate whatever distributed state machine 60 | /// can implement this trait. 61 | pub trait StateMachine 62 | where 63 | T: StateMachineTransition, 64 | D: Clone, 65 | { 66 | /// This is a hook that the local Replica will call each time the state of a 67 | /// particular transition changes. It is up to the user what to do with that 68 | /// information. 69 | fn register_transition_state(&mut self, transition_id: T::TransitionID, state: TransitionState); 70 | 71 | /// When a particular transition is ready to be applied, the Replica will 72 | /// call apply_transition to apply said transition to the local state 73 | /// machine. 
74 | fn apply_transition(&mut self, transition: T); 75 | 76 | /// This function is used to receive transitions from the user that need to 77 | /// be applied to the replicated state machine. Note that only the Leader 78 | /// Replica processes transitions and only when notified via the 79 | /// recv_transition channel. All other Replicas poll for transitions and 80 | /// discard them. get_pending_transitions must not return the same 81 | /// transition twice. 82 | fn get_pending_transitions(&mut self) -> Vec; 83 | 84 | /// Replica calls get_snapshot once upon startup. If the Replica and the 85 | /// associated StateMachine should start from a certain checkpoint 86 | /// previously saved with a call to create_snapshot or set_snapshot, this 87 | /// function should return Some(snapshot). Otherwise it can return None. If 88 | /// None is returned, the Replica can still recover its state from other 89 | /// nodes in the cluster, but it might take longer to do so than if it 90 | /// recovered from a previously snapshotted value. 91 | /// 92 | /// Little Raft will take care of loading the Snapshot into the Replica and 93 | /// achieving consensus provided snapshot.last_included_index and 94 | /// snapshot.last_included_term are truthful. However, it is up to the user 95 | /// to put the StateMachine into the right state before returning from 96 | /// load_snapshot(). 97 | fn get_snapshot(&mut self) -> Option>; 98 | 99 | /// create_snapshot is periodically called by the Replica if log compaction 100 | /// is enabled by setting snapshot_delta > 0. The implementation MUST create 101 | /// a snapshot object with truthful values of index and term. 102 | /// 103 | /// If the Replica should use this snapshot as a checkpoint upon restart, 104 | /// the implementation MUST save the created snapshot object to permanent 105 | /// storage and return it with get_snapshot after restart. 
106 | fn create_snapshot( 107 | &mut self, 108 | last_included_index: usize, 109 | last_included_term: usize, 110 | ) -> Snapshot; 111 | 112 | /// When a Replica receives a snapshot from another Replica, set_snapshot is 113 | /// called. The StateMachine MUST then load its state from the provided 114 | /// snapshot and potentially save said snapshot to persistent storage, same 115 | /// way it is done in create_snapshot. 116 | fn set_snapshot(&mut self, snapshot: Snapshot); 117 | } 118 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Little Raft 2 | The lightest distributed consensus library. Run your own replicated state machine! :heart: 3 | 4 | ## Installing 5 | Simply import the crate. In your `Cargo.toml`, add 6 | ``` 7 | [dependencies] 8 | little_raft = "0.2.0" 9 | ``` 10 | 11 | ## Using 12 | To start running Little Raft, you only need to do three things. 13 | 1. Implement the StateMachine that you want your cluster to maintain. Little Raft will take care of replicating this machine across the cluster and achieving consensus on its state. 14 | ```rust 15 | /// StateMachine describes a user-defined state machine that is replicated 16 | /// across the cluster. Raft can Replica whatever distributed state machine can 17 | /// implement this trait. 18 | pub trait StateMachine 19 | where 20 | T: StateMachineTransition, 21 | { 22 | /// This is a hook that the local Replica will call each time the state of a 23 | /// particular transition changes. It is up to the user what to do with that 24 | /// information. 25 | fn register_transition_state(&mut self, transition_id: T::TransitionID, state: TransitionState); 26 | 27 | /// When a particular transition is ready to be applied, the Replica will 28 | /// call apply_transition to apply said transition to the local state 29 | /// machine. 
30 | fn apply_transition(&mut self, transition: T); 31 | 32 | /// This function is used to receive transitions from the user that need to 33 | /// be applied to the replicated state machine. Note that while all Replicas 34 | /// poll get_pending_transitions periodically, only the Leader Replica 35 | /// actually processes them. All other Replicas discard pending transitions. 36 | /// get_pending_transitions must not return the same transition twice. 37 | fn get_pending_transitions(&mut self) -> Vec; 38 | } 39 | ``` 40 | 41 | 2. Implement the Cluster abstraction so that the local Replica can communicate with other nodes. 42 | ```rust 43 | /// Cluster is used for the local Raft Replica to communicate with the rest of 44 | /// the Raft cluster. It is up to the user how to abstract that communication. 45 | /// The Cluster trait also contains hooks which the Replica will use to inform 46 | /// the crate user of state changes. 47 | pub trait Cluster 48 | where 49 | T: StateMachineTransition, 50 | { 51 | /// This function is used to deliver messages to target Replicas. The 52 | /// Replica will provide the to_id of the other Replica it's trying to send 53 | /// its message to and provide the message itself. The send_message 54 | /// implementation must not block but is allowed silently fail -- Raft 55 | /// exists to achieve consensus in spite of failures, after all. 56 | fn send_message(&mut self, to_id: usize, message: Message); 57 | 58 | /// This function is used by the Replica to receive pending messages from 59 | /// the cluster. The receive_messages implementation must not block and must 60 | /// not return the same message more than once. 61 | fn receive_messages(&mut self) -> Vec>; 62 | 63 | /// By returning true from halt you can signal to the Replica that it should 64 | /// stop running. 65 | fn halt(&self) -> bool; 66 | 67 | /// This function is a hook that the Replica uses to inform the user of the 68 | /// Leader change. 
The leader_id is an Option because the Leader 69 | /// might be unknown for a period of time. Remember that only Leaders can 70 | /// process transitions submitted by the Raft users, so the leader_id can be 71 | /// used to redirect the requests from non-Leader nodes to the Leader node. 72 | fn register_leader(&mut self, leader_id: Option); 73 | } 74 | ``` 75 | 3. Start your replica! 76 | ```rust 77 | /// Create a new Replica. 78 | /// 79 | /// id is the ID of this Replica within the cluster. 80 | /// 81 | /// peer_ids is a vector of IDs of all other Replicas in the cluster. 82 | /// 83 | /// cluster represents the abstraction the Replica uses to talk with other 84 | /// Replicas. 85 | /// 86 | /// state_machine is the state machine that Raft maintains. 87 | /// 88 | /// noop_transition is a transition that can be applied to the state machine 89 | /// multiple times with no effect. 90 | /// 91 | /// heartbeat_timeout defines how often the Leader Replica sends out 92 | /// heartbeat messages. 93 | /// 94 | /// election_timeout_range defines the election timeout interval. If the 95 | /// Replica gets no messages from the Leader before the timeout, it 96 | /// initiates an election. 97 | /// 98 | /// In practice, pick election_timeout_range to be 2-3x the value of 99 | /// heartbeat_timeout, depending on your particular use-case network latency 100 | /// and responsiveness needs. An election_timeout_range / heartbeat_timeout 101 | /// ratio that's too low might cause unwarranted re-elections in the 102 | /// cluster. 103 | pub fn new( 104 | id: ReplicaID, 105 | peer_ids: Vec, 106 | cluster: Arc>, 107 | state_machine: Arc>, 108 | noop_transition: T, 109 | heartbeat_timeout: Duration, 110 | election_timeout_range: (Duration, Duration), 111 | ) -> Replica; 112 | 113 | /// This function starts the Replica and blocks forever. 114 | /// 115 | /// recv_msg is a channel on which the user must notify the Replica whenever 116 | /// new messages from the Cluster are available. 
The Replica will not poll 117 | /// for messages from the Cluster unless notified through recv_msg. 118 | /// 119 | /// recv_transition is a channel on which the user must notify the Replica 120 | /// whenever new transitions to be processed for the StateMachine are 121 | /// available. The Replica will not poll for pending transitions for the 122 | /// StateMachine unless notified through recv_transition. 123 | pub fn start(&mut self, recv_msg: Receiver<()>, recv_transition: Receiver<()>); 124 | ``` 125 | 126 | 127 | With that, you're good to go. We are working on examples, but for now you can look at the `little_raft/tests` directory and at the documentation at [https://docs.rs/little_raft/0.1.3/little_raft/](https://docs.rs/little_raft/0.1.3/little_raft/). We're working on adding more tests. 128 | 129 | 130 | ## Testing 131 | Run `cargo test`. 132 | 133 | ## Contributing 134 | Contributions are very welcome! Do remember that one of the goals of this library is to be as small and simple as possible. Let's keep the code in `little_raft/src` **under 1,000 lines**. PRs breaking this rule will be declined. 135 | ```bash 136 | > cloc little_raft/src 137 | 6 text files. 138 | 6 unique files. 139 | 0 files ignored. 140 | 141 | github.com/AlDanial/cloc v 1.90 T=0.02 s (369.2 files/s, 56185.0 lines/s) 142 | ------------------------------------------------------------------------------- 143 | Language files blank comment code 144 | ------------------------------------------------------------------------------- 145 | Rust 6 82 199 632 146 | ------------------------------------------------------------------------------- 147 | SUM: 6 82 199 632 148 | ------------------------------------------------------------------------------- 149 | ``` 150 | 151 | You are welcome to pick up and work on any of the issues open for this project. Or you can submit new issues if anything comes up from your experience using this library. 
-------------------------------------------------------------------------------- /Cargo.lock: -------------------------------------------------------------------------------- 1 | # This file is automatically @generated by Cargo. 2 | # It is not intended for manual editing. 3 | version = 3 4 | 5 | [[package]] 6 | name = "autocfg" 7 | version = "1.0.1" 8 | source = "registry+https://github.com/rust-lang/crates.io-index" 9 | checksum = "cdb031dd78e28731d87d56cc8ffef4a8f36ca26c38fe2de700543e627f8a464a" 10 | 11 | [[package]] 12 | name = "byteorder" 13 | version = "1.4.3" 14 | source = "registry+https://github.com/rust-lang/crates.io-index" 15 | checksum = "14c189c53d098945499cdfa7ecc63567cf3886b3332b312a5b4585d8d3a6a610" 16 | 17 | [[package]] 18 | name = "bytes" 19 | version = "0.4.12" 20 | source = "registry+https://github.com/rust-lang/crates.io-index" 21 | checksum = "206fdffcfa2df7cbe15601ef46c813fce0965eb3286db6b56c583b814b51c81c" 22 | dependencies = [ 23 | "byteorder", 24 | "iovec", 25 | ] 26 | 27 | [[package]] 28 | name = "cfg-if" 29 | version = "1.0.0" 30 | source = "registry+https://github.com/rust-lang/crates.io-index" 31 | checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" 32 | 33 | [[package]] 34 | name = "chrono" 35 | version = "0.2.25" 36 | source = "registry+https://github.com/rust-lang/crates.io-index" 37 | checksum = "9213f7cd7c27e95c2b57c49f0e69b1ea65b27138da84a170133fd21b07659c00" 38 | dependencies = [ 39 | "num", 40 | "time", 41 | ] 42 | 43 | [[package]] 44 | name = "crossbeam" 45 | version = "0.8.0" 46 | source = "registry+https://github.com/rust-lang/crates.io-index" 47 | checksum = "fd01a6eb3daaafa260f6fc94c3a6c36390abc2080e38e3e34ced87393fb77d80" 48 | dependencies = [ 49 | "cfg-if", 50 | "crossbeam-channel", 51 | "crossbeam-deque", 52 | "crossbeam-epoch", 53 | "crossbeam-queue", 54 | "crossbeam-utils", 55 | ] 56 | 57 | [[package]] 58 | name = "crossbeam-channel" 59 | version = "0.5.1" 60 | source = 
"registry+https://github.com/rust-lang/crates.io-index" 61 | checksum = "06ed27e177f16d65f0f0c22a213e17c696ace5dd64b14258b52f9417ccb52db4" 62 | dependencies = [ 63 | "cfg-if", 64 | "crossbeam-utils", 65 | ] 66 | 67 | [[package]] 68 | name = "crossbeam-deque" 69 | version = "0.8.0" 70 | source = "registry+https://github.com/rust-lang/crates.io-index" 71 | checksum = "94af6efb46fef72616855b036a624cf27ba656ffc9be1b9a3c931cfc7749a9a9" 72 | dependencies = [ 73 | "cfg-if", 74 | "crossbeam-epoch", 75 | "crossbeam-utils", 76 | ] 77 | 78 | [[package]] 79 | name = "crossbeam-epoch" 80 | version = "0.9.3" 81 | source = "registry+https://github.com/rust-lang/crates.io-index" 82 | checksum = "2584f639eb95fea8c798496315b297cf81b9b58b6d30ab066a75455333cf4b12" 83 | dependencies = [ 84 | "cfg-if", 85 | "crossbeam-utils", 86 | "lazy_static", 87 | "memoffset", 88 | "scopeguard", 89 | ] 90 | 91 | [[package]] 92 | name = "crossbeam-queue" 93 | version = "0.3.1" 94 | source = "registry+https://github.com/rust-lang/crates.io-index" 95 | checksum = "0f6cb3c7f5b8e51bc3ebb73a2327ad4abdbd119dc13223f14f961d2f38486756" 96 | dependencies = [ 97 | "cfg-if", 98 | "crossbeam-utils", 99 | ] 100 | 101 | [[package]] 102 | name = "crossbeam-utils" 103 | version = "0.8.3" 104 | source = "registry+https://github.com/rust-lang/crates.io-index" 105 | checksum = "e7e9d99fa91428effe99c5c6d4634cdeba32b8cf784fc428a2a687f61a952c49" 106 | dependencies = [ 107 | "autocfg", 108 | "cfg-if", 109 | "lazy_static", 110 | ] 111 | 112 | [[package]] 113 | name = "getrandom" 114 | version = "0.2.2" 115 | source = "registry+https://github.com/rust-lang/crates.io-index" 116 | checksum = "c9495705279e7140bf035dde1f6e750c162df8b625267cd52cc44e0b156732c8" 117 | dependencies = [ 118 | "cfg-if", 119 | "libc", 120 | "wasi", 121 | ] 122 | 123 | [[package]] 124 | name = "iovec" 125 | version = "0.1.4" 126 | source = "registry+https://github.com/rust-lang/crates.io-index" 127 | checksum = 
"b2b3ea6ff95e175473f8ffe6a7eb7c00d054240321b84c57051175fe3c1e075e" 128 | dependencies = [ 129 | "libc", 130 | ] 131 | 132 | [[package]] 133 | name = "lazy_static" 134 | version = "1.4.0" 135 | source = "registry+https://github.com/rust-lang/crates.io-index" 136 | checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646" 137 | 138 | [[package]] 139 | name = "libc" 140 | version = "0.2.93" 141 | source = "registry+https://github.com/rust-lang/crates.io-index" 142 | checksum = "9385f66bf6105b241aa65a61cb923ef20efc665cb9f9bb50ac2f0c4b7f378d41" 143 | 144 | [[package]] 145 | name = "little_raft" 146 | version = "0.2.0" 147 | dependencies = [ 148 | "bytes", 149 | "crossbeam", 150 | "crossbeam-channel", 151 | "rand", 152 | "time", 153 | "timer", 154 | ] 155 | 156 | [[package]] 157 | name = "memoffset" 158 | version = "0.6.3" 159 | source = "registry+https://github.com/rust-lang/crates.io-index" 160 | checksum = "f83fb6581e8ed1f85fd45c116db8405483899489e38406156c25eb743554361d" 161 | dependencies = [ 162 | "autocfg", 163 | ] 164 | 165 | [[package]] 166 | name = "num" 167 | version = "0.1.42" 168 | source = "registry+https://github.com/rust-lang/crates.io-index" 169 | checksum = "4703ad64153382334aa8db57c637364c322d3372e097840c72000dabdcf6156e" 170 | dependencies = [ 171 | "num-integer", 172 | "num-iter", 173 | "num-traits", 174 | ] 175 | 176 | [[package]] 177 | name = "num-integer" 178 | version = "0.1.44" 179 | source = "registry+https://github.com/rust-lang/crates.io-index" 180 | checksum = "d2cc698a63b549a70bc047073d2949cce27cd1c7b0a4a862d08a8031bc2801db" 181 | dependencies = [ 182 | "autocfg", 183 | "num-traits", 184 | ] 185 | 186 | [[package]] 187 | name = "num-iter" 188 | version = "0.1.42" 189 | source = "registry+https://github.com/rust-lang/crates.io-index" 190 | checksum = "b2021c8337a54d21aca0d59a92577a029af9431cb59b909b03252b9c164fad59" 191 | dependencies = [ 192 | "autocfg", 193 | "num-integer", 194 | "num-traits", 195 | ] 196 | 197 | 
[[package]] 198 | name = "num-traits" 199 | version = "0.2.14" 200 | source = "registry+https://github.com/rust-lang/crates.io-index" 201 | checksum = "9a64b1ec5cda2586e284722486d802acf1f7dbdc623e2bfc57e65ca1cd099290" 202 | dependencies = [ 203 | "autocfg", 204 | ] 205 | 206 | [[package]] 207 | name = "ppv-lite86" 208 | version = "0.2.10" 209 | source = "registry+https://github.com/rust-lang/crates.io-index" 210 | checksum = "ac74c624d6b2d21f425f752262f42188365d7b8ff1aff74c82e45136510a4857" 211 | 212 | [[package]] 213 | name = "rand" 214 | version = "0.8.3" 215 | source = "registry+https://github.com/rust-lang/crates.io-index" 216 | checksum = "0ef9e7e66b4468674bfcb0c81af8b7fa0bb154fa9f28eb840da5c447baeb8d7e" 217 | dependencies = [ 218 | "libc", 219 | "rand_chacha", 220 | "rand_core", 221 | "rand_hc", 222 | ] 223 | 224 | [[package]] 225 | name = "rand_chacha" 226 | version = "0.3.0" 227 | source = "registry+https://github.com/rust-lang/crates.io-index" 228 | checksum = "e12735cf05c9e10bf21534da50a147b924d555dc7a547c42e6bb2d5b6017ae0d" 229 | dependencies = [ 230 | "ppv-lite86", 231 | "rand_core", 232 | ] 233 | 234 | [[package]] 235 | name = "rand_core" 236 | version = "0.6.2" 237 | source = "registry+https://github.com/rust-lang/crates.io-index" 238 | checksum = "34cf66eb183df1c5876e2dcf6b13d57340741e8dc255b48e40a26de954d06ae7" 239 | dependencies = [ 240 | "getrandom", 241 | ] 242 | 243 | [[package]] 244 | name = "rand_hc" 245 | version = "0.3.0" 246 | source = "registry+https://github.com/rust-lang/crates.io-index" 247 | checksum = "3190ef7066a446f2e7f42e239d161e905420ccab01eb967c9eb27d21b2322a73" 248 | dependencies = [ 249 | "rand_core", 250 | ] 251 | 252 | [[package]] 253 | name = "scopeguard" 254 | version = "1.1.0" 255 | source = "registry+https://github.com/rust-lang/crates.io-index" 256 | checksum = "d29ab0c6d3fc0ee92fe66e2d99f700eab17a8d57d1c1d3b748380fb20baa78cd" 257 | 258 | [[package]] 259 | name = "time" 260 | version = "0.1.43" 261 | source = 
"registry+https://github.com/rust-lang/crates.io-index" 262 | checksum = "ca8a50ef2360fbd1eeb0ecd46795a87a19024eb4b53c5dc916ca1fd95fe62438" 263 | dependencies = [ 264 | "libc", 265 | "winapi", 266 | ] 267 | 268 | [[package]] 269 | name = "timer" 270 | version = "0.1.6" 271 | source = "registry+https://github.com/rust-lang/crates.io-index" 272 | checksum = "a9522a9ec40055e2f9e514e38d2415a496e81dbfc1ece15d98d2fe55c44946b3" 273 | dependencies = [ 274 | "chrono", 275 | ] 276 | 277 | [[package]] 278 | name = "wasi" 279 | version = "0.10.2+wasi-snapshot-preview1" 280 | source = "registry+https://github.com/rust-lang/crates.io-index" 281 | checksum = "fd6fbd9a79829dd1ad0cc20627bf1ed606756a7f77edff7b66b7064f9cb327c6" 282 | 283 | [[package]] 284 | name = "winapi" 285 | version = "0.3.9" 286 | source = "registry+https://github.com/rust-lang/crates.io-index" 287 | checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419" 288 | dependencies = [ 289 | "winapi-i686-pc-windows-gnu", 290 | "winapi-x86_64-pc-windows-gnu", 291 | ] 292 | 293 | [[package]] 294 | name = "winapi-i686-pc-windows-gnu" 295 | version = "0.4.0" 296 | source = "registry+https://github.com/rust-lang/crates.io-index" 297 | checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" 298 | 299 | [[package]] 300 | name = "winapi-x86_64-pc-windows-gnu" 301 | version = "0.4.0" 302 | source = "registry+https://github.com/rust-lang/crates.io-index" 303 | checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" 304 | -------------------------------------------------------------------------------- /little_raft/tests/raft_stable.rs: -------------------------------------------------------------------------------- 1 | use bytes::Bytes; 2 | use crossbeam_channel as channel; 3 | use crossbeam_channel::{unbounded, Receiver, Sender}; 4 | use little_raft::{ 5 | cluster::Cluster, 6 | message::Message, 7 | replica::Replica, 8 | state_machine::{Snapshot, StateMachine, 
StateMachineTransition, TransitionState}, 9 | }; 10 | use std::convert::TryInto; 11 | use std::sync::{Arc, Mutex}; 12 | 13 | use std::{collections::BTreeMap, thread, time::Duration}; 14 | 15 | const HEARTBEAT_TIMEOUT: Duration = Duration::from_millis(500); 16 | const MIN_ELECTION_TIMEOUT: Duration = Duration::from_millis(750); 17 | const MAX_ELECTION_TIMEOUT: Duration = Duration::from_millis(950); 18 | 19 | // Our state machine will carry out simple plus and minus operations on a 20 | // number, starting from zero. 21 | #[derive(Clone, Debug)] 22 | struct ArithmeticOperation { 23 | id: usize, 24 | delta: i32, 25 | } 26 | 27 | impl StateMachineTransition for ArithmeticOperation { 28 | type TransitionID = usize; 29 | fn get_id(&self) -> Self::TransitionID { 30 | self.id 31 | } 32 | } 33 | 34 | // The Calculator is the state machine that maintains a number that we can add 35 | // to or subtract from. ID is simply for convenience. 36 | struct Calculator { 37 | id: usize, 38 | value: i32, 39 | applied_ids_tx: Sender<(usize, usize)>, 40 | pending_transitions: Vec, 41 | } 42 | 43 | impl StateMachine for Calculator { 44 | fn apply_transition(&mut self, transition: ArithmeticOperation) { 45 | self.value += transition.delta; 46 | println!("id {} my value is now {} after applying delta {}", self.id, self.value, transition.delta); 47 | } 48 | 49 | fn register_transition_state( 50 | &mut self, 51 | transition_id: ::TransitionID, 52 | state: TransitionState, 53 | ) { 54 | // Send IDs of applied transitions down the channel so we can confirm 55 | // they were applied in the right order. 
56 | if state == TransitionState::Applied { 57 | self.applied_ids_tx 58 | .send((self.id, transition_id)) 59 | .expect("could not send applied transition id"); 60 | } 61 | } 62 | 63 | fn get_pending_transitions(&mut self) -> Vec { 64 | let cur = self.pending_transitions.clone(); 65 | self.pending_transitions = Vec::new(); 66 | cur 67 | } 68 | 69 | fn get_snapshot(&mut self) -> Option> { 70 | println!("checked for snapshot"); 71 | None 72 | } 73 | 74 | fn create_snapshot(&mut self, index: usize, term: usize) -> Snapshot { 75 | println!("created snapshot"); 76 | Snapshot { 77 | last_included_index: index, 78 | last_included_term: term, 79 | data: Bytes::from(self.value.to_be_bytes().to_vec()), 80 | } 81 | } 82 | 83 | fn set_snapshot(&mut self, snapshot: Snapshot) { 84 | let v: Vec = snapshot.data.into_iter().collect(); 85 | self.value = i32::from_be_bytes(v[..].try_into().expect("incorrect length")); 86 | println!("my value is now {} after loading", self.value); 87 | } 88 | } 89 | 90 | // Our test replicas will be running each in its own thread. 
91 | struct ThreadCluster { 92 | id: usize, 93 | is_leader: bool, 94 | transmitters: BTreeMap>>, 95 | pending_messages: Vec>, 96 | halt: bool, 97 | } 98 | 99 | impl Cluster for ThreadCluster { 100 | fn register_leader(&mut self, leader_id: Option) { 101 | if let Some(id) = leader_id { 102 | if id == self.id { 103 | self.is_leader = true; 104 | } else { 105 | self.is_leader = false; 106 | } 107 | } else { 108 | self.is_leader = false; 109 | } 110 | } 111 | 112 | fn send_message(&mut self, to_id: usize, message: Message) { 113 | if let Some(transmitter) = self.transmitters.get(&to_id) { 114 | transmitter.send(message).expect("could not send message"); 115 | } 116 | } 117 | 118 | fn halt(&self) -> bool { 119 | self.halt 120 | } 121 | 122 | fn receive_messages(&mut self) -> Vec> { 123 | let cur = self.pending_messages.clone(); 124 | self.pending_messages = Vec::new(); 125 | cur 126 | } 127 | } 128 | 129 | // Create n clusters, each with their own copy of trasmitters used for 130 | // communication between replicas (threads). 131 | fn create_clusters( 132 | n: usize, 133 | transmitters: BTreeMap>>, 134 | ) -> Vec>> { 135 | let mut clusters = Vec::new(); 136 | for i in 0..n { 137 | let cluster = Arc::new(Mutex::new(ThreadCluster { 138 | id: i, 139 | is_leader: false, 140 | transmitters: transmitters.clone(), 141 | pending_messages: Vec::new(), 142 | halt: false, 143 | })); 144 | 145 | clusters.push(cluster); 146 | } 147 | 148 | clusters 149 | } 150 | 151 | // Create channels for the threads to communicate with. 
152 | fn create_communication_between_clusters( 153 | n: usize, 154 | ) -> ( 155 | BTreeMap>>, 156 | Vec>>, 157 | ) { 158 | let (mut transmitters, mut receivers) = (BTreeMap::new(), Vec::new()); 159 | for i in 0..n { 160 | let (tx, rx) = unbounded::>(); 161 | transmitters.insert(i, tx); 162 | receivers.push(rx); 163 | } 164 | 165 | (transmitters, receivers) 166 | } 167 | 168 | fn create_peer_ids(n: usize) -> Vec> { 169 | let mut all_peer_ids = Vec::new(); 170 | for i in 0..n { 171 | let mut peer_ids = Vec::new(); 172 | for n in 0..n { 173 | if n != i { 174 | peer_ids.push(n); 175 | } 176 | } 177 | all_peer_ids.push(peer_ids); 178 | } 179 | 180 | all_peer_ids 181 | } 182 | 183 | // Create state machines, each with its own copy on which to send 184 | // (state_machine_id, transition_id) for transitions that have been applied. 185 | fn create_state_machines( 186 | n: usize, 187 | applied_transitions_tx: Sender<(usize, usize)>, 188 | ) -> Vec>> { 189 | let mut state_machines = Vec::new(); 190 | for i in 0..n { 191 | let state_machine = Arc::new(Mutex::new(Calculator { 192 | id: i, 193 | value: 0, 194 | pending_transitions: Vec::new(), 195 | applied_ids_tx: applied_transitions_tx.clone(), 196 | })); 197 | state_machines.push(state_machine); 198 | } 199 | state_machines 200 | } 201 | 202 | // Create sending ends of message notifiers, sending ends of transition 203 | // notifiers, receiving ends of message notifiers, receiving neds of transition 204 | // notifiers. 
205 | fn create_notifiers( 206 | n: usize, 207 | ) -> ( 208 | Vec>, 209 | Vec>, 210 | Vec>, 211 | Vec>, 212 | ) { 213 | let mut message_tx = Vec::new(); 214 | let mut message_rx = Vec::new(); 215 | let mut transition_tx = Vec::new(); 216 | let mut transition_rx = Vec::new(); 217 | for _ in 0..n { 218 | let (message_notifier_tx, message_notifier_rx) = channel::unbounded(); 219 | let (transition_notifier_tx, transition_notifier_rx) = channel::unbounded(); 220 | message_tx.push(message_notifier_tx); 221 | message_rx.push(message_notifier_rx); 222 | transition_tx.push(transition_notifier_tx); 223 | transition_rx.push(transition_notifier_rx); 224 | } 225 | 226 | (message_tx, transition_tx, message_rx, transition_rx) 227 | } 228 | 229 | fn run_clusters_communication( 230 | mut clusters: Vec>>, 231 | mut cluster_message_receivers: Vec>>, 232 | mut message_notifiers_tx: Vec>, 233 | ) { 234 | for _ in (0..clusters.len()).rev() { 235 | let cluster = clusters.pop().unwrap(); 236 | let cluster_message_rx = cluster_message_receivers.pop().unwrap(); 237 | let message_notifier = message_notifiers_tx.pop().unwrap(); 238 | 239 | // For each cluster, start a thread where we notify the cluster replica 240 | // of a new message as soon as we receive one for it. 241 | thread::spawn(move || loop { 242 | let msg = cluster_message_rx.recv().unwrap(); 243 | match cluster.lock() { 244 | Ok(mut unlocked_cluster) => { 245 | unlocked_cluster.pending_messages.push(msg); 246 | message_notifier 247 | .send(()) 248 | .expect("could not notify of message"); 249 | } 250 | _ => return, 251 | } 252 | }); 253 | } 254 | } 255 | 256 | fn run_arithmetic_operation_on_cluster( 257 | clusters: Vec>>, 258 | state_machines: Vec>>, 259 | transition_notifiers: Vec>, 260 | delta: i32, 261 | id: usize, 262 | ) { 263 | thread::sleep(Duration::from_secs(1)); 264 | // Find the leader and send the transition request to it. 
265 | for cluster in clusters.iter() { 266 | let cluster = cluster.lock().unwrap(); 267 | if cluster.is_leader { 268 | state_machines[cluster.id] 269 | .lock() 270 | .unwrap() 271 | .pending_transitions 272 | .push(ArithmeticOperation { delta, id }); 273 | transition_notifiers[cluster.id] 274 | .send(()) 275 | .expect("could not send transition notification"); 276 | break; 277 | } 278 | } 279 | 280 | thread::sleep(Duration::from_secs(2)); 281 | } 282 | 283 | fn halt_clusters(clusters: Vec>>) { 284 | thread::sleep(Duration::from_secs(1)); 285 | for cluster in clusters.iter() { 286 | let mut c = cluster.lock().unwrap(); 287 | c.halt = true; 288 | } 289 | thread::sleep(Duration::from_secs(3)); 290 | } 291 | 292 | #[test] 293 | fn run_replicas() { 294 | let n = 3; 295 | // We are going to test that three replicas can elect a leader and process a 296 | // few simple operations. 297 | // 298 | // Main complexity of this test set up comes from the fact that everything 299 | // is running on a single machine, so we have to keep track of every 300 | // cluster, replica, and state machine object. In the real world usage of 301 | // the library it's unlikely there will ever be more than a single instance 302 | // of each object per process or even a physical machine. 
303 | let (transmitters, receivers) = create_communication_between_clusters(3); 304 | let clusters = create_clusters(n, transmitters); 305 | let peer_ids = create_peer_ids(n); 306 | let noop = ArithmeticOperation { delta: 0, id: 0 }; 307 | let (applied_transitions_tx, applied_transitions_rx) = unbounded(); 308 | let state_machines = create_state_machines(n, applied_transitions_tx); 309 | let (message_tx, transition_tx, message_rx, transition_rx) = create_notifiers(n); 310 | 311 | for i in 0..n { 312 | let noop = noop.clone(); 313 | let local_peer_ids = peer_ids[i].clone(); 314 | let cluster = clusters[i].clone(); 315 | let state_machine = state_machines[i].clone(); 316 | let m_rx = message_rx[i].clone(); 317 | let t_rx = transition_rx[i].clone(); 318 | 319 | thread::spawn(move || { 320 | let mut replica = Replica::new( 321 | i, 322 | local_peer_ids, 323 | cluster, 324 | state_machine, 325 | 1, 326 | noop.clone(), 327 | HEARTBEAT_TIMEOUT, 328 | (MIN_ELECTION_TIMEOUT, MAX_ELECTION_TIMEOUT), 329 | ); 330 | 331 | replica.start(m_rx, t_rx); 332 | }); 333 | } 334 | 335 | run_clusters_communication(clusters.clone(), receivers, message_tx); 336 | 337 | run_arithmetic_operation_on_cluster( 338 | clusters.clone(), 339 | state_machines.clone(), 340 | transition_tx.clone(), 341 | 5, 342 | 1, 343 | ); 344 | 345 | run_arithmetic_operation_on_cluster( 346 | clusters.clone(), 347 | state_machines.clone(), 348 | transition_tx.clone(), 349 | -51, 350 | 2, 351 | ); 352 | 353 | run_arithmetic_operation_on_cluster( 354 | clusters.clone(), 355 | state_machines.clone(), 356 | transition_tx.clone(), 357 | -511, 358 | 3, 359 | ); 360 | 361 | run_arithmetic_operation_on_cluster(clusters.clone(), state_machines.clone(), transition_tx.clone(), 3, 4); 362 | 363 | halt_clusters(clusters); 364 | 365 | // Below we confirm that every replica applied the same transitions in the 366 | // same order. 
367 | let applied_transactions: Vec<(usize, usize)> = applied_transitions_rx.try_iter().collect(); 368 | let expected_vec: Vec = vec![0, 1, 2, 3, 4]; 369 | assert_eq!( 370 | expected_vec, 371 | applied_transactions.iter().fold(Vec::new(), |mut acc, x| { 372 | if x.0 == 0 { 373 | acc.push(x.1); 374 | }; 375 | acc 376 | }) 377 | ); 378 | 379 | assert_eq!( 380 | expected_vec, 381 | applied_transactions.iter().fold(Vec::new(), |mut acc, x| { 382 | if x.0 == 1 { 383 | acc.push(x.1); 384 | }; 385 | acc 386 | }) 387 | ); 388 | 389 | assert_eq!( 390 | expected_vec, 391 | applied_transactions.iter().fold(Vec::new(), |mut acc, x| { 392 | if x.0 == 2 { 393 | acc.push(x.1); 394 | }; 395 | acc 396 | }) 397 | ); 398 | } 399 | -------------------------------------------------------------------------------- /little_raft/tests/raft_unstable.rs: -------------------------------------------------------------------------------- 1 | use bytes::Bytes; 2 | use crossbeam_channel as channel; 3 | use crossbeam_channel::{unbounded, Receiver, Sender}; 4 | use rand::{thread_rng, Rng}; 5 | use rand::seq::SliceRandom; 6 | use little_raft::{ 7 | cluster::Cluster, 8 | message::Message, 9 | replica::Replica, 10 | state_machine::{Snapshot, StateMachine, StateMachineTransition, TransitionState}, 11 | }; 12 | use std::convert::TryInto; 13 | use std::sync::{Arc, Mutex}; 14 | 15 | use std::{collections::BTreeMap, thread, time::Duration}; 16 | 17 | const HEARTBEAT_TIMEOUT: Duration = Duration::from_millis(50); 18 | const MIN_ELECTION_TIMEOUT: Duration = Duration::from_millis(750); 19 | const MAX_ELECTION_TIMEOUT: Duration = Duration::from_millis(950); 20 | 21 | // Our state machine will carry out simple plus and minus operations on a 22 | // number, starting from zero. 
23 | #[derive(Clone, Debug)] 24 | struct ArithmeticOperation { 25 | id: usize, 26 | delta: i32, 27 | } 28 | 29 | impl StateMachineTransition for ArithmeticOperation { 30 | type TransitionID = usize; 31 | fn get_id(&self) -> Self::TransitionID { 32 | self.id 33 | } 34 | } 35 | 36 | // The Calculator is the state machine that maintains a number that we can add 37 | // to or subtract from. ID is simply for convenience. 38 | struct Calculator { 39 | id: usize, 40 | value: i32, 41 | applied_ids_tx: Sender<(usize, usize)>, 42 | pending_transitions: Vec, 43 | } 44 | 45 | impl StateMachine for Calculator { 46 | fn apply_transition(&mut self, transition: ArithmeticOperation) { 47 | self.value += transition.delta; 48 | println!("id {} my value is now {} after applying delta {}", self.id, self.value, transition.delta); 49 | } 50 | 51 | fn register_transition_state( 52 | &mut self, 53 | transition_id: ::TransitionID, 54 | state: TransitionState, 55 | ) { 56 | // Send IDs of applied transitions down the channel so we can confirm 57 | // they were applied in the right order. 
58 | if state == TransitionState::Applied { 59 | self.applied_ids_tx 60 | .send((self.id, transition_id)) 61 | .expect("could not send applied transition id"); 62 | } 63 | } 64 | 65 | fn get_pending_transitions(&mut self) -> Vec { 66 | let cur = self.pending_transitions.clone(); 67 | self.pending_transitions = Vec::new(); 68 | cur 69 | } 70 | 71 | fn get_snapshot(&mut self) -> Option> { 72 | println!("id {} checked for snapshot", self.id); 73 | None 74 | } 75 | 76 | fn create_snapshot(&mut self, index: usize, term: usize) -> Snapshot { 77 | println!("id {} created snapshot", self.id); 78 | Snapshot { 79 | last_included_index: index, 80 | last_included_term: term, 81 | data: Bytes::from(self.value.to_be_bytes().to_vec()), 82 | } 83 | } 84 | 85 | fn set_snapshot(&mut self, snapshot: Snapshot) { 86 | let v: Vec = snapshot.data.into_iter().collect(); 87 | self.value = i32::from_be_bytes(v[..].try_into().expect("incorrect length")); 88 | println!("id {} my value is now {} after loading", self.id, self.value); 89 | } 90 | } 91 | 92 | // Our test replicas will be running each in its own thread. 93 | struct ThreadCluster { 94 | id: usize, 95 | is_leader: bool, 96 | transmitters: BTreeMap>>, 97 | pending_messages: Vec>, 98 | halt: bool, 99 | } 100 | 101 | impl Cluster for ThreadCluster { 102 | fn register_leader(&mut self, leader_id: Option) { 103 | if let Some(id) = leader_id { 104 | if id == self.id { 105 | self.is_leader = true; 106 | } else { 107 | self.is_leader = false; 108 | } 109 | } else { 110 | self.is_leader = false; 111 | } 112 | } 113 | 114 | fn send_message(&mut self, to_id: usize, message: Message) { 115 | // Drop messages with probability 0.25. 
116 | let n: u8 = rand::thread_rng().gen(); 117 | if n % 4 == 0 { 118 | return 119 | } 120 | 121 | if let Some(transmitter) = self.transmitters.get(&to_id) { 122 | transmitter.send(message).expect("could not send message"); 123 | } 124 | } 125 | 126 | fn halt(&self) -> bool { 127 | self.halt 128 | } 129 | 130 | fn receive_messages(&mut self) -> Vec> { 131 | let mut cur = self.pending_messages.clone(); 132 | // Shuffle messages. 133 | cur.shuffle(&mut thread_rng()); 134 | self.pending_messages = Vec::new(); 135 | cur 136 | } 137 | } 138 | 139 | // Create n clusters, each with their own copy of trasmitters used for 140 | // communication between replicas (threads). 141 | fn create_clusters( 142 | n: usize, 143 | transmitters: BTreeMap>>, 144 | ) -> Vec>> { 145 | let mut clusters = Vec::new(); 146 | for i in 0..n { 147 | let cluster = Arc::new(Mutex::new(ThreadCluster { 148 | id: i, 149 | is_leader: false, 150 | transmitters: transmitters.clone(), 151 | pending_messages: Vec::new(), 152 | halt: false, 153 | })); 154 | 155 | clusters.push(cluster); 156 | } 157 | 158 | clusters 159 | } 160 | 161 | // Create channels for the threads to communicate with. 
162 | fn create_communication_between_clusters( 163 | n: usize, 164 | ) -> ( 165 | BTreeMap>>, 166 | Vec>>, 167 | ) { 168 | let (mut transmitters, mut receivers) = (BTreeMap::new(), Vec::new()); 169 | for i in 0..n { 170 | let (tx, rx) = unbounded::>(); 171 | transmitters.insert(i, tx); 172 | receivers.push(rx); 173 | } 174 | 175 | (transmitters, receivers) 176 | } 177 | 178 | fn create_peer_ids(n: usize) -> Vec> { 179 | let mut all_peer_ids = Vec::new(); 180 | for i in 0..n { 181 | let mut peer_ids = Vec::new(); 182 | for n in 0..n { 183 | if n != i { 184 | peer_ids.push(n); 185 | } 186 | } 187 | all_peer_ids.push(peer_ids); 188 | } 189 | 190 | all_peer_ids 191 | } 192 | 193 | // Create state machines, each with its own copy on which to send 194 | // (state_machine_id, transition_id) for transitions that have been applied. 195 | fn create_state_machines( 196 | n: usize, 197 | applied_transitions_tx: Sender<(usize, usize)>, 198 | ) -> Vec>> { 199 | let mut state_machines = Vec::new(); 200 | for i in 0..n { 201 | let state_machine = Arc::new(Mutex::new(Calculator { 202 | id: i, 203 | value: 0, 204 | pending_transitions: Vec::new(), 205 | applied_ids_tx: applied_transitions_tx.clone(), 206 | })); 207 | state_machines.push(state_machine); 208 | } 209 | state_machines 210 | } 211 | 212 | // Create sending ends of message notifiers, sending ends of transition 213 | // notifiers, receiving ends of message notifiers, receiving neds of transition 214 | // notifiers. 
215 | fn create_notifiers( 216 | n: usize, 217 | ) -> ( 218 | Vec>, 219 | Vec>, 220 | Vec>, 221 | Vec>, 222 | ) { 223 | let mut message_tx = Vec::new(); 224 | let mut message_rx = Vec::new(); 225 | let mut transition_tx = Vec::new(); 226 | let mut transition_rx = Vec::new(); 227 | for _ in 0..n { 228 | let (message_notifier_tx, message_notifier_rx) = channel::unbounded(); 229 | let (transition_notifier_tx, transition_notifier_rx) = channel::unbounded(); 230 | message_tx.push(message_notifier_tx); 231 | message_rx.push(message_notifier_rx); 232 | transition_tx.push(transition_notifier_tx); 233 | transition_rx.push(transition_notifier_rx); 234 | } 235 | 236 | (message_tx, transition_tx, message_rx, transition_rx) 237 | } 238 | 239 | fn run_clusters_communication( 240 | mut clusters: Vec>>, 241 | mut cluster_message_receivers: Vec>>, 242 | mut message_notifiers_tx: Vec>, 243 | ) { 244 | for _ in (0..clusters.len()).rev() { 245 | let cluster = clusters.pop().unwrap(); 246 | let cluster_message_rx = cluster_message_receivers.pop().unwrap(); 247 | let message_notifier = message_notifiers_tx.pop().unwrap(); 248 | 249 | // For each cluster, start a thread where we notify the cluster replica 250 | // of a new message as soon as we receive one for it. 251 | thread::spawn(move || loop { 252 | let msg = cluster_message_rx.recv().unwrap(); 253 | match cluster.lock() { 254 | Ok(mut unlocked_cluster) => { 255 | unlocked_cluster.pending_messages.push(msg); 256 | message_notifier 257 | .send(()) 258 | .expect("could not notify of message"); 259 | } 260 | _ => return, 261 | } 262 | }); 263 | } 264 | } 265 | 266 | fn run_arithmetic_operation_on_cluster( 267 | clusters: Vec>>, 268 | state_machines: Vec>>, 269 | transition_notifiers: Vec>, 270 | delta: i32, 271 | id: usize, 272 | ) { 273 | // Sleep longer because in this test we're dropping 25% of all messages. 274 | thread::sleep(Duration::from_secs(2)); 275 | // Find the leader and send the transition request to it. 
276 | for cluster in clusters.iter() { 277 | let cluster = cluster.lock().unwrap(); 278 | if cluster.is_leader { 279 | state_machines[cluster.id] 280 | .lock() 281 | .unwrap() 282 | .pending_transitions 283 | .push(ArithmeticOperation { delta, id }); 284 | transition_notifiers[cluster.id] 285 | .send(()) 286 | .expect("could not send transition notification"); 287 | break; 288 | } 289 | } 290 | 291 | // Sleep long. 292 | thread::sleep(Duration::from_secs(3)); 293 | } 294 | 295 | fn halt_clusters(clusters: Vec>>) { 296 | thread::sleep(Duration::from_secs(1)); 297 | for cluster in clusters.iter() { 298 | let mut c = cluster.lock().unwrap(); 299 | c.halt = true; 300 | } 301 | thread::sleep(Duration::from_secs(2)); 302 | } 303 | 304 | #[test] 305 | fn run_replicas() { 306 | let n = 3; 307 | // We are going to test that three replicas can elect a leader and process a 308 | // few simple operations. 309 | // 310 | // Main complexity of this test set up comes from the fact that everything 311 | // is running on a single machine, so we have to keep track of every 312 | // cluster, replica, and state machine object. In the real world usage of 313 | // the library it's unlikely there will ever be more than a single instance 314 | // of each object per process or even a physical machine. 
315 | let (transmitters, receivers) = create_communication_between_clusters(3); 316 | let clusters = create_clusters(n, transmitters); 317 | let peer_ids = create_peer_ids(n); 318 | let noop = ArithmeticOperation { delta: 0, id: 0 }; 319 | let (applied_transitions_tx, _applied_transitions_rx) = unbounded(); 320 | let state_machines = create_state_machines(n, applied_transitions_tx); 321 | let (message_tx, transition_tx, message_rx, transition_rx) = create_notifiers(n); 322 | for i in 0..n { 323 | let noop = noop.clone(); 324 | let local_peer_ids = peer_ids[i].clone(); 325 | let cluster = clusters[i].clone(); 326 | let state_machine = state_machines[i].clone(); 327 | let m_rx = message_rx[i].clone(); 328 | let t_rx = transition_rx[i].clone(); 329 | thread::spawn(move || { 330 | let mut replica = Replica::new( 331 | i, 332 | local_peer_ids, 333 | cluster, 334 | state_machine, 335 | 1, 336 | noop.clone(), 337 | HEARTBEAT_TIMEOUT, 338 | (MIN_ELECTION_TIMEOUT, MAX_ELECTION_TIMEOUT), 339 | ); 340 | 341 | replica.start(m_rx, t_rx); 342 | }); 343 | } 344 | 345 | run_clusters_communication(clusters.clone(), receivers, message_tx); 346 | run_arithmetic_operation_on_cluster( 347 | clusters.clone(), 348 | state_machines.clone(), 349 | transition_tx.clone(), 350 | 5, 351 | 1, 352 | ); 353 | 354 | // In this test, we confirm that the cluster converged on true value one by 355 | // one after each arithmetic operation. This is different from 356 | // raft_stable.rs, where we check the order in which transition have been 357 | // applied post-factum. We can't do the same in raft_unstable.rs, because 358 | // replicas reload from snapshots in this test, meaning not all replicas go 359 | // over all transitions. Some replicas load directly from their peer's 360 | // snapshots. 
361 | for machine in state_machines.clone() { 362 | assert_eq!(machine.lock().unwrap().value, 5); 363 | } 364 | 365 | run_arithmetic_operation_on_cluster( 366 | clusters.clone(), 367 | state_machines.clone(), 368 | transition_tx.clone(), 369 | -51, 370 | 2, 371 | ); 372 | 373 | for machine in state_machines.clone() { 374 | assert_eq!(machine.lock().unwrap().value, -46); 375 | } 376 | 377 | run_arithmetic_operation_on_cluster( 378 | clusters.clone(), 379 | state_machines.clone(), 380 | transition_tx.clone(), 381 | -511, 382 | 3, 383 | ); 384 | 385 | 386 | for machine in state_machines.clone() { 387 | assert_eq!(machine.lock().unwrap().value, -557); 388 | } 389 | 390 | run_arithmetic_operation_on_cluster(clusters.clone(), state_machines.clone(), transition_tx.clone(), 3, 4); 391 | 392 | for machine in state_machines.clone() { 393 | assert_eq!(machine.lock().unwrap().value, -554); 394 | } 395 | 396 | halt_clusters(clusters); 397 | } 398 | -------------------------------------------------------------------------------- /little_raft/src/replica.rs: -------------------------------------------------------------------------------- 1 | use crate::{ 2 | cluster::Cluster, 3 | message::{LogEntry, Message}, 4 | state_machine::{ 5 | Snapshot, StateMachine, StateMachineTransition, TransitionAbandonedReason, TransitionState, 6 | }, 7 | timer::Timer, 8 | }; 9 | use crossbeam_channel::{Receiver, Select}; 10 | use rand::Rng; 11 | use std::cmp::Ordering; 12 | use std::sync::{Arc, Mutex}; 13 | use std::{ 14 | cmp, 15 | collections::{BTreeMap, BTreeSet}, 16 | time::{Duration, Instant}, 17 | }; 18 | 19 | #[derive(Clone, PartialEq, Debug)] 20 | enum State { 21 | Follower, 22 | Candidate, 23 | Leader, 24 | } 25 | 26 | /// ReplicaID is a type alias used to identify Raft nodes. 
27 | pub type ReplicaID = usize; 28 | 29 | type Result = std::result::Result; 30 | 31 | #[derive(Debug, Clone)] 32 | enum ReplicaError { 33 | LogCompacted, 34 | } 35 | 36 | /// Replica describes the local instance running the Raft algorithm. Its goal is 37 | /// to maintain the consistency of the user-defined StateMachine across the 38 | /// cluster. It uses the user-defined Cluster implementation to talk to other 39 | /// Replicas, be it over the network or pigeon post. 40 | pub struct Replica 41 | where 42 | C: Cluster, 43 | M: StateMachine, 44 | T: StateMachineTransition, 45 | D: Clone, 46 | { 47 | /// ID of this Replica. 48 | id: ReplicaID, 49 | 50 | /// IDs of other Replicas in the cluster. 51 | peer_ids: Vec, 52 | 53 | /// User-defined state machine that the cluster Replicates. 54 | state_machine: Arc>, 55 | 56 | /// Interface a Replica uses to communicate with the rest of the cluster. 57 | cluster: Arc>, 58 | 59 | /// Current term. 60 | current_term: usize, 61 | 62 | /// ID of peers with votes for self. 63 | current_votes: Option>>, 64 | 65 | /// State of this Replica. 66 | state: State, 67 | 68 | /// Who the last vote was cast for. 69 | voted_for: Option, 70 | 71 | /// entries this Replica is aware of. 72 | log: Vec>, 73 | 74 | /// Index of the highest transition known to be committed. 75 | commit_index: usize, 76 | 77 | /// Index of the highest transition applied to the local state machine. 78 | last_applied: usize, 79 | 80 | /// For each server, index of the next log entry to send to that server. 81 | /// Only present on leaders. 82 | next_index: BTreeMap, 83 | 84 | /// For each server, index of highest log entry known to be replicated on 85 | /// that server. Only present on leaders. 86 | match_index: BTreeMap, 87 | 88 | /// No-op transition used to force a faster Replica update when a cluster 89 | /// Leader changes. Applied this transition multiple times must have no 90 | /// affect on the state machine. 
91 | noop_transition: T, 92 | 93 | /// Timer used for heartbeat messages. 94 | heartbeat_timer: Timer, 95 | 96 | /// Timeout range within a randomized timeout is picked for when to start a 97 | /// new Leader election if the current Leader is not sending heartbeats. 98 | election_timeout: (Duration, Duration), 99 | 100 | /// If no heartbeat message is received by the deadline, the Replica will 101 | /// start an election. 102 | next_election_deadline: Instant, 103 | 104 | /// The number of transaction logs that this instance will let accumulate 105 | /// before merging them into a single snapshot. Snapshotting is enabled <=> 106 | /// snapshot_delta > 0. 107 | snapshot_delta: usize, 108 | 109 | /// The log snapshot of this Replica. Even if snapshot_delta is 0, the 110 | /// snapshot field can be Some(_), since the Replica can be started with a 111 | /// seed snapshot. 112 | snapshot: Option>, 113 | 114 | /// The length of the log sequence that is represented by the snapshot. 115 | /// Since compacted entries aren't in the log anymore, access to the log 116 | /// should be done with log[log_index - index_offset]. 117 | /// 118 | /// The following is always true: 119 | /// 120 | /// last_log_index = log.len() - 1 + index_offset. 121 | index_offset: usize, 122 | } 123 | 124 | impl Replica 125 | where 126 | C: Cluster, 127 | M: StateMachine, 128 | T: StateMachineTransition, 129 | D: Clone, 130 | { 131 | /// Create a new Replica. 132 | /// 133 | /// id is the ID of this Replica within the cluster. 134 | /// 135 | /// peer_ids is a vector of IDs of all other Replicas in the cluster. 136 | /// 137 | /// cluster represents the abstraction the Replica uses to talk with other 138 | /// Replicas. 139 | /// 140 | /// state_machine is the state machine that Raft maintains. 141 | /// 142 | /// snapshot_delta tells the Replica how many transaction logs to accumulate 143 | /// before doing compaction and merging them into a snapshot. 
Snapshotting 144 | /// is enabled if and only if snapshot_delta > 0. 145 | /// 146 | /// noop_transition is a transition that can be applied to the state machine 147 | /// multiple times with no effect. 148 | /// 149 | /// heartbeat_timeout defines how often the Leader Replica sends out 150 | /// heartbeat messages. 151 | /// 152 | /// election_timeout_range defines the election timeout interval. If the 153 | /// Replica gets no messages from the Leader before the timeout, it 154 | /// initiates an election. In practice, pick election_timeout_range to be 155 | /// 2-3x the value of heartbeat_timeout, depending on your particular 156 | /// use-case network latency and responsiveness needs. An 157 | /// election_timeout_range / heartbeat_timeout ratio that's too low might 158 | /// cause unwarranted re-elections in the cluster. 159 | pub fn new( 160 | id: ReplicaID, 161 | peer_ids: Vec, 162 | cluster: Arc>, 163 | state_machine: Arc>, 164 | snapshot_delta: usize, 165 | noop_transition: T, 166 | heartbeat_timeout: Duration, 167 | election_timeout_range: (Duration, Duration), 168 | ) -> Replica { 169 | let snapshot = state_machine.lock().unwrap().get_snapshot(); 170 | // index_offset is the "length" of the snapshot, so calculate it as 171 | // snapshot.last_included_index + 1. 172 | let mut index_offset: usize = 0; 173 | let mut current_term: usize = 0; 174 | let mut log: Vec> = Vec::new(); 175 | if let Some(ref snapshot) = snapshot { 176 | index_offset = snapshot.last_included_index + 1; 177 | current_term = snapshot.last_included_term; 178 | } else { 179 | // If the Replica is starting anew, create a default no-op transition as 180 | // the very first entry in the log. This trick lets us make sure every 181 | // Replica has a non-empty log. If the Replica is starting from a 182 | // snapshot, initialize current log to empty. 
183 | log = vec![LogEntry { 184 | term: 0, 185 | index: 0, 186 | transition: noop_transition.clone(), 187 | }] 188 | } 189 | 190 | Replica { 191 | state_machine, 192 | cluster, 193 | peer_ids, 194 | id, 195 | current_term, 196 | current_votes: None, 197 | state: State::Follower, 198 | voted_for: None, 199 | log, 200 | noop_transition, 201 | commit_index: 0, 202 | last_applied: 0, 203 | next_index: BTreeMap::new(), 204 | match_index: BTreeMap::new(), 205 | election_timeout: election_timeout_range, 206 | heartbeat_timer: Timer::new(heartbeat_timeout), 207 | next_election_deadline: Instant::now(), 208 | snapshot, 209 | snapshot_delta, 210 | index_offset, 211 | } 212 | } 213 | 214 | /// This function starts the Replica and blocks forever. 215 | /// 216 | /// recv_msg is a channel on which the user must notify the Replica whenever 217 | /// new messages from the Cluster are available. The Replica will not poll 218 | /// for messages from the Cluster unless notified through recv_msg. 219 | /// 220 | /// recv_transition is a channel on which the user must notify the Replica 221 | /// whenever new transitions to be processed for the StateMachine are 222 | /// available. The Replica will not poll for pending transitions for the 223 | /// StateMachine unless notified through recv_transition. 
    pub fn start(&mut self, recv_msg: Receiver<()>, recv_transition: Receiver<()>) {
        loop {
            // The user-provided Cluster can request a halt at any time; this is
            // the only way out of the loop.
            if self.cluster.lock().unwrap().halt() {
                return;
            }

            // Poll for work according to the current Raft role.
            match self.state {
                State::Leader => self.poll_as_leader(&recv_msg, &recv_transition),
                State::Follower => self.poll_as_follower(&recv_msg),
                State::Candidate => self.poll_as_candidate(&recv_msg),
            }

            // Regardless of role, commit and apply whatever is ready.
            self.apply_ready_entries();
        }
    }

    // Leader event loop body: block on whichever of the three channels fires
    // first — new cluster messages, new pending transitions, or the heartbeat
    // timer — and handle that one event.
    fn poll_as_leader(&mut self, recv_msg: &Receiver<()>, recv_transition: &Receiver<()>) {
        let mut select = Select::new();
        let recv_heartbeat = self.heartbeat_timer.get_rx();
        let (msg, transition, heartbeat) = (
            select.recv(recv_msg),
            select.recv(recv_transition),
            select.recv(recv_heartbeat),
        );

        let oper = select.select();
        match oper.index() {
            // Process pending messages.
            i if i == msg => {
                // Complete the selected receive; recv_msg is only a
                // notification channel, the payload lives in the Cluster.
                oper.recv(recv_msg)
                    .expect("could not react to a new message");
                let messages = self.cluster.lock().unwrap().receive_messages();
                for message in messages {
                    self.process_message(message);
                }
            }
            // Process pending transitions.
            i if i == transition => {
                oper.recv(recv_transition)
                    .expect("could not react to a new transition");
                self.load_new_transitions();
                self.broadcast_append_entry_request();
            }
            // Broadcast heartbeat messages.
            i if i == heartbeat => {
                oper.recv(recv_heartbeat)
                    .expect("could not react to the heartbeat");
                self.broadcast_append_entry_request();
                self.heartbeat_timer.renew();
            }
            // select() only returns one of the three registered operations.
            _ => unreachable!(),
        }
    }

    // Send an AppendEntryRequest to every peer; if a peer is so far behind
    // that the entries it needs have been compacted away, send it the current
    // snapshot via InstallSnapshotRequest instead.
    fn broadcast_append_entry_request(&mut self) {
        self.broadcast_message(|peer_id: ReplicaID| {
            match self.get_term_at_index(self.next_index[&peer_id] - 1) {
                Ok(term) => Message::AppendEntryRequest {
                    from_id: self.id,
                    term: self.current_term,
                    prev_log_index: self.next_index[&peer_id] - 1,
                    prev_log_term: term,
                    entries: self.get_entries_for_peer(peer_id),
                    commit_index: self.commit_index,
                },
                Err(ReplicaError::LogCompacted) => {
                    // unwrap is safe: LogCompacted is only returned when a
                    // snapshot exists (see get_term_at_index).
                    let snapshot = self.snapshot.as_ref().unwrap();
                    Message::InstallSnapshotRequest {
                        from_id: self.id,
                        term: self.current_term,
                        last_included_index: snapshot.last_included_index,
                        last_included_term: snapshot.last_included_term,
                        // The whole snapshot is shipped in one message.
                        offset: 0,
                        data: snapshot.data.clone(),
                        done: true,
                    }
                }
            }
        });
    }

    // Look up the term of the log entry at the given global (pre-compaction)
    // index. Returns Err(LogCompacted) when the index falls strictly before
    // the snapshot's last included index and the entry no longer exists.
    fn get_term_at_index(&self, index: usize) -> Result {
        if let Some(snapshot) = &self.snapshot {
            if index == snapshot.last_included_index {
                return Ok(snapshot.last_included_term);
            } else if index > snapshot.last_included_index {
                // Translate the global index into a position in the in-memory
                // log, which starts at index_offset.
                let localized_index = index - self.index_offset;
                return Ok(self.log[localized_index].term);
            }
            Err(ReplicaError::LogCompacted)
        } else {
            // No snapshot yet: global and local indices coincide.
            Ok(self.log[index].term)
        }
    }

    // Follower event loop body: wait for a message notification until the
    // election deadline; on timeout, stand for election.
    fn poll_as_follower(&mut self, recv_msg: &Receiver<()>) {
        match recv_msg.recv_deadline(self.next_election_deadline) {
            // Process pending messages.
            Ok(_) => {
                let messages = self.cluster.lock().unwrap().receive_messages();
                // Update the election deadline if more than zero messages were
                // actually received.
                if !messages.is_empty() {
                    self.update_election_deadline();
                }

                for message in messages {
                    self.process_message(message);
                }
            }
            // Become a candidate and update the election deadline.
            _ => {
                self.become_candidate();
                self.update_election_deadline();
            }
        }

        // Load new transitions. The follower will ignore these transitions, but
        // they are still polled for periodically to ensure there are no stale
        // transitions in case the Replica's state changes.
        self.load_new_transitions();
    }

    // Dispatch a single message to the handler for the current role.
    fn process_message(&mut self, message: Message) {
        match self.state {
            State::Leader => self.process_message_as_leader(message),
            State::Candidate => self.process_message_as_candidate(message),
            State::Follower => self.process_message_as_follower(message),
        }
    }

    fn update_election_deadline(&mut self) {
        // Randomize each election deadline within the allowed range.
        self.next_election_deadline = Instant::now()
            + rand::thread_rng().gen_range(self.election_timeout.0..=self.election_timeout.1);
    }

    // Candidate event loop body: same shape as poll_as_follower — wait for
    // messages until the election deadline, and start a fresh election round
    // (incrementing the term) on timeout.
    fn poll_as_candidate(&mut self, recv_msg: &Receiver<()>) {
        match recv_msg.recv_deadline(self.next_election_deadline) {
            Ok(_) => {
                // Process pending messages.
                let messages = self.cluster.lock().unwrap().receive_messages();
                // Update the election deadline if more than zero messages were
                // actually received.
                if !messages.is_empty() {
                    self.update_election_deadline();
                }
                for message in messages {
                    self.process_message(message);
                }
            }
            // Become a candidate and update the election deadline.
            _ => {
                self.become_candidate();
                self.update_election_deadline();
            }
        }

        // Load new transitions. The candidate will ignore these transitions,
        // but they are still polled for periodically to ensure there are no
        // stale transitions in case the Replica's state changes.
        self.load_new_transitions();
    }

    // Send a per-peer message (produced by message_generator from the peer id)
    // to every peer in the cluster.
    fn broadcast_message(&self, message_generator: F)
    where
        F: Fn(usize) -> Message,
    {
        self.peer_ids.iter().for_each(|peer_id| {
            self.cluster
                .lock()
                .unwrap()
                .send_message(*peer_id, message_generator(*peer_id))
        });
    }

    // Get log entries that have not been acknowledged by the peer.
    // next_index holds a global index; subtracting index_offset localizes it
    // into the in-memory log.
    fn get_entries_for_peer(&self, peer_id: ReplicaID) -> Vec> {
        // TODO: double check
        self.log[self.next_index[&peer_id] - self.index_offset..self.log.len()].to_vec()
    }

    // Apply entries that are ready to be applied.
    // As Leader, first advance commit_index to the highest index replicated on
    // a majority (restricted to entries from the current term, per Raft §5.4.2);
    // then, in any role, apply everything between last_applied and
    // commit_index, and finally compact the log if snapshotting is enabled.
    fn apply_ready_entries(&mut self) {
        if self.log.is_empty() {
            return;
        }

        // Move the commit index to the latest log index that has been
        // replicated on the majority of the replicas.
        let mut state_machine = self.state_machine.lock().unwrap();
        let mut n = self.log.len() - 1 + self.index_offset;
        if self.state == State::Leader && self.commit_index < n {
            let old_commit_index = self.commit_index;
            // Scan down from the last log index; the first (highest) index
            // with majority replication and a current-term entry wins.
            while n > self.commit_index {
                let num_replications =
                    self.match_index.iter().fold(
                        0,
                        |acc, mtch_idx| if mtch_idx.1 >= &n { acc + 1 } else { acc },
                    );

                // num_replications counts peers only; the leader itself always
                // has the entry, so num_replications * 2 >= peers is a cluster
                // majority. Only entries from the current term may be
                // committed by counting replicas.
                if num_replications * 2 >= self.peer_ids.len()
                    && self.log[n - self.index_offset].term == self.current_term
                {
                    self.commit_index = n;
                }
                n -= 1;
            }

            // Notify the StateMachine of every newly committed transition.
            for i in old_commit_index + 1..=self.commit_index {
                state_machine.register_transition_state(
                    self.log[i - self.index_offset].transition.get_id(),
                    TransitionState::Committed,
                );
            }
        }

        // Apply entries that are behind the currently committed index.
        while self.commit_index > self.last_applied {
            self.last_applied += 1;
            let local_idx = self.last_applied - self.index_offset;
            state_machine.apply_transition(self.log[local_idx].transition.clone());
            state_machine.register_transition_state(
                self.log[local_idx].transition.get_id(),
                TransitionState::Applied,
            );
        }

        // If snapshot_delta is greater than 0, check whether it's time for log
        // compaction.
        if self.snapshot_delta > 0 {
            // Calculate number of applied logs that haven't been compacted yet.
            let curr_delta = self.last_applied + 1 - self.index_offset;
            // If the number of accumulated logs is greater than or equal to the
            // configured delta, do compaction.
            if curr_delta >= self.snapshot_delta {
                let last_applied = self.last_applied;
                self.snapshot = Some(state_machine.create_snapshot(
                    last_applied,
                    self.log[last_applied - self.index_offset].term,
                ));
                // Drop everything now covered by the snapshot and shift the
                // global-to-local index mapping accordingly.
                self.log.retain(|l| l.index > last_applied);
                self.index_offset = last_applied + 1;
            }
        }
    }

    fn load_new_transitions(&mut self) {
        // Load new transitions. Ignore the transitions if the replica is not
        // the Leader.
        let mut state_machine = self.state_machine.lock().unwrap();
        let transitions = state_machine.get_pending_transitions();
        for transition in transitions {
            if self.state == State::Leader {
                // Append to the log under the current term; the entry's global
                // index is the local length plus the compaction offset.
                self.log.push(LogEntry {
                    index: self.log.len() + self.index_offset,
                    transition: transition.clone(),
                    term: self.current_term,
                });

                state_machine
                    .register_transition_state(transition.get_id(), TransitionState::Queued);
            } else {
                // Non-leaders reject user transitions outright so the caller
                // learns to resubmit to the actual Leader.
                state_machine.register_transition_state(
                    transition.get_id(),
                    TransitionState::Abandoned(TransitionAbandonedReason::NotLeader),
                );
            }
        }
    }

    // Leader-side handling of responses: track per-peer replication progress
    // and step down if a higher term is observed.
    fn process_message_as_leader(&mut self, message: Message) {
        match message {
            Message::AppendEntryResponse {
                from_id,
                term,
                success,
                last_index,
                mismatch_index,
            } => {
                if term > self.current_term {
                    // Become follower if another node's term is higher.
                    self.cluster.lock().unwrap().register_leader(None);
                    self.become_follower(term);
                } else if success {
                    // Update information about the peer's logs.
                    self.next_index.insert(from_id, last_index + 1);
                    self.match_index.insert(from_id, last_index);
                } else {
                    // Update information about the peer's logs.
                    //
                    // If the mismatch_index is greater than or equal to the
                    // existing next_index, then we know that this rejection is a
                    // stray out-of-order or duplicate rejection, which we can
                    // ignore. The reason we know that is because mismatch_index is
                    // set by the follower to prev_log_index, which was in turn set
                    // by the leader to next_index-1. Hence mismatch_index can't be
                    // greater than or equal to next_index.
                    //
                    // If the mismatch_index isn't stray, we set next_index to the
                    // min of next_index and last_index; this is equivalent to the
                    // Raft paper's guidance on decreasing next_index by one at a
                    // time, but is more performant in cases when we can cut
                    // straight to the follower's last_index+1.
                    if let Some(mismatch_index) = mismatch_index {
                        if mismatch_index < self.next_index[&from_id] {
                            let next_index = cmp::min(mismatch_index, last_index + 1);
                            self.next_index.insert(from_id, next_index);
                        }
                    }
                }
            }
            Message::InstallSnapshotResponse {
                from_id,
                term,
                last_included_index,
            } => {
                if term > self.current_term {
                    // Become follower if another node's term is higher.
                    self.cluster.lock().unwrap().register_leader(None);
                    self.become_follower(term);
                } else {
                    // The peer now has everything up to the snapshot boundary.
                    self.next_index.insert(from_id, last_included_index + 1);
                    self.match_index.insert(from_id, last_included_index);
                }
            }
            // Leaders ignore requests/votes addressed to other roles.
            _ => {}
        }
    }

    // Decide whether to grant a vote: never vote for a stale term, adopt a
    // higher term, and grant at most one vote per term to a candidate whose
    // log is at least as complete as ours.
    fn process_vote_request_as_follower(
        &mut self,
        from_id: ReplicaID,
        term: usize,
        last_log_index: usize,
        last_log_term: usize,
    ) {
        match self.current_term.cmp(&term) {
            Ordering::Greater => {
                // Do not vote for Replicas that are behind.
                self.cluster.lock().unwrap().send_message(
                    from_id,
                    Message::VoteResponse {
                        from_id: self.id,
                        term: self.current_term,
                        vote_granted: false,
                    },
                );
            }
            Ordering::Less => {
                // Become a follower if the other replica's term is higher.
                self.cluster.lock().unwrap().register_leader(None);
                self.become_follower(term);
            }
            _ => {}
        }

        // NOTE(review): the Raft paper's up-to-date check (§5.4.1) grants the
        // vote when the candidate's last term is strictly greater, OR the
        // terms are equal and the candidate's last index is >= ours. Requiring
        // both index <= and term <= as below differs for a candidate with a
        // higher last term but shorter log — verify against the paper.
        let self_last_log_index = self.get_last_log_index();
        let self_last_log_term = self.get_last_log_term();
        if (self.voted_for == None || self.voted_for == Some(from_id))
            && self_last_log_index <= last_log_index
            && self_last_log_term <= last_log_term
        {
            // If the criteria are met, grant the vote.
            let mut cluster = self.cluster.lock().unwrap();
            cluster.register_leader(None);
            cluster.send_message(
                from_id,
                Message::VoteResponse {
                    from_id: self.id,
                    term: self.current_term,
                    vote_granted: true,
                },
            );
            self.voted_for = Some(from_id);
            return;
        }

        // If the criteria are not met or if already voted for someone else, do
        // not grant the vote.
        self.cluster.lock().unwrap().send_message(
            from_id,
            Message::VoteResponse {
                from_id: self.id,
                term: self.current_term,
                vote_granted: false,
            },
        );
    }

    // Install a snapshot pushed by the Leader: reject stale terms, otherwise
    // replace local state up to last_included_index with the snapshot and
    // acknowledge. offset/done are unused because snapshots arrive whole.
    fn process_install_snapshot_request_as_follower(
        &mut self,
        from_id: ReplicaID,
        term: usize,
        last_included_index: usize,
        last_included_term: usize,
        _offset: usize,
        data: D,
        _done: bool,
    ) {
        if self.current_term > term {
            // Stale sender: tell it our term and where our log ends.
            self.cluster.lock().unwrap().send_message(
                from_id,
                Message::InstallSnapshotResponse {
                    from_id: self.id,
                    term: self.current_term,
                    last_included_index: self.get_last_log_index(),
                },
            );
            return;
        }

        let snapshot = Snapshot {
            last_included_index,
            last_included_term,
            data,
        };

        // Retain only logs not already in the snapshot. These logs are
        // guaranteed to not be committed yet (otherwise we wouldn't be
        // receiving the snapshot in the first place), so it is correct to
        // restore StateMachine state from the snapshot.
        let mut state_machine = self.state_machine.lock().unwrap();
        self.log.retain(|l| l.index > last_included_index);
        state_machine.set_snapshot(snapshot.clone());
        self.snapshot = Some(snapshot);
        self.index_offset = last_included_index + 1;
        self.commit_index = last_included_index;
        self.last_applied = last_included_index;
        // It is likely that the snapshot contained new information, so we need
        // to update our current term.
        // NOTE(review): this sets current_term from the last log/snapshot
        // term rather than from the sender's `term` argument; confirm this
        // cannot move current_term backwards relative to `term`.
        self.current_term = self.get_last_log_term();
        self.cluster.lock().unwrap().send_message(
            from_id,
            Message::InstallSnapshotResponse {
                from_id: self.id,
                term: self.current_term,
                last_included_index: self.get_last_log_index(),
            },
        );
    }

    // Handle a Leader's AppendEntryRequest: verify term and log consistency
    // at prev_log_index/prev_log_term, append the new entries, advance the
    // local commit index, and acknowledge.
    fn process_append_entry_request_as_follower(
        &mut self,
        from_id: ReplicaID,
        term: usize,
        prev_log_index: usize,
        prev_log_term: usize,
        entries: Vec>,
        commit_index: usize,
    ) {
        // Check that the leader's term is at least as large as ours.
        if self.current_term > term {
            self.cluster.lock().unwrap().send_message(
                from_id,
                Message::AppendEntryResponse {
                    from_id: self.id,
                    term: self.current_term,
                    success: false,
                    last_index: self.get_last_log_index(),
                    mismatch_index: None,
                },
            );
            return;
        }

        // If our log doesn't contain an entry at prev_log_index with the
        // prev_log_term term, reply false.
        if prev_log_index >= self.log.len() + self.index_offset
            || self.get_term_at_index(prev_log_index).unwrap() != prev_log_term
        {
            // mismatch_index tells the Leader where to back its next_index to.
            self.cluster.lock().unwrap().send_message(
                from_id,
                Message::AppendEntryResponse {
                    from_id: self.id,
                    term: self.current_term,
                    success: false,
                    last_index: self.get_last_log_index(),
                    mismatch_index: Some(prev_log_index),
                },
            );
            return;
        }

        self.process_entries(entries);

        // Update local commit index to either the received commit index or the
        // latest local log position, whichever is smaller.
        if commit_index > self.commit_index && !self.log.is_empty() {
            self.commit_index = cmp::min(commit_index, self.log[self.log.len() - 1].index);
        }

        let mut cluster = self.cluster.lock().unwrap();
        cluster.register_leader(Some(from_id));
        cluster.send_message(
            from_id,
            Message::AppendEntryResponse {
                from_id: self.id,
                term: self.current_term,
                success: true,
                last_index: self.get_last_log_index(),
                mismatch_index: None,
            },
        );
    }

    // Reconcile received entries with the local log: truncate conflicting
    // suffixes (marking their transitions abandoned) and append entries that
    // extend the log.
    fn process_entries(&mut self, entries: Vec>) {
        let mut state_machine = self.state_machine.lock().unwrap();
        for entry in entries {
            // Drop local inconsistent logs.
            // NOTE(review): entry.index is a global index while self.log is
            // indexed locally; elsewhere this file subtracts index_offset
            // before indexing/truncating. Verify the loop bounds and
            // truncate(entry.index) below are correct once compaction has
            // occurred (index_offset > 0).
            if entry.index <= self.get_last_log_index()
                && entry.term != self.get_term_at_index(entry.index).unwrap()
            {
                for i in entry.index..self.log.len() {
                    state_machine.register_transition_state(
                        self.log[i].transition.get_id(),
                        TransitionState::Abandoned(TransitionAbandonedReason::ConflictWithLeader),
                    );
                }
                self.log.truncate(entry.index);
            }

            // Push received logs.
            if entry.index == self.log.len() + self.index_offset {
                self.log.push(entry);
            }
        }
    }

    // Follower-side dispatch: unpack each request type and forward it to the
    // matching handler; responses are ignored in this role.
    fn process_message_as_follower(&mut self, message: Message) {
        match message {
            Message::VoteRequest {
                from_id,
                term,
                last_log_index,
                last_log_term,
            } => {
                self.process_vote_request_as_follower(from_id, term, last_log_index, last_log_term)
            }
            Message::AppendEntryRequest {
                term,
                from_id,
                prev_log_index,
                prev_log_term,
                entries,
                commit_index,
            } => self.process_append_entry_request_as_follower(
                from_id,
                term,
                prev_log_index,
                prev_log_term,
                entries,
                commit_index,
            ),
            Message::InstallSnapshotRequest {
                from_id,
                term,
                last_included_index,
                last_included_term,
                offset,
                data,
                done,
            } => self.process_install_snapshot_request_as_follower(
                from_id,
                term,
                last_included_index,
                last_included_term,
                offset,
                data,
                done,
            ),
            _ => { /* ignore */ }
        }
    }

    // Candidate-side dispatch: requests may demote us to follower, vote
    // responses feed the election tally.
    fn process_message_as_candidate(&mut self, message: Message) {
        match message {
            Message::AppendEntryRequest { term, from_id, .. } => {
                self.process_append_entry_request_as_candidate(term, from_id, message)
            }
            Message::VoteRequest { term, from_id, .. } => {
                self.process_vote_request_as_candidate(term, from_id, message)
            }
            Message::VoteResponse {
                from_id,
                term,
                vote_granted,
            } => self.process_vote_response_as_candidate(from_id, term, vote_granted),
            Message::InstallSnapshotRequest { from_id, term, .. } => {
                self.process_install_snapshot_request_as_candidate(from_id, term, message)
            }
            _ => { /* ignore */ }
        }
    }

    fn process_install_snapshot_request_as_candidate(
        &mut self,
        from_id: ReplicaID,
        term: usize,
        message: Message,
    ) {
        // If the term is greater or equal to current term, then there's an
        // active Leader, so convert self to a follower. If the term is smaller
        // than the current term, inform the sender of your current term.
        if term >= self.current_term {
            self.cluster.lock().unwrap().register_leader(None);
            self.become_follower(term);
            // Re-dispatch so the follower handler installs the snapshot.
            self.process_message(message);
        } else {
            self.cluster.lock().unwrap().send_message(
                from_id,
                Message::InstallSnapshotResponse {
                    from_id: self.id,
                    last_included_index: self.get_last_log_index(),
                    term: self.current_term,
                },
            );
        }
    }

    // Tally an incoming vote; a strict cluster majority (self included)
    // promotes this replica to Leader.
    fn process_vote_response_as_candidate(
        &mut self,
        from_id: ReplicaID,
        term: usize,
        vote_granted: bool,
    ) {
        if term > self.current_term {
            self.cluster.lock().unwrap().register_leader(None);
            self.become_follower(term);
        } else if vote_granted && term == self.current_term {
            // Record that the vote has been granted.
            if let Some(cur_votes) = &mut self.current_votes {
                cur_votes.insert(from_id);
                // If more than half of the cluster has voted for the Replica
                // (the Replica itself included), it's time to become the
                // Leader.
                if cur_votes.len() * 2 > self.peer_ids.len() {
                    self.become_leader();
                }
            }
        }
    }

    // A competing candidate asked for our vote: defer to a higher term
    // (become follower and reprocess), otherwise refuse.
    fn process_vote_request_as_candidate(
        &mut self,
        term: usize,
        from_id: ReplicaID,
        message: Message,
    ) {
        if term > self.current_term {
            self.cluster.lock().unwrap().register_leader(None);
            self.become_follower(term);
            self.process_message(message);
        } else {
            self.cluster.lock().unwrap().send_message(
                from_id,
                Message::VoteResponse {
                    from_id: self.id,
                    term: self.current_term,
                    vote_granted: false,
                },
            );
        }
    }

    // A Leader with term >= ours emerged while we were campaigning: step down
    // and reprocess the request as a follower; otherwise reject it.
    fn process_append_entry_request_as_candidate(
        &mut self,
        term: usize,
        from_id: ReplicaID,
        message: Message,
    ) {
        if term >= self.current_term {
            self.cluster.lock().unwrap().register_leader(None);
            self.become_follower(term);
            self.process_message(message);
        } else {
            self.cluster.lock().unwrap().send_message(
                from_id,
                Message::AppendEntryResponse {
                    from_id: self.id,
                    term: self.current_term,
                    success: false,
                    last_index: self.get_last_log_index(),
                    mismatch_index: None,
                },
            );
        }
    }

    // Transition to Leader: reset election bookkeeping, initialize per-peer
    // replication state, and append a no-op entry for the new term.
    fn become_leader(&mut self) {
        self.cluster.lock().unwrap().register_leader(Some(self.id));
        self.state = State::Leader;
        self.current_votes = None;
        self.voted_for = None;
        self.next_index = BTreeMap::new();
        self.match_index = BTreeMap::new();
        for peer_id in &self.peer_ids {
            // Optimistically assume each peer is caught up; AppendEntry
            // rejections will walk next_index back as needed.
            self.next_index
                .insert(*peer_id, self.log.len() + self.index_offset);
            self.match_index.insert(*peer_id, 0);
        }

        // If the previous Leader had some uncommitted entries that were
        // replicated to this now-Leader server, this replica will not commit
        // them until its commit index advances to a log entry appended in this
        // Leader's term. To carry out this operation as soon as the new Leader
        // emerges, append a no-op entry. This is a neat optimization described
        // in the part 8 of the paper.
        self.log.push(LogEntry {
            index: self.log.len() + self.index_offset,
            transition: self.noop_transition.clone(),
            term: self.current_term,
        });
    }

    // Transition to Follower at the given (higher or equal) term, clearing
    // any in-flight election state.
    fn become_follower(&mut self, term: usize) {
        self.current_term = term;
        self.state = State::Follower;
        self.current_votes = None;
        self.voted_for = None;
    }

    fn become_candidate(&mut self) {
        // Increase current term.
        self.current_term += 1;
        // Claim yourself a candidate.
        self.state = State::Candidate;
        // Initialize votes. Vote for yourself.
        let mut votes = BTreeSet::new();
        votes.insert(self.id);
        self.current_votes = Some(Box::new(votes));
        self.voted_for = Some(self.id);
        // Fan out vote requests.
        self.broadcast_message(|_: usize| Message::VoteRequest {
            from_id: self.id,
            term: self.current_term,
            last_log_index: self.get_last_log_index(),
            last_log_term: self.get_last_log_term(),
        });

        // A single-node cluster wins its election immediately.
        if self.peer_ids.is_empty() {
            self.become_leader();
        }
    }

    // Global index of the last log entry; if the log is empty, the last index
    // covered by the snapshot (index_offset - 1).
    fn get_last_log_index(&self) -> usize {
        if let Some(log) = self.log.last() {
            log.index
        } else {
            self.index_offset - 1
        }
    }

    // Term of the last log entry; if the log is empty, the snapshot's last
    // included term. unwrap is safe only when an empty log implies a snapshot
    // exists — which holds because the log is seeded with a no-op entry and
    // only emptied by compaction.
    fn get_last_log_term(&self) -> usize {
        if let Some(log) = self.log.last() {
            log.term
        } else {
            self.snapshot.as_ref().unwrap().last_included_term
        }
    }
}