├── .gitignore
├── .travis.yml
├── src
├── utils
│   ├── mod.rs
│   ├── time.rs
│   ├── math.rs
│   ├── serde.rs
│   └── bindings.rs
├── hasher
│   ├── Cargo.toml
│   └── src
│   │   └── lib.rs
├── plugins
│   ├── Cargo.toml
│   └── src
│   │   └── lib.rs
├── proc_macro
│   ├── Cargo.toml
│   └── src
│   │   └── lib.rs
├── tcp
│   ├── mod.rs
│   ├── shortcut.rs
│   ├── client.rs
│   └── server.rs
├── lib.rs
├── raft
│   ├── state_machine
│   │   ├── mod.rs
│   │   ├── callback
│   │   │   ├── client.rs
│   │   │   ├── mod.rs
│   │   │   └── server.rs
│   │   ├── configs.rs
│   │   ├── master.rs
│   │   └── macros.rs
│   └── disk.rs
├── rpc
│   ├── cluster.rs
│   └── proto.rs
├── conshash
│   └── weights.rs
├── membership
│   ├── member.rs
│   ├── client.rs
│   └── mod.rs
└── vector_clock
│   └── mod.rs
├── Cargo.toml
├── LICENSE
├── examples
└── graceful_shutdown.rs
├── README.md
├── RECOVERY_IMPROVEMENTS.md
├── MEMBERSHIP_GUIDE.md
├── tests
├── single_node_recovery_test.rs
└── graceful_shutdown_tests.rs
└── SNAPSHOT_GUIDE.md

/.gitignore:
--------------------------------------------------------------------------------
target
Cargo.lock
.idea/
bifrost.iml
--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
language: rust

rust:
- nightly
--------------------------------------------------------------------------------
/src/utils/mod.rs:
--------------------------------------------------------------------------------
pub mod time;
#[macro_use]
pub mod bindings;
pub mod math;
pub mod serde;
--------------------------------------------------------------------------------
/src/hasher/Cargo.toml:
--------------------------------------------------------------------------------
[package]
name = "bifrost_hasher"
version = "0.1.0"
authors = ["Hao Shi "]

[lib]
name = "bifrost_hasher"

[dependencies]
twox-hash = "1"
--------------------------------------------------------------------------------
/src/plugins/Cargo.toml:
--------------------------------------------------------------------------------
[package]
name = "bifrost_plugins"
version = "0.1.0"
authors = ["Hao Shi "]

[lib]
proc-macro = true

[dependencies]
bifrost_hasher = { path = "../hasher" }
syn = "2"
quote = "1"
--------------------------------------------------------------------------------
/src/proc_macro/Cargo.toml:
--------------------------------------------------------------------------------
[package]
name = "bifrost_proc_macro"
version = "0.1.0"
edition = "2021"

# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html

[lib]
proc-macro = true

[dependencies]
syn = { version = "*", features = ["extra-traits"] }
quote = "*"
proc-macro2 = "*"
--------------------------------------------------------------------------------
/src/tcp/mod.rs:
--------------------------------------------------------------------------------
use bifrost_hasher::hash_str;

pub mod client;
pub mod server;
pub mod shortcut;

pub static STANDALONE_ADDRESS: &'static str = "STANDALONE";

lazy_static! {
    pub static ref STANDALONE_ADDRESS_STRING: String = String::from(STANDALONE_ADDRESS);
    pub static ref STANDALONE_SERVER_ID: u64 = hash_str(&STANDALONE_ADDRESS_STRING);
}
--------------------------------------------------------------------------------
/src/lib.rs:
--------------------------------------------------------------------------------
#![crate_type = "lib"]
#![feature(proc_macro_hygiene)]
#![feature(trait_alias)]

#[cfg(disable_shortcut)]
pub static DISABLE_SHORTCUT: bool = true;

#[cfg(not(disable_shortcut))]
pub static DISABLE_SHORTCUT: bool = false;

#[macro_use]
pub mod utils;
pub mod tcp;
#[macro_use]
pub mod rpc;
#[macro_use]
pub mod raft;
pub mod conshash;
pub mod membership;
pub mod vector_clock;

#[macro_use]
extern crate log;

#[macro_use]
extern crate lazy_static;
pub extern crate bytes;
--------------------------------------------------------------------------------
/src/hasher/src/lib.rs:
--------------------------------------------------------------------------------
use std::collections::hash_map::DefaultHasher;
use std::hash::Hasher;

extern crate twox_hash;

pub fn hash_bytes(bytes: &[u8]) -> u64 {
    let mut hasher = twox_hash::XxHash::default();
    hasher.write(bytes);
    hasher.finish()
}

pub fn hash_str<'a>(text: &'a str) -> u64 { // the same as the one in utils hash
    let text_bytes = text.as_bytes();
    hash_bytes(text_bytes)
}

pub fn hash_bytes_secondary(bytes: &[u8]) -> u64 {
    let mut hasher = DefaultHasher::default();
    hasher.write(bytes);
    hasher.finish()
}
--------------------------------------------------------------------------------
/src/utils/time.rs:
--------------------------------------------------------------------------------
use std::time::Duration;
use std::time::SystemTime;
use tokio::time::sleep;

pub fn get_time() -> i64 {
    // Get current time
    let current_time = SystemTime::now();
    let duration = current_time.duration_since(SystemTime::UNIX_EPOCH).unwrap();
    // Calculate milliseconds
    return duration_to_ms(duration) as i64;
}

pub fn duration_to_ms(duration: Duration) -> u64 {
    let nanos = duration.subsec_nanos() as u64;
    (1000 * 1000 * 1000 * duration.as_secs() + nanos) / (1000 * 1000)
}

pub async fn async_wait(duration: Duration) {
    sleep(duration).await;
}

pub async fn async_wait_secs() {
    async_wait(Duration::from_secs(2)).await;
}
--------------------------------------------------------------------------------
/src/plugins/src/lib.rs:
--------------------------------------------------------------------------------
extern crate proc_macro;
extern crate bifrost_hasher;
extern crate syn;

use proc_macro::TokenStream;
use bifrost_hasher::hash_str;
use proc_macro::TokenTree;
use syn::{parse_macro_input, LitStr};

#[proc_macro]
pub fn hash_ident(item: TokenStream) -> TokenStream {
    let item_clone = item.clone();
    let tokens: Vec<_> = item.into_iter().collect();
    if tokens.len() != 1 {
        panic!("argument should be a single identifier, but got {} arguments {:?}",
               tokens.len(), tokens);
    }
    let text = match tokens[0] {
        TokenTree::Ident(ref ident) => ident.to_string(),
        _ => parse_macro_input!(item_clone as LitStr).value(),
        // _ => panic!("argument only support ident or string literal, found '{:?}', parsing {:?}", tokens, tokens[0])
    };
    let text = &*text;
    let str = String::from(text);
    format!("{}", hash_str(&str)).parse().unwrap()
}
--------------------------------------------------------------------------------
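For reference, a minimal usage sketch of `hash_ident!` (the names `MY_SERVICE` and `"my-topic"` are illustrative, not part of bifrost; the same pattern appears verbatim in `src/conshash/weights.rs` as `hash_ident!(BIFROST_DHT_WEIGHTS) as u64`):

```rust
use bifrost_plugins::hash_ident;

// hash_ident! accepts a single identifier or string literal and expands to a
// stable u64 literal computed at compile time via bifrost_hasher::hash_str.
pub static MY_SERVICE_ID: u64 = hash_ident!(MY_SERVICE) as u64;
pub static MY_TOPIC_ID: u64 = hash_ident!("my-topic") as u64;
```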
/Cargo.toml:
--------------------------------------------------------------------------------
[package]
name = "bifrost"
version = "0.1.0"
authors = ["Hao Shi "]
edition = "2018"

[lib]
name = "bifrost"

[dependencies]
serde_cbor = "0.11.1"
serde_json = "1.0.51"
byteorder = "1"
log = "*"
serde = { version = "1.0", features = ["derive"] }
bifrost_plugins = { path = "src/plugins" }
bifrost_hasher = { path = "src/hasher" }
bifrost_proc_macro = { path = "src/proc_macro" }
rand = "*"

lazy_static = "*"
threadpool = "1"
num_cpus = "1"
parking_lot = {version = "*", features = ["nightly"]}
thread-id = "5"

tokio = { version = "1", features = ["full"] }
tokio-util = {version = "0.7", features = ["full"]}
tokio-stream = "0.1"
bytes = "1"
crc32fast = "*"

futures = {version = "0.3", features = ["executor", "thread-pool"] }
futures-timer = "3"
async-std = "1"
lightning-containers = { git = "ssh://git@192.168.10.134/shisoft-x/Lightning.git", branch = "develop" }

[dev-dependencies]
env_logger = "*"
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2020 Hao Shi

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
--------------------------------------------------------------------------------
/src/raft/state_machine/mod.rs:
--------------------------------------------------------------------------------
use crate::raft::client::RaftClient;
use std::any::Any;
use std::sync::Arc;

pub enum Storage {
    MEMORY,
    DISK(String),
}

#[derive(Debug)]
pub enum OpType {
    COMMAND,
    QUERY,
    SUBSCRIBE,
}

pub trait StateMachineCtl: Sync + Send + Any {
    fn id(&self) -> u64;
    fn snapshot(&self) -> Vec<u8>;
    fn recover(&mut self, data: Vec<u8>) -> ::futures::future::BoxFuture<()>;
    fn recoverable(&self) -> bool;
    fn fn_dispatch_qry<'a>(
        &'a self,
        fn_id: u64,
        data: &'a Vec<u8>,
    ) -> ::futures::future::BoxFuture<'a, Option<Vec<u8>>>;
    fn fn_dispatch_cmd<'a>(
        &'a mut self,
        fn_id: u64,
        data: &'a Vec<u8>,
    ) -> ::futures::future::BoxFuture<'a, Option<Vec<u8>>>;
    fn op_type(&mut self, fn_id: u64) -> Option<OpType>;
}

pub trait OpTypes {
    fn op_type(&self, fn_id: u64) -> Option<OpType>;
}

pub trait StateMachineClient {
    fn new_instance(sm_id: u64, client: &Arc<RaftClient>) -> Self;
}

#[macro_use]
pub mod macros;
pub mod callback;
pub mod configs;
pub mod master;
--------------------------------------------------------------------------------
/src/tcp/shortcut.rs:
--------------------------------------------------------------------------------
use crate::tcp::server::{TcpReq, TcpRes};
use async_std::sync::*;
use bifrost_hasher::hash_str;
use bytes::BytesMut;
use std::collections::BTreeMap;
use std::io::{Error, ErrorKind, Result};
use std::sync::Arc;

trait TcpCallbackFunc = Fn(TcpReq) -> TcpRes;
trait TcpCallbackFuncShareable = TcpCallbackFunc + Send + Sync;

lazy_static! {
    pub static ref TCP_CALLBACKS: RwLock<BTreeMap<u64, Arc<dyn TcpCallbackFuncShareable>>> =
        RwLock::new(BTreeMap::new());
}

pub async fn register_server(
    server_address: &String,
    callback: &Arc<dyn TcpCallbackFuncShareable>,
) {
    let server_id = hash_str(server_address);
    let mut servers_cbs = TCP_CALLBACKS.write().await;
    servers_cbs.insert(server_id, callback.clone());
}

pub async fn call(server_id: u64, data: TcpReq) -> Result<BytesMut> {
    let server_cbs = TCP_CALLBACKS.read().await;
    match server_cbs.get(&server_id) {
        Some(c) => Ok(c(data).await),
        _ => Err(Error::new(
            ErrorKind::Other,
            "Cannot find callback for shortcut",
        )),
    }
}

pub async fn is_local(server_id: u64) -> bool {
    let cbs = TCP_CALLBACKS.read().await;
    cbs.contains_key(&server_id)
}
--------------------------------------------------------------------------------
/src/utils/math.rs:
--------------------------------------------------------------------------------
pub fn min<T>(nums: &Vec<T>) -> Option<T>
where
    T: Ord + Copy,
{
    nums.iter().fold(None, |min, x| match min {
        None => Some(*x),
        Some(y) => Some(if *x < y { *x } else { y }),
    })
}
pub fn max<T>(nums: &Vec<T>) -> Option<T>
where
    T: Ord + Copy,
{
    nums.iter().fold(None, |max, x| match max {
        None => Some(*x),
        Some(y) => Some(if *x > y { *x } else { y }),
    })
}
pub fn avg_scale(nums: &Vec<u64>) -> Option<u64> {
    if nums.len() > 0 {
        let count = nums.len() as u64;
        //let max_num = max(nums).unwrap();
        let min_num = min(nums).unwrap();
        let sum: u64 = nums.iter().sum();
        let mid_abs = (sum - (min_num * count)) / count;
        return Some(min_num + mid_abs);
    }
    return None;
}

#[cfg(test)]
mod test {
    use crate::utils::math;

    #[test]
    fn max() {
        assert_eq!(math::max(&vec!(1, 2, 3, 4, 5)).unwrap(), 5);
        assert_eq!(math::max(&vec!(1, 2, 9, 4, 5)).unwrap(), 9);
        assert_eq!(math::max(&Vec::<i32>::new()), None);
    }

    #[test]
    fn min() {
        assert_eq!(math::min(&vec!(1, 2, 3, 4, 5)).unwrap(), 1);
        assert_eq!(math::min(&vec!(1, 2, -10, 4, 5)).unwrap(), -10);
        assert_eq!(math::min(&Vec::<i32>::new()), None);
    }
}
--------------------------------------------------------------------------------
/src/utils/serde.rs:
--------------------------------------------------------------------------------
use bifrost_hasher::hash_bytes;
use serde;

#[cfg(not(debug_assertions))]
pub fn serialize<T>(obj: &T) -> Vec<u8>
where
    T: serde::Serialize,
{
    match serde_cbor::to_vec(obj) {
        Ok(data) => data,
        Err(e) => panic!("Cannot serialize: {:?}", e),
    }
}

#[cfg(not(debug_assertions))]
pub fn deserialize<'a, T>(data: &'a [u8]) -> Option<T>
where
    T: serde::Deserialize<'a>,
{
    match serde_cbor::from_slice(data) {
        Ok(obj) => Some(obj),
        Err(e) => {
            warn!(
                "Error on decoding data for type '{}', {}",
                std::any::type_name::<T>(),
                e
            );
            None
        }
    }
}

#[cfg(debug_assertions)]
pub fn serialize<T>(obj: &T) -> Vec<u8>
where
    T: serde::Serialize,
{
    match serde_json::to_vec(obj) {
        Ok(data) => data,
        Err(e) => panic!("Cannot serialize: {:?}", e),
    }
}

#[cfg(debug_assertions)]
pub fn deserialize<'a, T>(data: &'a [u8]) -> Option<T>
where
    T: serde::Deserialize<'a>,
{
    let type_name = std::any::type_name::<T>();

    match serde_json::from_slice(data) {
        Ok(obj) => Some(obj),
        Err(e) => {
            warn!(
                "Error on decoding data for type '{}', {}, json: {}",
                type_name,
                e,
                String::from_utf8_lossy(data)
            );
            None
        }
    }
}

pub fn hash<T>(obj: &T) -> u64
where
    T: serde::Serialize,
{
    let data = serialize(obj);
    hash_bytes(data.as_slice())
}
--------------------------------------------------------------------------------
/src/raft/state_machine/callback/client.rs:
--------------------------------------------------------------------------------
use super::*;
use crate::utils::time::get_time;
use async_std::sync::*;
use futures::future::BoxFuture;
use futures::stream::FuturesUnordered;
use std::collections::HashMap;
use std::sync::Arc;

trait SubFunc = Fn(Vec<u8>) -> BoxFuture<'static, ()>;
trait BoxedSubFunc = SubFunc + Send + Sync;

pub struct SubscriptionService {
    pub subs: RwLock<HashMap<SubKey, Vec<(Box<dyn BoxedSubFunc>, u64)>>>,
    pub server_address: String,
    pub session_id: u64,
}

impl Service for SubscriptionService {
    fn notify<'a>(&'a self, key: SubKey, data: &'a Vec<u8>) -> BoxFuture<'a, ()> {
        debug!("Received notification for key {:?}", key);
        async move {
            let subs = self.subs.read().await;
            if let Some(subs) = subs.get(&key) {
                let subs = Pin::new(subs);
                let futs: FuturesUnordered<_> = subs
                    .iter()
                    .map(|(fun, _)| {
                        let fun_pinned = Pin::new(fun);
                        fun_pinned(data.clone())
                    })
                    .collect();
                // Spawn async task DETACHED with the function to avoid deadlocks inside raft state machine
                tokio::spawn(async move {
                    let _: Vec<_> = futs.collect().await;
                });
            }
        }
        .boxed()
    }
}
dispatch_rpc_service_functions!(SubscriptionService);
service_with_id!(SubscriptionService, DEFAULT_SERVICE_ID);

impl SubscriptionService {
    pub async fn initialize(server: &Arc<Server>) -> Arc<SubscriptionService> {
        let service = Arc::new(SubscriptionService {
            subs: RwLock::new(HashMap::new()),
            server_address: server.address().clone(),
            session_id: get_time() as u64,
        });
        server.register_service(&service).await;
        service
    }
}
--------------------------------------------------------------------------------
/examples/graceful_shutdown.rs:
--------------------------------------------------------------------------------
/// Example demonstrating graceful shutdown of Bifrost services
///
/// This example shows how to:
/// 1. Start a Raft service with an RPC server
/// 2. Handle shutdown signals (Ctrl+C)
/// 3. Gracefully shut down all services
///
/// Run with: cargo run --example graceful_shutdown

use bifrost::raft::{RaftService, Options, Storage, DEFAULT_SERVICE_ID};
use bifrost::rpc::Server;
use std::sync::Arc;
use tokio::signal;

#[tokio::main]
async fn main() {
    env_logger::init();

    let address = "127.0.0.1:9000".to_string();

    println!("Starting Bifrost services on {}...", address);

    // Create Raft service
    let raft_service = RaftService::new(Options {
        storage: Storage::MEMORY,
        address: address.clone(),
        service_id: DEFAULT_SERVICE_ID,
    });

    // Create and start RPC server
    let server = Server::new(&address);
    Server::listen_and_resume(&server).await;
    server.register_service(&raft_service).await;

    // Start Raft service
    if RaftService::start(&raft_service, false).await {
        println!("Raft service started successfully");
        raft_service.bootstrap().await;
        println!("Raft cluster bootstrapped");
    } else {
        eprintln!("Failed to start Raft service");
        return;
    }

    println!("\nServices running. Press Ctrl+C to trigger graceful shutdown...\n");

    // Wait for Ctrl+C signal
    match signal::ctrl_c().await {
        Ok(()) => {
            println!("\n\nReceived Ctrl+C, initiating graceful shutdown...\n");
        }
        Err(err) => {
            eprintln!("Unable to listen for shutdown signal: {}", err);
            return;
        }
    }

    // Gracefully shut down all services
    println!("1. Shutting down Raft service...");
    raft_service.shutdown().await;
    println!("   ✓ Raft service shut down");

    println!("2. Shutting down RPC server...");
    server.shutdown().await;
    println!("   ✓ RPC server shut down");

    println!("\n✓ All services shut down gracefully\n");

    // Give a moment for any final log messages
    tokio::time::sleep(tokio::time::Duration::from_millis(100)).await;
}
--------------------------------------------------------------------------------
/src/rpc/cluster.rs:
--------------------------------------------------------------------------------
use std::{future::Future, sync::Arc};

use crate::{
    conshash::ConsistentHashing,
    raft::state_machine::master::ExecError,
    rpc::{RPCError, DEFAULT_CLIENT_POOL},
};
use futures::stream::FuturesUnordered;
use tokio_stream::StreamExt;

use super::{RPCClient, ServiceClientWithId};

pub async fn broadcast_to_members<C, R, F, Fut>(
    conshash: &Arc<ConsistentHashing>,
    func: F,
) -> Result<Vec<(u64, Result<R, RPCError>)>, ExecError>
where
    C: ServiceClientWithId,
    F: Fn(Arc<C>) -> Fut + Clone + Send + 'static,
    Fut: Future<Output = Result<R, RPCError>> + Send,
{
    let server_ids = all_server_ids(&conshash).await?;
    broadcast_with_server_ids(server_ids, &conshash, func).await
}

pub async fn all_server_ids(
    conshash: &Arc<ConsistentHashing>,
) -> Result<impl Iterator<Item = u64>, ExecError> {
    let (members, _) = conshash.membership().all_members(true).await?;
    Ok(members.into_iter().map(|m| m.id))
}

pub async fn broadcast_with_server_ids<I, C, R, F, Fut>(
    server_ids: I,
    conshash: &Arc<ConsistentHashing>,
    func: F,
) -> Result<Vec<(u64, Result<R, RPCError>)>, ExecError>
where
    I: Iterator<Item = u64>,
    C: ServiceClientWithId,
    F: Fn(Arc<C>) -> Fut + Clone + Send + 'static,
    Fut: Future<Output = Result<R, RPCError>> + Send,
{
    let member_futs: FuturesUnordered<_> = server_ids
        .map(|sid| {
            let func = func.clone();
            async move {
                let client = match client_by_server_id(&conshash, sid).await {
                    Ok(client) => client,
                    Err(e) => {
                        error!("Failed to get client by server id {}: {:?}", sid, e);
                        return (sid, Err(e));
                    }
                };
                return (sid, func(client).await);
            }
        })
        .collect();
    let results = member_futs.collect::<Vec<_>>().await;
    Ok(results)
}

pub async fn client_by_server_id<C>(
    conshash: &Arc<ConsistentHashing>,
    server_id: u64,
) -> Result<Arc<C>, RPCError>
where
    C: ServiceClientWithId,
{
    DEFAULT_CLIENT_POOL
        .get_by_id(server_id, move |sid| conshash.to_server_name(sid))
        .await
        .map_err(|e| RPCError::IOError(e))
        .map(|c| client_by_rpc_client(&c))
}

pub fn client_by_rpc_client<C>(client: &Arc<RPCClient>) -> Arc<C>
where
    C: ServiceClientWithId,
{
    C::new_with_service_id(C::SERVICE_ID, client)
}
--------------------------------------------------------------------------------
/src/conshash/weights.rs:
--------------------------------------------------------------------------------
use crate::raft::state_machine::StateMachineCtl;
use crate::raft::RaftService;
use bifrost_plugins::hash_ident;
use futures::FutureExt;
use std::collections::HashMap;
use std::sync::Arc;

pub static DEFAULT_SERVICE_ID: u64 = hash_ident!(BIFROST_DHT_WEIGHTS) as u64;

raft_state_machine! {
    def cmd set_weight(group: u64, id: u64, weight: u64);
    def qry get_weights(group: u64) -> Option<HashMap<u64, u64>>;
    def qry get_weight(group: u64, id: u64) -> Option<u64>;
}
pub struct Weights {
    pub groups: HashMap<u64, HashMap<u64, u64>>,
    pub id: u64,
}
impl StateMachineCmds for Weights {
    fn set_weight(&mut self, group: u64, id: u64, weight: u64) -> BoxFuture<()> {
        *self
            .groups
            .entry(group)
            .or_insert_with(|| HashMap::new())
            .entry(id)
            .or_insert_with(|| 0) = weight;
        future::ready(()).boxed()
    }
    fn get_weights(&self, group: u64) -> BoxFuture<Option<HashMap<u64, u64>>> {
        future::ready(match self.groups.get(&group) {
            Some(m) => Some(m.clone()),
            None => None,
        })
        .boxed()
    }
    fn get_weight(&self, group: u64, id: u64) -> BoxFuture<Option<u64>> {
        future::ready(match self.groups.get(&group) {
            Some(m) => match m.get(&id) {
                Some(w) => Some(*w),
                None => None,
            },
            None => None,
        })
        .boxed()
    }
}
impl StateMachineCtl for Weights {
    raft_sm_complete!();
    fn id(&self) -> u64 {
        self.id
    }
    fn snapshot(&self) -> Vec<u8> {
        crate::utils::serde::serialize(&self.groups)
    }
    fn recover(&mut self, data: Vec<u8>) -> BoxFuture<()> {
        match crate::utils::serde::deserialize::<HashMap<u64, HashMap<u64, u64>>>(data.as_slice()) {
            Some(groups) => self.groups = groups,
            None => {
                error!("Failed to deserialize weights state machine snapshot. Starting with empty groups.");
                self.groups.clear();
            }
        }
        future::ready(()).boxed()
    }
    fn recoverable(&self) -> bool {
        true
    }
}
impl Weights {
    pub async fn new_with_id(id: u64, raft_service: &Arc<RaftService>) {
        raft_service
            .register_state_machine(Box::new(Weights {
                groups: HashMap::new(),
                id,
            }))
            .await
    }
    pub async fn new(raft_service: &Arc<RaftService>) {
        Self::new_with_id(DEFAULT_SERVICE_ID, raft_service).await
    }
}
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# bifrost
[![Build Status](https://travis-ci.org/ShisoftResearch/bifrost.svg?branch=master)](https://travis-ci.org/ShisoftResearch/bifrost)

Pure Rust building block for distributed systems

### Objective

The objective of bifrost is to build a solid foundation for distributed systems in Rust.
It is similar to one of my Clojure projects, [cluster-connector](https://github.com/shisoft/cluster-connector), but it no longer requires any third-party software such as Zookeeper or etcd.
Bifrost will ship with its own reliable data store based on [raft consensus algorithm](https://raft.github.io/) state machines. Users are also able to build their own reliable data structures by implementing state machine commands.

**Bifrost is still in a very early stage of development, and it is not recommended for use in any kind of project until it is stabilized and fully tested**

### Progress Check List

- [ ] RPC
  - [x] TCP Server
    - [x] Protocol
    - [x] Event driven server
    - [x] Sync client
    - [x] Async client
  - [X] Multiplexing pluggable services
  - [X] Shortcut (for both TCP and RPC APIs)
- [ ] Raft (data replication)
  - [x] Leader election
  - [x] Log replication
  - [x] Master/subs state machine framework
  - [ ] State machine client
    - [x] Sync
    - [x] PubSub
  - [ ] Master state machine snapshot
    - [x] Generate
    - [x] Install
    - [ ] Generate in chunks
    - [ ] Install in chunks
    - [ ] Automation
    - [ ] Persist to disk
    - [ ] Recover from disk
    - [ ] Incremental snapshot
  - [ ] Membership changes
    - [x] State machine
      - [x] New Member
      - [x] Delete Member
      - [x] Snapshot
      - [x] Recover
    - [X] Interfaces
    - [X] Update procedures
  - [x] Cluster bootstrap
  - [x] Client
    - [x] Command
    - [x] Query
    - [x] Concurrency
    - [x] Failover
    - [x] Membership changes
    - [x] Subscription
  - [ ] Raft Group
- [ ] Tests
  - [x] State machine framework
  - [x] Leader election
  - [x] Log replication
  - [ ] Snapshot
  - [ ] Membership changes
    - [x] New member
    - [x] Delete member
  - [ ] Safety
  - [ ] Stress and benchmark
  - [ ] Stress + Safety
- [ ] Sharding
  - [x] Consistent hash
- [ ] Reliable data store
  - [x] Client group membership
  - [x] Client group leader election
  - [x] Map
  - [ ] Set
  - [ ] Array
  - [ ] Queue
  - [x] Value
  - [x] Number
  - [ ] Lock
- [ ] Integration (API)
  - [ ] gRPC
- [ ] Utility
  - [x] [Global bindings](https://clojuredocs.org/clojure.core/binding)
  - [x] Consistent hashing
  - [x] Vector clock
--------------------------------------------------------------------------------
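The Objective section of the README notes that users can build their own reliable data structures by implementing state machine commands. A minimal sketch of what that looks like, modeled directly on `src/conshash/weights.rs` above (the `Counter` type and `EXAMPLE_COUNTER_SM` id are illustrative, not part of bifrost):

```rust
use crate::raft::state_machine::StateMachineCtl;
use bifrost_plugins::hash_ident;
use futures::FutureExt;

pub static COUNTER_SM_ID: u64 = hash_ident!(EXAMPLE_COUNTER_SM) as u64;

// Declares the replicated commands/queries; the macro generates the
// StateMachineCmds trait and the client, as it does for Weights.
raft_state_machine! {
    def cmd add(n: u64);
    def qry count() -> u64;
}

pub struct Counter {
    pub count: u64,
}

impl StateMachineCmds for Counter {
    fn add(&mut self, n: u64) -> BoxFuture<()> {
        self.count += n;
        future::ready(()).boxed()
    }
    fn count(&self) -> BoxFuture<u64> {
        future::ready(self.count).boxed()
    }
}

impl StateMachineCtl for Counter {
    raft_sm_complete!();
    fn id(&self) -> u64 {
        COUNTER_SM_ID
    }
    fn snapshot(&self) -> Vec<u8> {
        crate::utils::serde::serialize(&self.count)
    }
    fn recover(&mut self, data: Vec<u8>) -> BoxFuture<()> {
        if let Some(count) = crate::utils::serde::deserialize::<u64>(data.as_slice()) {
            self.count = count;
        }
        future::ready(()).boxed()
    }
    fn recoverable(&self) -> bool {
        true
    }
}
```

Registration then follows the same pattern as `Weights::new_with_id`: `raft_service.register_state_machine(Box::new(Counter { count: 0 })).await`.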
/src/utils/bindings.rs:
--------------------------------------------------------------------------------
use parking_lot::RwLock;
use std::collections::HashMap;
use std::sync::Arc;
use thread_id;

pub struct Binding<T>
where
    T: Clone,
{
    default: T,
    thread_vals: RwLock<HashMap<usize, T>>,
}

impl<T> Binding<T>
where
    T: Clone,
{
    pub fn new(default: T) -> Binding<T> {
        Binding {
            default,
            thread_vals: RwLock::new(HashMap::new()),
        }
    }
    pub fn get(&self) -> T {
        let tid = thread_id::get();
        let thread_map = self.thread_vals.read();
        match thread_map.get(&tid) {
            Some(v) => v.clone(),
            None => self.default.clone(),
        }
    }
    pub fn set(&self, val: T) {
        let tid = thread_id::get();
        let mut thread_map = self.thread_vals.write();
        thread_map.insert(tid, val);
    }
    pub fn del(&self) {
        let tid = thread_id::get();
        let mut thread_map = self.thread_vals.write();
        thread_map.remove(&tid);
    }
}

pub struct RefBinding<T> {
    bind: Binding<Arc<T>>,
}
impl<T> RefBinding<T> {
    pub fn new(default: T) -> RefBinding<T> {
        RefBinding {
            bind: Binding::new(Arc::new(default)),
        }
    }
    pub fn get(&self) -> Arc<T> {
        self.bind.get()
    }
    pub fn set(&self, val: T) {
        self.bind.set(Arc::new(val))
    }
    pub fn del(&self) {
        self.bind.del()
    }
}

#[macro_export]
macro_rules! def_bindings {
    ($(
        bind $bt:ident $name:ident : $t:ty = $def_val:expr;
    )*) => {
        def_bindings! {{$(
            bind $bt $name : $t = $def_val;
        )*}}
    };
    (
        {
            bind val $name:ident : $t:ty = $def_val:expr;
            $( $unexpanded:tt )*
        }
        $( $expanded:tt )*
    ) => {
        def_bindings! {
            { $( $unexpanded )* }
            $( $expanded )*
            bind Binding $name : $t = $def_val;
        }
    };
    (
        {
            bind ref $name:ident : $t:ty = $def_val:expr;
            $( $unexpanded:tt )*
        }
        $( $expanded:tt )*
    ) => {
        def_bindings! {
            { $( $unexpanded )* }
            $( $expanded )*
            bind RefBinding $name : $t = $def_val;
        }
    };
    ({}$(
        bind $bt:ident $name:ident : $t:ty = $def_val:expr;
    )*) =>
    {
        lazy_static! {
            $(
                pub static ref $name : $crate::utils::bindings::$bt<$t> = $crate::utils::bindings::$bt::new($def_val);
            )*
        }
    };
}

#[macro_export]
macro_rules! with_bindings {
    (
        $(
            $bind:path : $val:expr
        ),* =>
        $stat:block
    ) => {
        {
            $(
                $bind.set($val);
            )*
            let r = $stat;
            $(
                $bind.del();
            )*
            r
        }
    };
}
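// Usage sketch (illustrative, mirroring the test module below): def_bindings!
// declares the binding statics, and with_bindings! installs per-thread
// overrides for the duration of a block, restoring the defaults afterwards:
//
//     def_bindings! {
//         bind val TIMEOUT_MS: u64 = 500;
//     }
//     let t = with_bindings!(TIMEOUT_MS: 2000 => { TIMEOUT_MS.get() }); // t == 2000
//
// Note: overrides are keyed by thread id, so they do not follow work that an
// async executor moves across threads.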
#[cfg(test)]
mod struct_test {
    def_bindings! {
        bind val TEST_VAL: u64 = 0;
        bind ref TEST_REF: String = String::from("Hello");
    }
}
--------------------------------------------------------------------------------
/src/membership/member.rs:
--------------------------------------------------------------------------------
use super::client::{MemberClient, ObserverClient};
use super::heartbeat_rpc::*;
use super::raft::client::SMClient;
use bifrost_hasher::hash_str;
use futures::prelude::*;
use std::sync::atomic::{AtomicBool, Ordering};
use std::sync::Arc;
use tokio::{runtime, time};

use crate::membership::DEFAULT_SERVICE_ID;
use crate::raft::client::RaftClient;
use crate::raft::state_machine::master::ExecError;
use crate::raft::RaftService;
use crate::utils::time::get_time;

static PING_INTERVAL: u64 = 500;

pub struct MemberService {
    member_client: MemberClient,
    sm_client: Arc<SMClient>,
    raft_client: Arc<RaftClient>,
    closed: AtomicBool,
    id: u64,
}

impl MemberService {
    pub async fn new(
        server_address: &String,
        raft_client: &Arc<RaftClient>,
        raft_service: &Arc<RaftService>,
    ) -> Arc<MemberService> {
        let server_id = hash_str(server_address);
        let sm_client = Arc::new(SMClient::new(DEFAULT_SERVICE_ID, &raft_client));
        let service = Arc::new(MemberService {
            sm_client: sm_client.clone(),
            member_client: MemberClient {
                id: server_id,
                sm_client: sm_client.clone(),
            },
            raft_client: raft_client.clone(),
            closed: AtomicBool::new(false),
            id: server_id,
        });
        let _join_res = sm_client.join(&server_address).await;
        let service_clone = service.clone();
        raft_service.rt.spawn(async move {
            while !service_clone.closed.load(Ordering::Relaxed) {
                let start_time = get_time();
                let rpc_client = service_clone.raft_client.current_leader_rpc_client().await;
                if let Ok(rpc_client) = rpc_client {
                    let _ping_res =
                        ImmeServiceClient::ping(DEFAULT_SERVICE_ID, &rpc_client, service_clone.id)
                            .await;
                } else {
                    error!("Cannot find RPC client for membership heartbeat to leader");
                }
                let time_now = get_time();
                let elapsed_time = time_now - start_time;
                trace!(
                    "Membership ping at time {}, elapsed {}ms",
                    time_now,
                    elapsed_time
                );
                if (elapsed_time as u64) < PING_INTERVAL {
                    let wait_time = PING_INTERVAL - elapsed_time as u64;
                    trace!("Waiting membership heartbeat for {}ms", wait_time);
                    time::sleep(time::Duration::from_millis(wait_time)).await;
                }
            }
            debug!("Member service closed");
        });
        return service;
    }
    pub fn close(&self) {
        self.closed.store(true, Ordering::Relaxed);
    }
    pub async fn leave(&self) -> Result {
        self.close();
        self.sm_client.leave(&self.id).await
    }
    pub async fn join_group(&self, group: &String) -> Result {
        self.member_client.join_group(group).await
    }
    pub async fn leave_group(&self, group: &String) -> Result {
        self.member_client.leave_group(group).await
    }
    pub fn client(&self) -> ObserverClient {
        ObserverClient::new_from_sm(&self.sm_client)
    }
    pub fn get_server_id(&self) -> u64 {
        self.id
    }
}

impl Drop for MemberService {
    fn drop(&mut self) {
        let sm_client = self.sm_client.clone();
        let self_id = self.id;
        tokio::spawn(async move { sm_client.leave(&self_id).await }.boxed());
    }
}
--------------------------------------------------------------------------------
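A usage sketch for `MemberService` (assumptions: module paths follow the tree above; the address and group name are illustrative; registering the membership state machine with the Raft service — see MEMBERSHIP_GUIDE.md — is omitted here):

```rust
use bifrost::membership::member::MemberService;
use bifrost::raft::client::RaftClient;
use bifrost::raft::{Options, RaftService, Storage, DEFAULT_SERVICE_ID};
use bifrost::rpc::Server;

async fn join_cluster() {
    let addr = String::from("127.0.0.1:9100");
    // Bring up an RPC server hosting a bootstrapped Raft service,
    // following examples/graceful_shutdown.rs.
    let raft_service = RaftService::new(Options {
        storage: Storage::MEMORY,
        address: addr.clone(),
        service_id: DEFAULT_SERVICE_ID,
    });
    let server = Server::new(&addr);
    Server::listen_and_resume(&server).await;
    server.register_service(&raft_service).await;
    RaftService::start(&raft_service, false).await;
    raft_service.bootstrap().await;

    // MemberService::new joins the cluster and spawns the 500ms heartbeat
    // loop shown in member.rs above.
    let raft_client = RaftClient::new(&vec![addr.clone()], DEFAULT_SERVICE_ID)
        .await
        .unwrap();
    let member_service = MemberService::new(&addr, &raft_client, &raft_service).await;
    let _ = member_service.join_group(&String::from("example-group")).await;
}
```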
/src/raft/state_machine/callback/mod.rs:
--------------------------------------------------------------------------------
use bifrost_plugins::hash_ident;
use server::SMCallback;

pub mod client;
pub mod server;
// (raft_sid, sm_id, fn_id, pattern_id)
pub type SubKey = (u64, u64, u64, u64);

pub static DEFAULT_SERVICE_ID: u64 = hash_ident!(BIFROST_RAFT_SM_CALLBACK_DEFAULT_SERVICE) as u64;

service! {
    rpc notify(key: SubKey, data: &Vec<u8>);
}

#[cfg(test)]
mod test {
    use crate::raft::client::RaftClient;
    use crate::raft::state_machine::callback::server::SMCallback;
    use crate::raft::state_machine::StateMachineCtl;
    use crate::raft::{Options, RaftService, Storage, DEFAULT_SERVICE_ID};
    use crate::rpc::Server;
    use crate::utils::time::async_wait_secs;
    use future::FutureExt;
    use std::sync::atomic::*;
    use std::sync::Arc;

    pub struct Trigger {
        count: u64,
        callback: SMCallback,
    }

    raft_state_machine! {
        def cmd trigger();
        def sub on_trigged() -> u64;
    }

    impl StateMachineCmds for Trigger {
        fn trigger(&mut self) -> BoxFuture<()> {
            self.count += 1;
            async move {
                self.callback
                    .notify(commands::on_trigged::new(), self.count)
                    .await
                    .unwrap();
            }
            .boxed()
        }
    }

    impl StateMachineCtl for Trigger {
        raft_sm_complete!();
        fn id(&self) -> u64 {
            10
        }
        fn snapshot(&self) -> Vec<u8> {
            unreachable!()
        }
        fn recover(&mut self, _: Vec<u8>) -> BoxFuture<()> {
            future::ready(()).boxed()
        }
        fn recoverable(&self) -> bool {
            false
        }
    }

    #[tokio::test(flavor = "multi_thread")]
    async fn dummy() {
        let _ = env_logger::try_init();
        info!("TESTING CALLBACK");
        let addr = String::from("127.0.0.1:2110");
        let raft_service = RaftService::new(Options {
            storage: Storage::default(),
            address: addr.clone(),
            service_id: DEFAULT_SERVICE_ID,
        });
        let server = Server::new(&addr);
        let dummy_sm = Trigger {
            count: 0,
            callback: SMCallback::new(10, raft_service.clone()).await,
        };
        let sm_id = dummy_sm.id();
        server.register_service(&raft_service).await;
        Server::listen_and_resume(&server).await;
        RaftService::start(&raft_service, false).await;
        raft_service
            .register_state_machine(Box::new(dummy_sm))
            .await;
        raft_service.bootstrap().await;

        async_wait_secs().await;

        let raft_client = RaftClient::new(&vec![addr], DEFAULT_SERVICE_ID)
            .await
            .unwrap();
        let sm_client = Arc::new(client::SMClient::new(sm_id, &raft_client));
        let loops = 10;
        let counter = Arc::new(AtomicUsize::new(0));
        let counter_clone = counter.clone();
        let sumer = Arc::new(AtomicUsize::new(0));
        let sumer_clone = sumer.clone();
        let mut expected_sum = 0;
        RaftClient::prepare_subscription(&server).await;
        sm_client
            .on_trigged(move |res: u64| {
                counter_clone.fetch_add(1, Ordering::Relaxed);
                sumer_clone.fetch_add(res as usize, Ordering::Relaxed);
                info!("CALLBACK TRIGGERED {}", res);
                future::ready(()).boxed()
            })
            .await
            .unwrap()
            .unwrap();

        for i in 0..loops {
            let sm_client = sm_client.clone();
            expected_sum += i + 1;
            tokio::spawn(async move {
                sm_client.trigger().await.unwrap();
            });
        }

        async_wait_secs().await;

        assert_eq!(counter.load(Ordering::Relaxed), loops);
        assert_eq!(sumer.load(Ordering::Relaxed), expected_sum);
    }
}
--------------------------------------------------------------------------------
/src/proc_macro/src/lib.rs:
--------------------------------------------------------------------------------
use syn::{Ident, Type, parse_macro_input, punctuated::Punctuated, Token, TypeTuple, TypeReference, parse::{ParseStream, ParseBuffer, Parse}, Result, FnArg, Pat, PatType, ItemTrait, TraitItem, TraitItemFn, Lifetime};
use quote::quote;
use proc_macro::TokenStream;

struct Args {
    args: Punctuated<FnArg, Token![,]>,
}

impl Parse for Args {
    fn parse(input: ParseStream) -> Result<Args> {
        let args = Punctuated::parse_terminated(input)?;
        Ok(Args { args })
    }
}

#[proc_macro]
pub fn adjust_caller_identifiers(input: TokenStream) -> TokenStream {
    let input = syn::parse_macro_input!(input as Args);
    let output = input.args.into_iter().map(|arg| {
        match arg {
            FnArg::Typed(pat_type) => {
                let pat = &*pat_type.pat;
                let ty = &*pat_type.ty;

                match (&pat, ty) {
                    (Pat::Ident(pat_ident), Type::Reference(_)) => {
                        let ident = &pat_ident.ident;
                        quote! { ref #ident }
                    },
                    (Pat::Ident(pat_ident), Type::Group(group)) => {
                        let ident = &pat_ident.ident;
                        if let Type::Reference(_) = &*group.elem {
                            quote! { ref #ident }
                        } else {
                            quote! { #ident }
                        }
                    },
                    (Pat::Ident(pat_ident), _) => {
                        let ident = &pat_ident.ident;
                        quote! { #ident }
                    },
                    _ => panic!("Unsupported pattern!"),
                }
            },
            _ => panic!("Variadic arguments are not supported!"),
        }
    }).collect::<Vec<_>>();

    quote! {
        ( #(#output),* )
    }.into()
}

#[proc_macro]
pub fn adjust_function_signature(input: TokenStream) -> TokenStream {
    let input = parse_macro_input!(input as TraitItemFn);
    let mut output_trait_fn = input.clone();
    let sig = &mut output_trait_fn.sig;
    // eprintln!("Adjust {:?}", sig);
    for input in &mut sig.inputs {
        match input {
            FnArg::Typed(pat_type) => {
                // eprintln!("Checking lifetime {:?}", pat_type);
                match *pat_type.ty {
                    Type::Reference(ref mut ref_type) => {
                        if ref_type.lifetime.is_none() {
                            ref_type.lifetime = Some(Lifetime::new("'a", proc_macro2::Span::call_site()));
                            //eprintln!("Assigning lifetime {:?}", ref_type);
                        }
                    },
                    Type::Group(ref mut group) => {
                        if let Type::Reference(ref mut ref_type) = &mut *group.elem {
                            if ref_type.lifetime.is_none() {
                                ref_type.lifetime = Some(Lifetime::new("'a", proc_macro2::Span::call_site()));
                                //eprintln!("Assigning lifetime {:?}", ref_type);
                            }
                        }
                    }
                    _ => {}
                }
            },
            FnArg::Receiver(ref mut receiver) => {
                if let &mut Some((_, ref mut lifetime)) = &mut receiver.reference {
                    if lifetime.is_none() {
                        *lifetime = Some(Lifetime::new("'a", proc_macro2::Span::call_site()));
                    }
                }
            }
            _ => {}
        }
    }

    quote!(#output_trait_fn).into()
}


#[proc_macro]
pub fn deref_tuple_types(input: TokenStream) -> TokenStream {
    let input = parse_macro_input!(input as TypeTuple);

    let transformed_types: Vec<_> = input.elems.into_iter().map(|ty| {
        match ty {
            Type::Reference(TypeReference { elem, .. }) => *elem,
            Type::Group(group) => {
                if let Type::Reference(TypeReference { elem, .. }) = *group.elem {
                    *elem
                } else {
                    Type::Group(group)
                }
            },
            other => other,
        }
    }).collect();

    let tokens = quote! { (#(#transformed_types),*) };
    tokens.into()
}
--------------------------------------------------------------------------------
/src/tcp/client.rs:
--------------------------------------------------------------------------------
use std::sync::Arc;
use std::time::Duration;

use crate::tcp::{shortcut, STANDALONE_ADDRESS};
use crate::DISABLE_SHORTCUT;
use bifrost_hasher::hash_str;

use crate::tcp::server::TcpReq;
use async_std::sync::Mutex;
use bytes::{Buf, BufMut, Bytes, BytesMut};
use futures::prelude::*;
use futures::stream::SplitSink;
use futures::SinkExt;
use parking_lot::Mutex as SyncMutex;
use std::collections::HashMap;
use std::sync::atomic::AtomicU64;
use std::sync::atomic::Ordering::Relaxed;
use tokio::io;
use tokio::net::TcpStream;
use tokio::sync::oneshot;
use tokio::time;
use tokio_util::codec::{Framed, LengthDelimitedCodec};

pub struct Client {
    //client: Option>,
    client: Option<Mutex<SplitSink<Framed<TcpStream, LengthDelimitedCodec>, Bytes>>>,
    msg_counter: AtomicU64,
    senders: Arc<SyncMutex<HashMap<u64, oneshot::Sender<BytesMut>>>>,
    timeout: Duration,
    pub server_id: u64,
}

impl Client {
    pub async fn connect_with_timeout(address: &String, timeout: Duration) -> io::Result<Client> {
        let server_id = hash_str(address);
        let senders = Arc::new(SyncMutex::new(HashMap::<u64, oneshot::Sender<BytesMut>>::new()));
        debug!(
            "TCP connect to {}, server id {}, timeout {}ms",
            address,
            server_id,
            timeout.as_millis()
        );
        let client = {
            if !DISABLE_SHORTCUT && shortcut::is_local(server_id).await {
                debug!("Local connection, using shortcut");
                None
            } else {
                if address.eq(&STANDALONE_ADDRESS) {
                    return Err(io::Error::new(
                        io::ErrorKind::Other,
                        "STANDALONE server is not found",
                    ));
                }
                debug!("Create socket on {}", address);
                let socket = time::timeout(timeout, TcpStream::connect(address)).await??;
                let transport = Framed::new(socket, LengthDelimitedCodec::new());
                let (writer, mut reader) = transport.split();
                let cloned_senders = senders.clone();
                debug!("Streaming messages for {}", address);
                let address = address.clone();
                tokio::spawn(async move {
                    while let Some(res) = reader.next().await {
                        if let Ok(mut data) = res {
                            let res_msg_id = data.get_u64_le();
                            trace!("Received msg for {}, size {}", res_msg_id, data.len());
                            let mut senders = cloned_senders.lock();
                            if let Some(sender) = senders.remove(&res_msg_id) {
                                if let Err(e) = sender.send(data) {
                                    error!("Failed to send response for msg {}: {:?}", res_msg_id, e);
                                }
                            } else {
                                error!("No sender found for response msg {}", res_msg_id);
                            }
                        }
                    }
                    debug!("Stream from TCP server {} broken", address);
                });
                Some(Mutex::new(writer))
            }
        };
        Ok(Client {
            client,
            server_id,
            senders,
            timeout,
            msg_counter: AtomicU64::new(0),
        })
    }
    pub async fn connect(address: &String) -> io::Result<Client> {
        Client::connect_with_timeout(address, Duration::from_secs(2)).await
    }
    pub async fn send_msg(&self, msg: TcpReq) -> io::Result<BytesMut> {
        if let Some(ref transport) = self.client {
            let msg_id = self.msg_counter.fetch_add(1, Relaxed);
            let mut frame = BytesMut::with_capacity(8 + msg.len());
            let rx = {
                frame.put_u64_le(msg_id);
                frame.extend_from_slice(msg.as_ref());
                let (tx, rx) = oneshot::channel();
                let mut senders = self.senders.lock();
                senders.insert(msg_id, tx);
                rx
            };
            trace!("Sending msg {}, size {}", msg_id, frame.len());
            time::timeout(self.timeout, transport.lock().await.send(frame.freeze())).await??;
            trace!("Sent msg {}", msg_id);
            match time::timeout(self.timeout, rx).await? {
                Ok(response) => Ok(response),
                Err(e) => {
                    error!("Failed to receive response for msg {}: {:?}", msg_id, e);
                    Err(std::io::Error::new(std::io::ErrorKind::BrokenPipe, "Response channel closed"))
                }
            }
        } else {
            Ok(shortcut::call(self.server_id, msg).await?)
        }
    }
}

unsafe impl Send for Client {}
--------------------------------------------------------------------------------
/src/tcp/server.rs:
--------------------------------------------------------------------------------
use super::STANDALONE_ADDRESS;
use crate::tcp::shortcut;
use bytes::{Buf, BufMut, BytesMut};
use futures::SinkExt;
use std::error::Error;
use std::future::Future;
use std::pin::Pin;
use std::sync::Arc;
use tokio::net::TcpListener;
use tokio::sync::broadcast;
use tokio_stream::StreamExt;
use tokio_util::codec::{Framed, LengthDelimitedCodec};

pub type RPCFuture = dyn Future;
pub type BoxedRPCFuture = Box<RPCFuture>;
pub type TcpReq = BytesMut;
pub type TcpRes = Pin<Box<dyn Future<Output = BytesMut> + Send>>;

pub struct Server {
    shutdown_tx: broadcast::Sender<()>,
}

impl Server {
    pub fn new() -> Server {
        let (shutdown_tx, _) = broadcast::channel(1);
        Server { shutdown_tx }
    }

    pub fn shutdown_handle(&self) -> broadcast::Sender<()> {
        self.shutdown_tx.clone()
    }

    pub async fn listen(
        &self,
        addr: &String,
        callback: Arc<dyn Fn(TcpReq) -> TcpRes + Send + Sync>,
    ) -> Result<(), Box<dyn Error>> {
        shortcut::register_server(addr, &callback).await;
        if !addr.eq(&STANDALONE_ADDRESS) {
            let listener = TcpListener::bind(&addr).await?;
            let mut shutdown_rx = self.shutdown_tx.subscribe();

            info!("TCP server listening on {}", addr);

            loop {
                tokio::select! {
                    accept_result = listener.accept() => {
                        match accept_result {
                            Ok((socket, addr)) => {
                                debug!("Accepted connection from {}", addr);
                                let callback = callback.clone();
                                let mut conn_shutdown_rx = self.shutdown_tx.subscribe();

                                tokio::spawn(async move {
                                    let mut transport = Framed::new(socket, LengthDelimitedCodec::new());
                                    loop {
                                        tokio::select! {
                                            result = transport.next() => {
                                                match result {
                                                    Some(Ok(mut data)) => {
                                                        let msg_id = data.get_u64_le();
                                                        let call_back_data = callback(data).await;
                                                        let mut res =
                                                            BytesMut::with_capacity(8 + call_back_data.len());
                                                        res.put_u64_le(msg_id);
                                                        res.extend_from_slice(call_back_data.as_ref());
                                                        if let Err(e) = transport.send(res.freeze()).await {
                                                            error!("Error on TCP callback {:?}", e);
                                                            break;
                                                        }
                                                    }
                                                    Some(Err(e)) => {
                                                        error!("error on decoding from socket; error = {:?}", e);
                                                        break;
                                                    }
                                                    None => {
                                                        debug!("Connection closed by client");
                                                        break;
                                                    }
                                                }
                                            }
                                            _ = conn_shutdown_rx.recv() => {
                                                info!("Connection handler received shutdown signal");
                                                break;
                                            }
                                        }
                                    }
                                    // The connection will be closed at this point
                                });
                            }
                            Err(e) => error!("error accepting socket; error = {:?}", e),
                        }
                    }
                    _ = shutdown_rx.recv() => {
                        info!("TCP server on {} received shutdown signal, stopping accept loop", addr);
                        break;
                    }
                }
            }
        }
        info!("TCP server on {} shut down gracefully", addr);
        Ok(())
    }

    pub fn shutdown(&self) {
        info!("Initiating TCP server shutdown");
        let _ = self.shutdown_tx.send(());
    }
}
--------------------------------------------------------------------------------
/src/raft/state_machine/configs.rs:
--------------------------------------------------------------------------------
use crate::raft::state_machine::callback::server::Subscriptions;
use crate::raft::state_machine::callback::SubKey;
use crate::raft::state_machine::StateMachineCtl;
use crate::raft::AsyncServiceClient;
use crate::rpc::{self, ServiceClient};
use async_std::sync::*;
use bifrost_hasher::hash_str;
use futures::FutureExt;
use serde::{Deserialize, Serialize};
use std::collections::{HashMap, HashSet};
use std::sync::Arc;

pub const CONFIG_SM_ID: u64 = 1;

#[derive(Clone)]
pub struct RaftMember {
    pub rpc: Arc<AsyncServiceClient>,
    pub address: String,
    pub id: u64,
}

pub struct Configures {
    pub members: HashMap<u64, RaftMember>,
    // keep it in arc lock for reference in callback server.rs
    pub subscriptions: Arc<RwLock<Subscriptions>>,
    service_id: u64,
}

pub type MemberConfigSnapshot = HashSet<String>;

#[derive(Serialize, Deserialize, Debug)]
pub struct ConfigSnapshot {
    members: MemberConfigSnapshot,
    //TODO: snapshot for subscriptions
}

raft_state_machine! {
    def cmd new_member_(address: String) -> bool;
    def cmd del_member_(address: String);
    def qry member_address() -> Vec<String>;

    def cmd subscribe(key: SubKey, address: String, session_id: u64) -> Result;
    def cmd unsubscribe(sub_id: u64);
}

impl StateMachineCmds for Configures {
    fn new_member_(&mut self, address: String) -> BoxFuture<bool> {
        async move {
            let addr = address.clone();
            let id = hash_str(&addr);
            if !self.members.contains_key(&id) {
                match rpc::DEFAULT_CLIENT_POOL.get(&address).await {
                    Ok(client) => {
                        self.members.insert(
                            id,
                            RaftMember {
                                rpc: AsyncServiceClient::new_with_service_id(
                                    self.service_id,
                                    &client,
                                ),
                                address,
                                id,
                            },
                        );
                        return true;
                    }
                    Err(_) => {}
                }
            }
            false
        }
        .boxed()
    }
    fn del_member_(&mut self, address: String) -> BoxFuture<()> {
        async move {
            let hash = hash_str(&address);
            self.members.remove(&hash);
        }
        .boxed()
    }
    fn member_address(&self) -> BoxFuture<Vec<String>> {
        future::ready(self.members.values().map(|m| m.address.clone()).collect()).boxed()
    }
    fn subscribe(
        &mut self,
        key: SubKey,
        address: String,
        session_id: u64,
    ) -> BoxFuture> {
        async move {
            let mut subs = self.subscriptions.write().await;
            subs.subscribe(key, &address, session_id).await
        }
        .boxed()
    }
    fn unsubscribe(&mut self, sub_id: u64) -> BoxFuture<()> {
        async move {
            let mut subs = self.subscriptions.write().await;
            subs.remove_subscription(sub_id);
        }
        .boxed()
    }
}

impl StateMachineCtl for Configures {
    raft_sm_complete!();
    fn id(&self) -> u64 {
        CONFIG_SM_ID
    }
    fn snapshot(&self) -> Vec<u8> {
        let mut snapshot = ConfigSnapshot {
            members: HashSet::with_capacity(self.members.len()),
        };
        for (_, member) in self.members.iter() {
            snapshot.members.insert(member.address.clone());
        }
        crate::utils::serde::serialize(&snapshot)
    }
    fn recover(&mut self, data: Vec<u8>) -> BoxFuture<()> {
        match crate::utils::serde::deserialize::<ConfigSnapshot>(&data) {
            Some(snapshot) => self.recover_members(snapshot.members).boxed(),
            None => {
                error!("Failed to deserialize config state machine snapshot. Config recovery failed.");
                // Return empty future - state machine will start with empty config
                future::ready(()).boxed()
            }
        }
    }
    fn recoverable(&self) -> bool {
        true
    }
}

impl Configures {
    pub fn new(service_id: u64) -> Configures {
        Configures {
            members: HashMap::new(),
            service_id,
            subscriptions: Arc::new(RwLock::new(Subscriptions::new())),
        }
    }
    async fn recover_members(&mut self, snapshot: MemberConfigSnapshot) {
        let mut curr_members: MemberConfigSnapshot = HashSet::with_capacity(self.members.len());
        for (_, member) in self.members.iter() {
            curr_members.insert(member.address.clone());
        }
        let to_del = curr_members.difference(&snapshot);
        let to_add = snapshot.difference(&curr_members);
        for addr in to_del {
            self.del_member(addr.clone()).await;
        }
        for addr in to_add {
            self.new_member(addr.clone()).await;
        }
    }
    pub async fn new_member(&mut self, address: String) -> bool {
        self.new_member_(address).await
    }
    pub async fn del_member(&mut self, address: String) {
        self.del_member_(address).await
    }
    pub fn member_existed(&self, id: u64) -> bool {
        self.members.contains_key(&id)
    }
}
--------------------------------------------------------------------------------
/src/membership/client.rs:
--------------------------------------------------------------------------------
use crate::membership::raft::client::SMClient;
use crate::membership::DEFAULT_SERVICE_ID;
use crate::raft::client::{RaftClient, SubscriptionError, SubscriptionReceipt};
use crate::raft::state_machine::master::ExecError;
use bifrost_hasher::hash_str;
use futures::future::BoxFuture;
use serde::{Deserialize, Serialize};
use std::collections::BTreeMap;
use std::sync::Arc;

use super::server::MemberGroup;

#[derive(Serialize, Deserialize, Debug, Clone)]
pub struct Member {
    pub id: u64,
    pub address: String,
    pub online: bool,
}

#[derive(Serialize, Deserialize, Debug, Clone)]
pub struct Group {
    pub id: u64,
    pub name: String,
    pub members: u64,
}

pub struct MemberClient {
    pub id: u64,
    pub sm_client: Arc<SMClient>,
}

impl MemberClient {
    pub async fn join_group(&self, group: &String) -> Result {
        self.sm_client.join_group(group, &self.id).await
    }
    pub async fn leave_group(&self, group: &String) -> Result {
        self.sm_client.leave_group(&hash_str(group), &self.id).await
    }
}

pub struct ObserverClient {
    pub sm_client: Arc<SMClient>,
}

impl ObserverClient {
    pub fn new(raft_client: &Arc<RaftClient>) -> ObserverClient {
        ObserverClient {
            sm_client: Arc::new(SMClient::new(DEFAULT_SERVICE_ID, &raft_client)),
        }
    }
    pub fn new_from_sm(sm_client: &Arc<SMClient>) -> ObserverClient {
        ObserverClient {
            sm_client: sm_client.clone(),
        }
    }
    pub async fn new_group(&self, name: &String) -> Result, ExecError> {
        self.sm_client.new_group(name).await
    }
    pub async fn del_group(&self, name: &String) -> Result {
        self.sm_client.del_group(&hash_str(name)).await
    }
    pub async fn group_leader(
        &self,
        group: &String,
    ) -> Result<Option<(Option<Member>, u64)>, ExecError> {
        self.sm_client.group_leader(&hash_str(group)).await
    }
    pub async fn group_members(
        &self,
        group: &String,
        online_only: bool,
    ) -> Result<Option<(Vec<Member>, u64)>, ExecError> {
        self.sm_client
            .group_members(&hash_str(group), &online_only)
            .await
    }
    pub async fn all_members(&self, online_only: bool) -> Result<(Vec<Member>, u64), ExecError> {
        self.sm_client.all_members(&online_only).await
    }
    pub async fn on_group_member_offline<F>(
        &self,
        f: F,
        group: &str,
    ) -> Result<Result<SubscriptionReceipt, SubscriptionError>, ExecError>
    where
        F: Fn((Member, u64)) -> BoxFuture<'static, ()> + 'static + Send + Sync,
    {
        self.sm_client
            .on_group_member_offline(f, &hash_str(group))
            .await
    }
    pub async fn on_any_member_offline<F>(
        &self,
        f: F,
    ) -> Result<Result<SubscriptionReceipt, SubscriptionError>, ExecError>
    where
        F: Fn((Member, u64)) -> BoxFuture<'static, ()> + 'static + Send + Sync,
    {
        self.sm_client.on_any_member_offline(f).await
    }
    pub async fn on_group_member_online<F>(
        &self,
        f: F,
        group: &str,
    ) -> Result<Result<SubscriptionReceipt, SubscriptionError>, ExecError>
    where
        F: Fn((Member, u64)) -> BoxFuture<'static, ()> + 'static + Send + Sync,
    {
        self.sm_client
            .on_group_member_online(f, &hash_str(group))
            .await
    }
    pub async fn on_any_member_online<F>(
        &self,
        f: F,
    ) -> Result<Result<SubscriptionReceipt, SubscriptionError>, ExecError>
    where
        F: Fn((Member, u64)) -> BoxFuture<'static, ()> + 'static + Send + Sync,
    {
        self.sm_client.on_any_member_online(f).await
    }
    pub async fn on_group_member_joined<F>(
        &self,
        f: F,
        group: &str,
    ) -> Result<Result<SubscriptionReceipt, SubscriptionError>, ExecError>
    where
        F: Fn((Member, u64)) -> BoxFuture<'static, ()> + 'static + Send + Sync,
    {
        self.sm_client
            .on_group_member_joined(f, &hash_str(group))
            .await
    }
    pub async fn on_any_member_joined<F>(
        &self,
        f: F,
    ) -> Result<Result<SubscriptionReceipt, SubscriptionError>, ExecError>
    where
        F: Fn((Member, u64)) -> BoxFuture<'static, ()> + 'static + Send + Sync,
    {
        self.sm_client.on_any_member_joined(f).await
    }
    pub async fn on_group_member_left<F>(
        &self,
        f: F,
        group: &str,
    ) -> Result<Result<SubscriptionReceipt, SubscriptionError>, ExecError>
    where
        F: Fn((Member, u64)) -> BoxFuture<'static, ()> + 'static + Send + Sync,
    {
        self.sm_client
            .on_group_member_left(f, &hash_str(group))
            .await
    }
    pub async fn on_any_member_left<F>(
        &self,
        f: F,
    ) -> Result<Result<SubscriptionReceipt, SubscriptionError>, ExecError>
    where
        F: Fn((Member, u64)) -> BoxFuture<'static, ()> + 'static + Send + Sync,
    {
        self.sm_client.on_any_member_left(f).await
    }
    pub async fn on_group_leader_changed<F>(
        &self,
        f: F,
        group: &String,
    ) -> Result<Result<SubscriptionReceipt, SubscriptionError>, ExecError>
    where
        F: Fn((Option<Member>, Option<Member>, u64)) -> BoxFuture<'static, ()>
            + 'static
            + Send
            + Sync,
    {
        self.sm_client
            .on_group_leader_changed(f, &hash_str(group))
            .await
    }
    pub async fn all_groups(&self) -> Result<Vec<Group>, ExecError> {
        self.sm_client.all_groups().await
    }
}
--------------------------------------------------------------------------------
/RECOVERY_IMPROVEMENTS.md:
--------------------------------------------------------------------------------
# Node Recovery and Temporary Failure Handling Improvements

## Overview

This document describes the improvements made to handle nodes that temporarily miss heartbeats because they are under load, ensuring they can properly recover and rejoin the cluster either as a leader or as a follower.
6 | 7 | ## Problem Statement 8 | 9 | When running under heavy load, nodes may temporarily fail to respond to heartbeats, leading to: 10 | 11 | 1. **Premature offline marking**: Nodes marked offline after a single timeout 12 | 2. **Leadership churn**: Rapid leadership changes causing instability 13 | 3. **Flapping**: Nodes bouncing between online/offline states 14 | 4. **Panic on errors**: Unwrap() calls causing crashes during transient failures 15 | 16 | ## Solutions Implemented 17 | 18 | ### 1. Grace Period with Consecutive Failure Tracking 19 | 20 | **New Configuration Constants:** 21 | ```rust 22 | static MAX_TIMEOUT: i64 = 10_000; // 10 seconds before considering potentially offline 23 | static OFFLINE_GRACE_CHECKS: u8 = 3; // Require 3 consecutive failures before marking offline 24 | static ONLINE_GRACE_CHECKS: u8 = 2; // Require 2 consecutive successes before marking online 25 | static MIN_STATE_CHANGE_INTERVAL: i64 = 5_000; // Minimum 5 seconds between state changes 26 | ``` 27 | 28 | **Benefits:** 29 | - **Resilience**: Tolerates temporary hiccups (up to 3 timeout checks × 500ms = ~1.5 seconds grace) 30 | - **Anti-flapping**: Minimum 5 second interval prevents rapid state oscillation 31 | - **Smooth recovery**: Requires 2 consecutive successful heartbeats before marking node back online 32 | 33 | ### 2. Enhanced HeartbeatStatus Tracking 34 | 35 | **New HBStatus Fields:** 36 | ```rust 37 | struct HBStatus { 38 | last_updated: i64, 39 | online: bool, 40 | consecutive_failures: u8, // Count of consecutive timeout checks 41 | consecutive_successes: u8, // Count of consecutive successful checks 42 | last_state_change: i64, // Timestamp of last state transition 43 | } 44 | ``` 45 | 46 | **Behavior:** 47 | - **Online → Offline**: Tracks consecutive timeouts, only transitions after reaching threshold AND minimum interval 48 | - **Offline → Online**: Tracks consecutive successful heartbeats, transitions after reaching threshold AND minimum interval 49 | - **Stable states**: Resets counters when nodes are consistently responsive 50 | 51 | ### 3. Improved Error Handling 52 | 53 | All `.unwrap()` calls replaced with proper error handling: 54 | 55 | **Fixed Functions:** 56 | - `compose_client_member`: Now returns `Option` instead of panicking 57 | - `group_leader_candidate_available`: Logs errors instead of panicking 58 | - `group_leader_candidate_unavailable`: Handles all failure cases gracefully 59 | - `notify_for_member_*`: Early returns with error logging on failures 60 | - Mutex lock failures: Gracefully handled with error logging 61 | 62 | **Result:** 63 | - No more panics during transient failures 64 | - Clear error logs for debugging 65 | - System continues operating even when individual operations fail 66 | 67 | ### 4. Leadership Transfer Grace Period 68 | 69 | When leadership transfers (e.g., during reelection): 70 | ```rust 71 | async fn transfer_leadership(&self) { 72 | // Give all online members fresh timestamps 73 | // Reset all failure/success counters 74 | // Prevents immediate timeout after leadership change 75 | } 76 | ``` 77 | 78 | **Benefits:** 79 | - New leader gets time to stabilize before checking heartbeats 80 | - Prevents cascading failures during leadership transitions 81 | - All members get a "fresh start" under new leadership 82 | 83 | ## Recovery Scenarios 84 | 85 | ### Scenario 1: Node Under Temporary Load 86 | 87 | **Timeline:** 88 | 1. Node A is leader and becomes overloaded 89 | 2. Misses heartbeat at T+10s (consecutive_failures = 1) 90 | 3. 
Misses heartbeat at T+10.5s (consecutive_failures = 2) 91 | 4. Misses heartbeat at T+11s (consecutive_failures = 3) 92 | 5. **Now marked offline** (after 3 consecutive failures) 93 | 6. Leadership election: Node B becomes leader 94 | 7. Node A recovers, starts sending heartbeats again 95 | 8. Receives heartbeat at T+15s (consecutive_successes = 1) 96 | 9. Receives heartbeat at T+15.5s (consecutive_successes = 2) 97 | 10. **Marked back online** (after 2 consecutive successes AND 5s minimum interval) 98 | 11. Node A becomes follower of Node B 99 | 100 | **Key Points:** 101 | - ~1.5 second tolerance before marking offline (3 × 500ms checks) 102 | - Minimum 5 second offline period (anti-flapping protection) 103 | - Node A does NOT automatically reclaim leadership (stability) 104 | - Node A properly syncs as follower under Node B 105 | 106 | ### Scenario 2: Brief Network Hiccup 107 | 108 | **Timeline:** 109 | 1. Node experiences single timeout (consecutive_failures = 1) 110 | 2. Next heartbeat succeeds (consecutive_failures reset to 0) 111 | 3. **Node remains online** - no state change 112 | 113 | **Key Points:** 114 | - Single hiccups don't trigger state changes 115 | - Prevents unnecessary leadership elections 116 | - Maintains cluster stability 117 | 118 | ### Scenario 3: Persistent Failure 119 | 120 | **Timeline:** 121 | 1. Node genuinely fails (hardware/crash) 122 | 2. Consecutive failures accumulate: 1, 2, 3 123 | 3. Marked offline after 3 checks 124 | 4. Leadership transfers to healthy node 125 | 5. Eventually removed from cluster if doesn't recover 126 | 127 | **Key Points:** 128 | - Real failures still detected quickly (~1.5 seconds) 129 | - System continues with remaining healthy nodes 130 | - No false positives from temporary load 131 | 132 | ## Monitoring and Observability 133 | 134 | ### New Log Messages 135 | 136 | **During failure detection:** 137 | ``` 138 | DEBUG: Member 12345 timeout check 1/3 (10500ms since last update, 2000ms since last state change) 139 | DEBUG: Member 12345 timeout check 2/3 (11000ms since last update, 2500ms since last state change) 140 | WARN: Marking member 12345 as offline after 3 consecutive timeout checks (11500ms since last update) 141 | ``` 142 | 143 | **During recovery:** 144 | ``` 145 | DEBUG: Member 12345 recovery check 1/2 (3000ms since last state change) 146 | INFO: Marking member 12345 as back online after 2 consecutive successful checks 147 | ``` 148 | 149 | **Error scenarios:** 150 | ``` 151 | ERROR: Failed to change leader for group 789 to member 12345 152 | ERROR: Failed to find online member for group 789 after member 12345 became unavailable 153 | ERROR: Failed to compose client member 12345 for online notification 154 | ``` 155 | 156 | ## Configuration Tuning 157 | 158 | You can adjust these constants based on your needs: 159 | 160 | - **Increase `OFFLINE_GRACE_CHECKS`**: More tolerance for slow responses (longer detection time) 161 | - **Decrease `OFFLINE_GRACE_CHECKS`**: Faster failure detection (less tolerance) 162 | - **Increase `MIN_STATE_CHANGE_INTERVAL`**: More aggressive anti-flapping (longer recovery time) 163 | - **Decrease `MIN_STATE_CHANGE_INTERVAL`**: Faster recovery (more risk of flapping) 164 | - **Increase `MAX_TIMEOUT`**: More lenient heartbeat requirements 165 | - **Decrease `MAX_TIMEOUT`**: Stricter heartbeat requirements 166 | 167 | ## Testing Recommendations 168 | 169 | 1. **Load testing**: Verify nodes can recover under realistic load 170 | 2. **Network partition**: Test with simulated network splits 171 | 3. 
**Chaos testing**: Randomly kill/restart nodes to test recovery paths 172 | 4. **Long-running stability**: Monitor for log growth and state flapping 173 | 174 | ## Backward Compatibility 175 | 176 | All changes are backward compatible: 177 | - Wire protocol unchanged 178 | - State machine behavior unchanged (only timing/resilience improved) 179 | - Existing clusters will benefit immediately upon upgrade 180 | 181 | ## Performance Impact 182 | 183 | - **Minimal CPU overhead**: Simple counter increments 184 | - **Minimal memory overhead**: 3 extra bytes per member (2 u8 counters + i64 timestamp) 185 | - **Reduced network churn**: Fewer unnecessary state changes = less Raft log entries 186 | - **Improved stability**: Less leadership churn = better overall performance 187 | 188 | ## Future Enhancements 189 | 190 | Potential future improvements: 191 | 1. **Configurable parameters**: Make timeouts/thresholds runtime-configurable 192 | 2. **Adaptive timeouts**: Adjust based on observed network latency 193 | 3. **Priority-based leader election**: Prefer certain nodes as leaders 194 | 4. **Health scoring**: Multi-factor health beyond just heartbeats 195 | 5. **Metrics export**: Prometheus/OpenTelemetry integration for monitoring 196 | 197 | -------------------------------------------------------------------------------- /src/raft/state_machine/master.rs: -------------------------------------------------------------------------------- 1 | use self::configs::{Configures, RaftMember, CONFIG_SM_ID}; 2 | use super::super::*; 3 | use super::*; 4 | use std::collections::HashMap; 5 | use std::error::Error; 6 | use std::fmt; 7 | use std::fmt::Display; 8 | use std::fmt::Formatter; 9 | 10 | #[derive(Serialize, Deserialize, Debug, Clone)] 11 | pub enum ExecError { 12 | SmNotFound(u64), 13 | FnNotFound(u64, u64), // (sm_id, fn_id) 14 | ServersUnreachable, 15 | CannotConstructClient, 16 | NotCommitted, 17 | Unknown, 18 | TooManyRetry, 19 | } 20 | 21 | pub enum RegisterResult { 22 | OK, 23 | EXISTED, 24 | RESERVED, 25 | } 26 | 27 | pub type ExecOk = Vec; 28 | pub type ExecResult = Result; 29 | pub type SubStateMachine = Box; 30 | pub type SnapshotDataItem = (u64, Vec); 31 | pub type SnapshotDataItems = Vec; 32 | 33 | raft_state_machine! {} 34 | 35 | pub struct MasterStateMachine { 36 | subs: HashMap, 37 | snapshots: HashMap>, 38 | pub configs: Configures, 39 | } 40 | 41 | impl StateMachineCmds for MasterStateMachine {} 42 | 43 | impl StateMachineCtl for MasterStateMachine { 44 | raft_sm_complete!(); 45 | fn id(&self) -> u64 { 46 | 0 47 | } 48 | fn snapshot(&self) -> Vec { 49 | let mut sms: SnapshotDataItems = Vec::with_capacity(self.subs.len()); 50 | for (sm_id, smc) in self.subs.iter() { 51 | if !smc.recoverable() { 52 | continue; 53 | } 54 | let sub_snapshot = smc.snapshot(); 55 | sms.push((*sm_id, sub_snapshot)); 56 | } 57 | sms.push((self.configs.id(), self.configs.snapshot())); 58 | let data = crate::utils::serde::serialize(&sms); 59 | data 60 | } 61 | fn recover(&mut self, data: Vec) -> BoxFuture<()> { 62 | match crate::utils::serde::deserialize::(data.as_slice()) { 63 | Some(sms) => { 64 | for (sm_id, snapshot) in sms { 65 | self.snapshots.insert(sm_id, snapshot); 66 | } 67 | } 68 | None => { 69 | error!("Failed to deserialize master state machine snapshot. 
State machine recovery failed."); 70 | // Clear snapshots to start fresh - this is safer than leaving corrupted state 71 | self.snapshots.clear(); 72 | } 73 | } 74 | future::ready(()).boxed() 75 | } 76 | fn recoverable(&self) -> bool { 77 | true 78 | } 79 | } 80 | 81 | pub fn parse_output<'a>(r: Option>) -> ExecResult { 82 | if let Some(d) = r { 83 | Ok(d) 84 | } else { 85 | // Caller will wrap with correct (sm_id, fn_id); default to (0,0) if unknown 86 | Err(ExecError::FnNotFound(0, 0)) 87 | } 88 | } 89 | 90 | impl MasterStateMachine { 91 | pub fn new(service_id: u64) -> MasterStateMachine { 92 | let msm = MasterStateMachine { 93 | subs: HashMap::new(), 94 | snapshots: HashMap::new(), 95 | configs: Configures::new(service_id), 96 | }; 97 | msm 98 | } 99 | 100 | /// Whether a given state machine id should be persisted/recovered. 101 | pub fn is_recoverable(&self, sm_id: u64) -> bool { 102 | if sm_id == CONFIG_SM_ID { 103 | return self.configs.recoverable(); 104 | } 105 | if let Some(sm) = self.subs.get(&sm_id) { 106 | return sm.recoverable(); 107 | } 108 | // Default to true if SM is not yet registered so we don't skip WAL 109 | true 110 | } 111 | 112 | pub fn register(&mut self, mut smc: SubStateMachine) -> RegisterResult { 113 | let id = smc.id(); 114 | if id < 2 { 115 | return RegisterResult::RESERVED; 116 | } 117 | if self.subs.contains_key(&id) { 118 | return RegisterResult::EXISTED; 119 | }; 120 | if let Some(snapshot) = self.snapshots.remove(&id) { 121 | smc.recover(snapshot); 122 | } 123 | self.subs.insert(id, smc); 124 | RegisterResult::OK 125 | } 126 | 127 | pub fn members(&self) -> &HashMap { 128 | &self.configs.members 129 | } 130 | 131 | pub async fn commit_cmd(&mut self, entry: &LogEntry) -> ExecResult { 132 | match entry.sm_id { 133 | CONFIG_SM_ID => { 134 | let out = self.configs.fn_dispatch_cmd(entry.fn_id, &entry.data).await; 135 | match out { 136 | Some(d) => Ok(d), 137 | None => { 138 | warn!( 139 | "FN not found for cmd sm_id={}, fn_id={} at log_id={}", 140 | entry.sm_id, entry.fn_id, entry.id 141 | ); 142 | Err(ExecError::FnNotFound(entry.sm_id, entry.fn_id)) 143 | } 144 | } 145 | } 146 | _ => { 147 | match self.subs.get_mut(&entry.sm_id) { 148 | Some(sm) => { 149 | let out = sm.as_mut().fn_dispatch_cmd(entry.fn_id, &entry.data).await; 150 | match out { 151 | Some(data) => Ok(data), 152 | None => { 153 | warn!( 154 | "FN not found for cmd sm_id={}, fn_id={} at log_id={}", 155 | entry.sm_id, entry.fn_id, entry.id 156 | ); 157 | Err(ExecError::FnNotFound(entry.sm_id, entry.fn_id)) 158 | } 159 | } 160 | } 161 | None => { 162 | warn!( 163 | "SM not found for cmd sm_id={} at log_id={}, have SMs: {:?}", 164 | entry.sm_id, 165 | entry.id, 166 | self.subs.keys().collect::>() 167 | ); 168 | Err(ExecError::SmNotFound(entry.sm_id)) 169 | } 170 | } 171 | } 172 | } 173 | } 174 | pub async fn exec_qry(&self, entry: &LogEntry) -> ExecResult { 175 | match entry.sm_id { 176 | CONFIG_SM_ID => { 177 | let out = self.configs.fn_dispatch_qry(entry.fn_id, &entry.data).await; 178 | match out { 179 | Some(d) => Ok(d), 180 | None => { 181 | warn!( 182 | "FN not found for qry sm_id={}, fn_id={} at log_id={}", 183 | entry.sm_id, entry.fn_id, entry.id 184 | ); 185 | Err(ExecError::FnNotFound(entry.sm_id, entry.fn_id)) 186 | } 187 | } 188 | } 189 | _ => { 190 | match self.subs.get(&entry.sm_id) { 191 | Some(sm) => { 192 | let out = sm.fn_dispatch_qry(entry.fn_id, &entry.data).await; 193 | match out { 194 | Some(data) => Ok(data), 195 | None => { 196 | warn!( 197 | "FN not found for qry sm_id={}, 
fn_id={} at log_id={}", 198 | entry.sm_id, entry.fn_id, entry.id 199 | ); 200 | Err(ExecError::FnNotFound(entry.sm_id, entry.fn_id)) 201 | } 202 | } 203 | } 204 | None => { 205 | warn!( 206 | "SM not found for qry sm_id={} at log_id={}, have SMs: {:?}", 207 | entry.sm_id, 208 | entry.id, 209 | self.subs.keys().collect::>() 210 | ); 211 | Err(ExecError::SmNotFound(entry.sm_id)) 212 | } 213 | } 214 | } 215 | } 216 | } 217 | pub fn clear_subs(&mut self) { 218 | self.subs.clear() 219 | } 220 | pub fn has_sub(&self, id: &u64) -> bool { 221 | self.subs.contains_key(&id) 222 | } 223 | } 224 | 225 | impl Error for ExecError {} 226 | impl Display for ExecError { 227 | fn fmt(&self, f: &mut Formatter) -> fmt::Result { 228 | write!(f, "{:?}", self) 229 | } 230 | } 231 | -------------------------------------------------------------------------------- /src/rpc/proto.rs: -------------------------------------------------------------------------------- 1 | #[macro_export] 2 | macro_rules! dispatch_rpc_service_functions { 3 | ($s:ty) => { 4 | use $crate::bytes::BytesMut; 5 | impl $crate::rpc::RPCService for $s { 6 | fn dispatch<'a>( 7 | &'a self, 8 | data: BytesMut, 9 | ) -> ::std::pin::Pin< 10 | Box< 11 | dyn Future< 12 | Output = Result<$crate::bytes::BytesMut, $crate::rpc::RPCRequestError>, 13 | > + Send 14 | + 'a, 15 | >, 16 | > 17 | where 18 | Self: Sized, 19 | { 20 | self.inner_dispatch(data) 21 | } 22 | fn register_shortcut_service( 23 | &self, 24 | service_ptr: usize, 25 | server_id: u64, 26 | service_id: u64, 27 | ) -> ::std::pin::Pin + Send>> { 28 | async move { 29 | let mut cbs = RPC_SVRS.write().await; 30 | let service = unsafe { Arc::from_raw(service_ptr as *const $s) }; 31 | cbs.insert((server_id, service_id), service); 32 | } 33 | .boxed() 34 | } 35 | fn service_symbol(&self) -> &'static str { 36 | stringify!($s) 37 | } 38 | } 39 | }; 40 | } 41 | 42 | // this macro expansion design took credits from tarpc by Google Inc. 43 | #[macro_export] 44 | macro_rules! service { 45 | ( 46 | $( 47 | $(#[$attr:meta])* 48 | rpc $fn_name:ident( $( $arg:ident : $in_:ty ),* ) $(-> $out:ty)*; 49 | )* 50 | ) => { 51 | service! {{ 52 | $( 53 | $(#[$attr])* 54 | rpc $fn_name( $( $arg : $in_ ),* ) $(-> $out)*; 55 | )* 56 | }} 57 | }; 58 | ( 59 | { 60 | $(#[$attr:meta])* 61 | rpc $fn_name:ident( $( $arg:ident : $in_:ty ),* ); // No return, no error 62 | 63 | $( $unexpanded:tt )* 64 | } 65 | $( $expanded:tt )* 66 | ) => { 67 | service! { 68 | { $( $unexpanded )* } 69 | 70 | $( $expanded )* 71 | 72 | $(#[$attr])* 73 | rpc $fn_name( $( $arg : $in_ ),* ) -> (); 74 | } 75 | }; 76 | ( 77 | { 78 | $(#[$attr:meta])* 79 | rpc $fn_name:ident( $( $arg:ident : $in_:ty ),* ) -> $out:ty; 80 | 81 | $( $unexpanded:tt )* 82 | } 83 | $( $expanded:tt )* 84 | ) => { 85 | service! { 86 | { $( $unexpanded )* } 87 | 88 | $( $expanded )* 89 | 90 | $(#[$attr])* 91 | rpc $fn_name( $( $arg : $in_ ),* ) -> $out; 92 | } 93 | }; 94 | ( 95 | {} // all expanded 96 | $( 97 | $(#[$attr:meta])* 98 | rpc $fn_name:ident ( $( $arg:ident : $in_:ty ),* ) -> $out:ty; 99 | )* 100 | ) => { 101 | 102 | use std::sync::Arc; 103 | use $crate::rpc::*; 104 | #[allow(unused_imports)] 105 | use futures::prelude::*; 106 | use std::pin::Pin; 107 | use bifrost_proc_macro::{deref_tuple_types, adjust_caller_identifiers, adjust_function_signature}; 108 | 109 | lazy_static! 
{ 110 | pub static ref RPC_SVRS: 111 | async_std::sync::RwLock<::std::collections::BTreeMap<(u64, u64), Arc>> 112 | = async_std::sync::RwLock::new(::std::collections::BTreeMap::new()); 113 | } 114 | 115 | pub trait Service : RPCService { 116 | $( 117 | $(#[$attr])* 118 | adjust_function_signature!{ 119 | fn $fn_name<'a>(&self, $($arg:$in_),*) -> ::futures::future::BoxFuture<'a, $out>; 120 | } 121 | )* 122 | fn inner_dispatch<'a>(&'a self, data: $crate::bytes::BytesMut) -> Pin> + Send + 'a>> { 123 | let (func_id, body) = read_u64_head(data); 124 | async move { 125 | match func_id as usize { 126 | $(::bifrost_plugins::hash_ident!($fn_name) => { 127 | if let Some(data) = $crate::utils::serde::deserialize(body.as_ref()) { 128 | #[allow(unused_parens)] 129 | let tuple : deref_tuple_types!(($($in_,)*)) = data; 130 | let adjust_caller_identifiers!($($arg: $in_),*) = tuple; 131 | let f_result = self.$fn_name($($arg,)*).await; 132 | let res_data = $crate::bytes::BytesMut::from($crate::utils::serde::serialize(&f_result).as_slice()); 133 | Ok(res_data) 134 | } else { 135 | Err(RPCRequestError::BadRequest) 136 | } 137 | }),* 138 | _ => { 139 | Err(RPCRequestError::FunctionIdNotFound) 140 | } 141 | } 142 | }.boxed() 143 | } 144 | } 145 | 146 | #[allow(dead_code)] 147 | pub async fn get_local(server_id: u64, service_id: u64) -> Option> { 148 | let svrs = RPC_SVRS.read().await; 149 | match svrs.get(&(server_id, service_id)) { 150 | Some(s) => Some(s.clone()), 151 | _ => None 152 | } 153 | } 154 | 155 | #[allow(dead_code)] 156 | pub struct AsyncServiceClient { 157 | pub service_id: u64, 158 | pub client: Arc, 159 | } 160 | 161 | #[allow(dead_code)] 162 | impl AsyncServiceClient { 163 | $( 164 | #[allow(non_camel_case_types)] 165 | $(#[$attr])* 166 | pub async fn $fn_name(&self, $($arg:$in_),*) -> Result<$out, RPCError> { 167 | ImmeServiceClient::$fn_name(self.service_id, &self.client, $($arg),*).await 168 | } 169 | )* 170 | } 171 | impl ServiceClient for AsyncServiceClient { 172 | fn new_instance_with_service_id(service_id: u64, client: &Arc) -> Self { 173 | AsyncServiceClient{ 174 | service_id: service_id, 175 | client: client.clone() 176 | } 177 | } 178 | fn server_id(&self) -> u64 { 179 | self.client.server_id 180 | } 181 | } 182 | pub struct ImmeServiceClient; 183 | impl ImmeServiceClient { 184 | $( 185 | $(#[$attr])* 186 | /// Judgement: Use data ownership transfer instead of borrowing. 187 | /// Some applications highly depend on RPC shortcut to achieve performance advantages. 188 | /// Cloning for shortcut will significantly increase overhead. Eg. Hivemind immutable queue 189 | pub async fn $fn_name(service_id: u64, client: &Arc, $($arg:$in_),*) -> Result<$out, RPCError> { 190 | if let Some(ref local) = get_local(client.server_id, service_id).await { 191 | Ok(local.$fn_name($($arg),*).await) 192 | } else { 193 | let req_data = ($($arg,)*); 194 | let req_data_bytes = $crate::bytes::BytesMut::from($crate::utils::serde::serialize(&req_data).as_slice()); 195 | let req_bytes = prepend_u64(::bifrost_plugins::hash_ident!($fn_name) as u64, req_data_bytes); 196 | let res_bytes = RPCClient::send_async(Pin::new(&*client), service_id, req_bytes).await; 197 | if let Ok(res_bytes) = res_bytes { 198 | if let Some(data) = $crate::utils::serde::deserialize(&res_bytes) { 199 | Ok(data) 200 | } else { 201 | Err(RPCError::ClientCannotDecodeResponse) 202 | } 203 | } else { 204 | Err(res_bytes.err().unwrap()) 205 | } 206 | } 207 | } 208 | )* 209 | } 210 | } 211 | } 212 | 213 | #[macro_export] 214 | macro_rules! 
service_with_id { 215 | ($s:ty, $id:expr) => { 216 | impl $crate::rpc::RPCServiceWithId for $s { 217 | const SERVICE_ID: u64 = $id; 218 | } 219 | impl $crate::rpc::ServiceClientWithId for AsyncServiceClient { 220 | const SERVICE_ID: u64 = $id; 221 | } 222 | }; 223 | } 224 | 225 | mod syntax_test { 226 | service! { 227 | rpc test(a: u32, b: u32) -> bool; 228 | rpc test2(a: u32); 229 | rpc test3(a: u32, b: u32, c: u32, d: u32); 230 | rpc test4(a: u32, b: Vec, c: &Vec, d: u32); 231 | } 232 | } 233 | 234 | #[cfg(test)] 235 | mod struct_test { 236 | use serde::{Deserialize, Serialize}; 237 | #[derive(Serialize, Deserialize, Debug, Clone)] 238 | pub struct A { 239 | b: u32, 240 | d: u64, 241 | e: String, 242 | f: f32, 243 | } 244 | 245 | service! { 246 | rpc test(a: A, b: u32) -> bool; 247 | } 248 | } 249 | -------------------------------------------------------------------------------- /MEMBERSHIP_GUIDE.md: -------------------------------------------------------------------------------- 1 | # Membership Guide 2 | 3 | This document explains how membership works in Bifrost and the difference between **Raft Cluster Membership** and the **Membership Service**. 4 | 5 | ## Two Types of Membership 6 | 7 | Bifrost has two distinct membership systems that serve different purposes: 8 | 9 | ### 1. Raft Cluster Membership (PERSISTED ✅) 10 | 11 | **Location**: `src/raft/state_machine/configs.rs` 12 | 13 | **Purpose**: Tracks which servers are part of the Raft consensus cluster 14 | 15 | **Persistence**: **YES** - Fully persisted to disk via: 16 | - Write-Ahead Log (WAL) 17 | - Snapshots 18 | 19 | **Members**: Raft servers that participate in consensus (leader election, log replication) 20 | 21 | **Operations**: 22 | - `new_member_(address)` - Add a Raft server to the cluster 23 | - `del_member_(address)` - Remove a Raft server from the cluster 24 | - `member_address()` - Query all Raft cluster members 25 | 26 | **Recovery**: On restart, Raft cluster membership is recovered from: 27 | 1. Latest snapshot on disk 28 | 2. WAL log replay 29 | 30 | **Why Persisted?**: Critical for Raft consensus. The cluster must know its membership to: 31 | - Calculate quorum (majority) 32 | - Elect leaders 33 | - Replicate logs correctly 34 | 35 | **Code Example**: 36 | ```rust 37 | // These members are persisted and recovered on restart 38 | service.join(&vec!["node1:5000".to_string()]).await; 39 | ``` 40 | 41 | ### 2. Membership Service (NOT PERSISTED ❌) 42 | 43 | **Location**: `src/membership/server.rs` 44 | 45 | **Purpose**: Tracks member groups, heartbeat status, and online/offline state 46 | 47 | **Persistence**: **NO** - Intentionally ephemeral 48 | 49 | **Members**: Applications or clients using the membership service for: 50 | - Group membership 51 | - Leader election within groups 52 | - Liveness tracking 53 | - Membership change notifications 54 | 55 | **Operations**: 56 | - `join(address)` - Join as a member 57 | - `leave(id)` - Leave the membership service 58 | - `join_group(group_name, id)` - Join a group 59 | - `leave_group(group, id)` - Leave a group 60 | - `ping(id)` - Send heartbeat 61 | 62 | **Recovery**: On restart, starts with **empty state** and rebuilds through: 63 | 1. Members calling `join()` again 64 | 2. Heartbeat `ping()` messages 65 | 3. 
Group operations 66 | 67 | **Why NOT Persisted?**: 68 | - Membership should reflect **current network reality** 69 | - Stale disk state would be misleading after crashes 70 | - Members must actively rejoin to prove they're alive 71 | - Groups are transient application-level constructs 72 | 73 | **Code Example**: 74 | ```rust 75 | // After restart, this state is gone - members must rejoin 76 | let client = MemberClient::new(...).await; 77 | client.join().await; // Must be called again after restart 78 | client.join_group("workers".to_string()).await; 79 | ``` 80 | 81 | ## Comparison Table 82 | 83 | | Feature | Raft Cluster Membership | Membership Service | 84 | |---------|------------------------|-------------------| 85 | | **Persisted** | ✅ Yes (WAL + Snapshot) | ❌ No (Always fresh) | 86 | | **Purpose** | Raft consensus | Application groups/heartbeats | 87 | | **Scope** | Cluster-wide | Per-service | 88 | | **Recovery** | From disk | From network rediscovery | 89 | | **State Machine ID** | `CONFIG_SM_ID` (1) | `DEFAULT_SERVICE_ID` | 90 | | **Critical for Raft** | ✅ Yes | ❌ No | 91 | | **Survives Restart** | ✅ Yes | ❌ No | 92 | 93 | ## How They Work Together 94 | 95 | ``` 96 | ┌─────────────────────────────────────────────────────────┐ 97 | │ Bifrost Cluster │ 98 | ├─────────────────────────────────────────────────────────┤ 99 | │ │ 100 | │ Raft Cluster Membership (Persisted) │ 101 | │ ┌─────────┐ ┌─────────┐ ┌─────────┐ │ 102 | │ │ Server1 │ │ Server2 │ │ Server3 │ │ 103 | │ │ :5000 │ │ :5001 │ │ :5002 │ │ 104 | │ └────┬────┘ └────┬────┘ └────┬────┘ │ 105 | │ │ │ │ │ 106 | │ ├────────────┴────────────┤ │ 107 | │ │ Raft Consensus │ │ 108 | │ │ (Leader Election, │ │ 109 | │ │ Log Replication) │ │ 110 | │ └─────────────────────────┘ │ 111 | │ │ 112 | │ Membership Service (NOT Persisted - Fresh on restart) │ 113 | │ ┌─────────────────────────────────────────────┐ │ 114 | │ │ Members: { │ │ 115 | │ │ App1 -> online, groups: [workers] │ │ 116 | │ │ App2 -> online, groups: [workers] │ │ 117 | │ │ App3 -> offline, groups: [storage] │ │ 118 | │ │ } │ │ 119 | │ │ Groups: { │ │ 120 | │ │ workers -> leader: App1, members: [1,2] │ │ 121 | │ │ storage -> leader: None, members: [3] │ │ 122 | │ │ } │ │ 123 | │ └─────────────────────────────────────────────┘ │ 124 | │ ↑ This is cleared on restart │ 125 | └─────────────────────────────────────────────────────────┘ 126 | ``` 127 | 128 | ## Startup Sequence After Restart 129 | 130 | ### Raft Cluster Membership (Automatic) 131 | ```rust 132 | // Server restarts 133 | let raft_service = RaftService::new(options); 134 | RaftService::start(&raft_service).await; 135 | 136 | // ✅ Cluster membership automatically recovered from disk 137 | // ✅ Knows about Server1, Server2, Server3 138 | // ✅ Can participate in consensus immediately 139 | ``` 140 | 141 | ### Membership Service (Manual Rejoin Required) 142 | ```rust 143 | // Server restarts 144 | let membership_client = MemberClient::new(...).await; 145 | 146 | // ❌ Membership service starts EMPTY 147 | // ❌ Previous groups/members are forgotten 148 | 149 | // ✅ Applications must rejoin 150 | membership_client.join().await; // Rejoin as member 151 | membership_client.join_group("workers").await; // Rejoin group 152 | membership_client.start_heartbeat(); // Start sending pings 153 | 154 | // ✅ Membership service rebuilds state from these actions 155 | ``` 156 | 157 | ## Why This Design? 
158 | 159 | ### Raft Cluster Membership: Persisted 160 | - **Safety**: Raft consensus requires consistent membership for quorum 161 | - **Correctness**: Must survive crashes to maintain cluster integrity 162 | - **Availability**: Cluster can restart without manual intervention 163 | 164 | ### Membership Service: NOT Persisted 165 | - **Freshness**: Ensures membership reflects current reality 166 | - **Simplicity**: No stale data to reconcile 167 | - **Self-Healing**: Dead members naturally disappear (no heartbeat = offline) 168 | - **Flexibility**: Applications control their own membership lifecycle 169 | 170 | ## Code Examples 171 | 172 | ### Example 1: Raft Member Survives Restart 173 | 174 | ```rust 175 | // Initial setup 176 | let raft = RaftService::new(Options { 177 | storage: Storage::DISK(disk_opts), 178 | address: "node1:5000".to_string(), 179 | service_id: DEFAULT_SERVICE_ID, 180 | }); 181 | raft.join(&vec!["node2:5000".to_string()]).await; 182 | 183 | // --- CRASH AND RESTART --- 184 | 185 | // After restart 186 | let raft = RaftService::new(Options { 187 | storage: Storage::DISK(disk_opts), // Same disk path 188 | address: "node1:5000".to_string(), 189 | service_id: DEFAULT_SERVICE_ID, 190 | }); 191 | RaftService::start(&raft).await; 192 | 193 | // ✅ Still knows about node2:5000 (recovered from disk) 194 | // ✅ Can participate in cluster immediately 195 | ``` 196 | 197 | ### Example 2: Membership Service Starts Fresh 198 | 199 | ```rust 200 | // Initial setup 201 | let member = MemberClient::new(...).await; 202 | member.join().await; 203 | member.join_group("workers").await; 204 | 205 | // --- CRASH AND RESTART --- 206 | 207 | // After restart 208 | let member = MemberClient::new(...).await; 209 | 210 | // ❌ Not in any groups 211 | // ❌ Not registered as a member 212 | 213 | // Must rejoin explicitly 214 | member.join().await; // Required 215 | member.join_group("workers").await; // Required 216 | member.start_heartbeat(); // Required 217 | ``` 218 | 219 | ## Best Practices 220 | 221 | ### For Raft Cluster Members 222 | 1. Use `Storage::DISK` for production deployments 223 | 2. Membership changes are committed via Raft consensus 224 | 3. No need to rejoin after restart 225 | 226 | ### For Membership Service Users 227 | 1. **Always rejoin after restart**: 228 | ```rust 229 | async fn on_startup() { 230 | member_client.join().await; 231 | for group in my_groups { 232 | member_client.join_group(group).await; 233 | } 234 | member_client.start_heartbeat(); 235 | } 236 | ``` 237 | 238 | 2. **Handle disconnections gracefully** - May need to rejoin 239 | 240 | 3. **Monitor membership changes** via subscriptions: 241 | ```rust 242 | client.on_any_member_joined(|member, version| { 243 | println!("New member joined: {:?}", member); 244 | }).await; 245 | ``` 246 | 247 | ## Summary 248 | 249 | - **Raft Cluster Membership**: Persisted for consensus correctness ✅ 250 | - **Membership Service**: NOT persisted, learns from network ❌ 251 | - Both serve different purposes and have different persistence requirements 252 | - This design ensures both safety (for Raft) and freshness (for membership) 253 | 254 | -------------------------------------------------------------------------------- /src/raft/state_machine/macros.rs: -------------------------------------------------------------------------------- 1 | //TODO: Use higher order macro to merge with rpc service! macro when possible to do this in Rust. 
2 | //Current major problem is inner repeated macro will be recognized as outer macro which breaks expand 3 | 4 | #[macro_export] 5 | macro_rules! raft_trait_fn { 6 | (qry $fn_name:ident ( $( $arg:ident : $in_:ty ),* ) -> $out:ty) => { 7 | fn $fn_name<'a>(&'a self, $($arg:$in_),*) -> ::futures::future::BoxFuture<$out>; 8 | }; 9 | (cmd $fn_name:ident ( $( $arg:ident : $in_:ty ),* ) -> $out:ty) => { 10 | fn $fn_name<'a>(&'a mut self, $($arg:$in_),*) -> ::futures::future::BoxFuture<$out>; 11 | }; 12 | (sub $fn_name:ident ( $( $arg:ident : $in_:ty ),* ) -> $out:ty) => {} 13 | } 14 | 15 | #[macro_export] 16 | macro_rules! raft_client_fn { 17 | (sub $fn_name:ident ( $( $arg:ident : $in_:ty ),* ) -> $out:ty) => { 18 | pub fn $fn_name(&self, f: F, $($arg:$in_),* ) 19 | -> BoxFuture, $crate::raft::state_machine::master::ExecError>> 20 | where F: Fn($out) -> BoxFuture<'static, ()> + 'static + Send + Sync 21 | { 22 | self.client.subscribe( 23 | self.sm_id, 24 | $fn_name::new($($arg,)*), 25 | f 26 | ).boxed() 27 | } 28 | }; 29 | ($others:ident $fn_name:ident ( $( $arg:ident : $in_:ty ),* ) -> $out:ty) => { 30 | pub async fn $fn_name(&self, $($arg:$in_),*) -> Result<$out, $crate::raft::state_machine::master::ExecError> { 31 | self.client.execute( 32 | self.sm_id, 33 | $fn_name::new($($arg,)*) 34 | ).await 35 | } 36 | }; 37 | } 38 | 39 | #[macro_export] 40 | macro_rules! raft_fn_op_type { 41 | (qry) => { 42 | $crate::raft::state_machine::OpType::QUERY 43 | }; 44 | (cmd) => { 45 | $crate::raft::state_machine::OpType::COMMAND 46 | }; 47 | (sub) => { 48 | $crate::raft::state_machine::OpType::SUBSCRIBE 49 | }; 50 | } 51 | 52 | #[macro_export] 53 | macro_rules! raft_dispatch_fn { 54 | ($fn_name:ident $s: ident $d: ident ( $( $arg:ident : $in_:ty ),* )) => {{ 55 | let decoded: ($($in_,)*) = match $crate::utils::serde::deserialize($d) { 56 | Some(decoded) => decoded, 57 | None => panic!("Failed to deserialize function call data for function: {}, s: {}, d: {}", stringify!($fn_name), stringify!($s), stringify!($d)), 58 | }; 59 | let ($($arg,)*) = decoded; 60 | let f_result = $s.$fn_name($($arg),*).await; 61 | Some($crate::utils::serde::serialize(&f_result)) 62 | }}; 63 | } 64 | 65 | #[macro_export] 66 | macro_rules! raft_dispatch_cmd { 67 | (cmd $fn_name:ident $s: ident $d: ident ( $( $arg:ident : $in_:ty ),* )) => { 68 | raft_dispatch_fn!($fn_name $s $d( $( $arg : $in_ ),* )) 69 | }; 70 | ($others:ident $fn_name:ident $s: ident $d: ident ( $( $arg:ident : $in_:ty ),* )) => {None}; 71 | } 72 | 73 | #[macro_export] 74 | macro_rules! raft_dispatch_qry { 75 | (qry $fn_name:ident $s: ident $d: ident ( $( $arg:ident : $in_:ty ),* )) => { 76 | raft_dispatch_fn!($fn_name $s $d( $( $arg : $in_ ),* )) 77 | }; 78 | ($others:ident $fn_name:ident $s: ident $d: ident ( $( $arg:ident : $in_:ty ),* )) => {None}; 79 | } 80 | 81 | #[macro_export] 82 | macro_rules! raft_sm_complete { 83 | () => { 84 | fn fn_dispatch_cmd<'a>( 85 | &'a mut self, 86 | fn_id: u64, 87 | data: &'a Vec, 88 | ) -> ::futures::future::BoxFuture<'a, Option>> { 89 | self.dispatch_cmd_(fn_id, data) 90 | } 91 | fn fn_dispatch_qry<'a>( 92 | &'a self, 93 | fn_id: u64, 94 | data: &'a Vec, 95 | ) -> ::futures::future::BoxFuture<'a, Option>> { 96 | self.dispatch_qry_(fn_id, data) 97 | } 98 | fn op_type(&mut self, fn_id: u64) -> Option<$crate::raft::state_machine::OpType> { 99 | self.op_type_(fn_id) 100 | } 101 | }; 102 | } 103 | 104 | #[macro_export] 105 | macro_rules! 
raft_state_machine { 106 | ( 107 | $( 108 | $(#[$attr:meta])* 109 | def $smt:ident $fn_name:ident( $( $arg:ident : $in_:ty ),* ) $(-> $out:ty)* ; 110 | )* 111 | ) => { 112 | raft_state_machine! {{ 113 | $( 114 | $(#[$attr])* 115 | def $smt $fn_name( $( $arg : $in_ ),* ) $(-> $out)*; 116 | )* 117 | }} 118 | }; 119 | ( 120 | { 121 | $(#[$attr:meta])* 122 | def $smt:ident $fn_name:ident( $( $arg:ident : $in_:ty ),* ); // No return 123 | 124 | $( $unexpanded:tt )* 125 | } 126 | $( $expanded:tt )* 127 | ) => { 128 | raft_state_machine! { 129 | { $( $unexpanded )* } 130 | 131 | $( $expanded )* 132 | 133 | $(#[$attr])* 134 | def $smt $fn_name( $( $arg : $in_ ),* ) -> (); 135 | } 136 | }; 137 | ( 138 | { 139 | $(#[$attr:meta])* 140 | def $smt:ident $fn_name:ident( $( $arg:ident : $in_:ty ),* ) -> $out:ty; 141 | 142 | $( $unexpanded:tt )* 143 | } 144 | $( $expanded:tt )* 145 | ) => { 146 | raft_state_machine! { 147 | { $( $unexpanded )* } 148 | 149 | $( $expanded )* 150 | 151 | $(#[$attr])* 152 | def $smt $fn_name( $( $arg : $in_ ),* ) -> $out; 153 | } 154 | }; 155 | ( 156 | {} // all expanded 157 | $( 158 | $(#[$attr:meta])* 159 | def $smt:ident $fn_name:ident ( $( $arg:ident : $in_:ty ),* ) -> $out:ty; 160 | )* 161 | ) => { 162 | #[allow(unused_imports)] 163 | use futures::prelude::*; 164 | use futures::future::BoxFuture; 165 | 166 | #[allow(dead_code)] 167 | #[allow(unused_imports)] 168 | pub mod commands { 169 | use super::*; 170 | use futures::prelude::*; 171 | use serde::{Serialize, Deserialize}; 172 | $( 173 | #[derive(Serialize, Deserialize, Debug)] 174 | #[allow(non_camel_case_types)] 175 | pub struct $fn_name { 176 | pub data: Vec 177 | } 178 | impl $crate::raft::RaftMsg<$out> for $fn_name { 179 | fn encode(self) -> (u64, $crate::raft::state_machine::OpType, Vec) { 180 | ( 181 | ::bifrost_plugins::hash_ident!($fn_name) as u64, 182 | raft_fn_op_type!($smt), 183 | self.data 184 | ) 185 | } 186 | fn decode_return(data: &Vec) -> $out { 187 | $crate::utils::serde::deserialize(data).unwrap() 188 | } 189 | } 190 | impl $fn_name { 191 | pub fn new($($arg:&$in_),*) -> $fn_name { 192 | let req_data = ($($arg,)*); 193 | $fn_name { 194 | data: $crate::utils::serde::serialize(&req_data) 195 | } 196 | } 197 | } 198 | )* 199 | } 200 | 201 | #[allow(dead_code)] 202 | #[allow(unused_variables)] 203 | pub trait StateMachineCmds: $crate::raft::state_machine::StateMachineCtl { 204 | $( 205 | $(#[$attr])* 206 | raft_trait_fn!($smt $fn_name( $( $arg : $in_ ),* ) -> $out); 207 | )* 208 | fn op_type_(&self, fn_id: u64) -> Option<$crate::raft::state_machine::OpType> { 209 | match fn_id as usize { 210 | $(::bifrost_plugins::hash_ident!($fn_name) => { 211 | Some(raft_fn_op_type!($smt)) 212 | }),* 213 | _ => { 214 | debug!("Undefined function id: {}", fn_id); 215 | None 216 | } 217 | } 218 | } 219 | fn dispatch_cmd_<'a>(&'a mut self, fn_id: u64, data: &'a Vec) -> BoxFuture>> { 220 | async move { 221 | match fn_id as usize { 222 | $(::bifrost_plugins::hash_ident!($fn_name) => { 223 | raft_dispatch_cmd!($smt $fn_name self data( $( $arg : $in_ ),* )) 224 | }),* 225 | _ => { 226 | debug!("Undefined function id: {}. 
We have {}", fn_id, concat!(stringify!($($fn_name),*))); 227 | None 228 | } 229 | } 230 | }.boxed() 231 | } 232 | fn dispatch_qry_<'a>(&'a self, fn_id: u64, data: &'a Vec) -> BoxFuture>> { 233 | async move { 234 | match fn_id as usize { 235 | $(::bifrost_plugins::hash_ident!($fn_name) => { 236 | raft_dispatch_qry!($smt $fn_name self data( $( $arg : $in_ ),* )) 237 | }),* 238 | _ => { 239 | debug!("Undefined function id: {}", fn_id); 240 | None 241 | } 242 | } 243 | }.boxed() 244 | } 245 | } 246 | 247 | #[allow(dead_code)] 248 | #[allow(unused_imports)] 249 | pub mod client { 250 | use super::*; 251 | use std::sync::Arc; 252 | use super::commands::*; 253 | use $crate::raft::client::*; 254 | use $crate::raft::state_machine::master::ExecError; 255 | use $crate::raft::state_machine::StateMachineClient; 256 | use $crate::raft::client::{RaftClient, SubscriptionError, SubscriptionReceipt}; 257 | 258 | pub struct SMClient { 259 | client: Arc, 260 | sm_id: u64 261 | } 262 | impl SMClient { 263 | $( 264 | $(#[$attr])* 265 | raft_client_fn!($smt $fn_name( $( $arg : &$in_ ),* ) -> $out); 266 | )* 267 | pub fn new(sm_id: u64, client: &Arc) -> Self { 268 | Self { 269 | client: client.clone(), 270 | sm_id: sm_id 271 | } 272 | } 273 | } 274 | impl StateMachineClient for SMClient { 275 | fn new_instance (sm_id: u64, client: &Arc) -> Self { 276 | Self::new(sm_id, client) 277 | } 278 | } 279 | } 280 | }; 281 | } 282 | -------------------------------------------------------------------------------- /tests/single_node_recovery_test.rs: -------------------------------------------------------------------------------- 1 | /// Test for single-node Raft cluster recovery from disk 2 | /// 3 | /// This test verifies the fix for the bug where single-node clusters 4 | /// failed to elect themselves as leader after recovering from persistent storage. 
5 | 6 | use bifrost::raft::{RaftService, Options, Storage, DEFAULT_SERVICE_ID, client::RaftClient}; 7 | use bifrost::raft::disk::DiskOptions; 8 | use bifrost::rpc::Server; 9 | use std::time::Duration; 10 | use tokio::time::sleep; 11 | 12 | #[tokio::test(flavor = "multi_thread")] 13 | async fn test_single_node_cluster_recovery_becomes_leader() { 14 | let _ = env_logger::try_init(); 15 | 16 | // Use test-specific directory 17 | let data_path = "/tmp/bifrost_test_single_node_recovery_18000".to_string(); 18 | // Clean up any existing data from previous runs 19 | let _ = std::fs::remove_dir_all(&data_path); 20 | std::fs::create_dir_all(&data_path).unwrap(); 21 | 22 | let address = "127.0.0.1:18000".to_string(); 23 | 24 | println!("=== Phase 1: Create initial single-node cluster ==="); 25 | 26 | let initial_leader_id; 27 | 28 | // Phase 1: Create and run a single-node cluster with disk storage 29 | { 30 | let raft_service = RaftService::new(Options { 31 | storage: Storage::DISK(DiskOptions { 32 | path: data_path.clone(), 33 | take_snapshots: true, 34 | append_logs: true, 35 | trim_logs: false, 36 | snapshot_log_threshold: 5, 37 | log_compaction_threshold: 10, 38 | }), 39 | address: address.clone(), 40 | service_id: DEFAULT_SERVICE_ID, 41 | }); 42 | 43 | let server = Server::new(&address); 44 | Server::listen_and_resume(&server).await; 45 | server.register_service(&raft_service).await; 46 | 47 | // Start and bootstrap 48 | let started = RaftService::start(&raft_service, true).await; 49 | assert!(started, "Phase 1: Should start successfully"); 50 | 51 | raft_service.bootstrap().await; 52 | sleep(Duration::from_millis(500)).await; 53 | 54 | // Verify it's a leader 55 | assert!(raft_service.is_leader(), "Phase 1: Should be leader"); 56 | initial_leader_id = raft_service.leader_id().await; 57 | assert!(initial_leader_id != 0, "Phase 1: Should have valid leader ID"); 58 | 59 | println!("Phase 1: Leader ID = {}, Server ID = {}", initial_leader_id, raft_service.id); 60 | 61 | // Perform some operations to generate logs 62 | let client = RaftClient::new(&vec![address.clone()], DEFAULT_SERVICE_ID).await.unwrap(); 63 | println!("Phase 1: Created Raft client"); 64 | 65 | // Generate some logs by performing state machine operations 66 | // Use the config state machine to add/remove a dummy member 67 | use bifrost::raft::state_machine::configs::commands; 68 | 69 | // Add a dummy member (will generate a log entry) 70 | let result1 = client.execute( 71 | bifrost::raft::state_machine::configs::CONFIG_SM_ID, 72 | commands::new_member_::new(&"dummy:9999".to_string()) 73 | ).await; 74 | println!("Phase 1: Added dummy member: {:?}", result1.is_ok()); 75 | 76 | // Remove the dummy member (another log entry) 77 | let result2 = client.execute( 78 | bifrost::raft::state_machine::configs::CONFIG_SM_ID, 79 | commands::del_member_::new(&"dummy:9999".to_string()) 80 | ).await; 81 | println!("Phase 1: Removed dummy member: {:?}", result2.is_ok()); 82 | 83 | // Wait for persistence 84 | sleep(Duration::from_millis(1000)).await; 85 | 86 | // Check that logs exist 87 | let num_logs = raft_service.num_logs().await; 88 | println!("Phase 1: Number of logs: {}", num_logs); 89 | assert!(num_logs > 0, "Phase 1: Should have generated some logs"); 90 | 91 | // Shutdown gracefully 92 | println!("Phase 1: Shutting down..."); 93 | drop(client); // Drop client first 94 | raft_service.shutdown().await; 95 | server.shutdown().await; 96 | sleep(Duration::from_secs(2)).await; 97 | 98 | // Prevent runtime drop panic in test context 99 | 
std::mem::forget(raft_service); 100 | std::mem::forget(server); 101 | } 102 | 103 | // Give OS time to release resources 104 | sleep(Duration::from_millis(500)).await; 105 | 106 | println!("\n=== Phase 2: Restart and recover from disk ==="); 107 | 108 | // Phase 2: Restart the server with same storage - THIS IS THE BUG FIX TEST 109 | { 110 | let raft_service2 = RaftService::new(Options { 111 | storage: Storage::DISK(DiskOptions { 112 | path: data_path.clone(), // Same path - will recover state 113 | take_snapshots: true, 114 | append_logs: true, 115 | trim_logs: false, 116 | snapshot_log_threshold: 5, 117 | log_compaction_threshold: 10, 118 | }), 119 | address: address.clone(), // Same address 120 | service_id: DEFAULT_SERVICE_ID, 121 | }); 122 | 123 | let server2 = Server::new(&address); 124 | Server::listen_and_resume(&server2).await; 125 | server2.register_service(&raft_service2).await; 126 | 127 | // Start - should recover and immediately become leader 128 | let started2 = RaftService::start(&raft_service2, true).await; 129 | assert!(started2, "Phase 2: Should start successfully"); 130 | 131 | // Give it a moment to stabilize 132 | sleep(Duration::from_millis(1000)).await; 133 | 134 | // THE KEY TEST: Should be leader immediately after recovery 135 | let is_leader = raft_service2.is_leader(); 136 | println!("Phase 2: Is leader? {}", is_leader); 137 | 138 | let leader_id2 = raft_service2.leader_id().await; 139 | println!("Phase 2: Leader ID = {}, Server ID = {}", leader_id2, raft_service2.id); 140 | 141 | assert!(is_leader, "Phase 2: CRITICAL - Should be leader after recovery (single-node cluster)"); 142 | assert!(leader_id2 != 0, "Phase 2: Should have valid leader ID after recovery"); 143 | assert_eq!(leader_id2, raft_service2.id, "Phase 2: Should be its own leader"); 144 | 145 | // Verify client can connect successfully 146 | let client2_result = RaftClient::new(&vec![address.clone()], DEFAULT_SERVICE_ID).await; 147 | assert!( 148 | client2_result.is_ok(), 149 | "Phase 2: Should be able to create RaftClient (leader should be elected)" 150 | ); 151 | 152 | println!("Phase 2: ✅ Single-node cluster successfully recovered and became leader!"); 153 | 154 | // Cleanup 155 | raft_service2.shutdown().await; 156 | server2.shutdown().await; 157 | sleep(Duration::from_millis(500)).await; 158 | 159 | // Prevent runtime drop panic in test context 160 | std::mem::forget(raft_service2); 161 | std::mem::forget(server2); 162 | } 163 | 164 | // Cleanup test directory 165 | let _ = std::fs::remove_dir_all(&data_path); 166 | 167 | println!("\n✅ TEST PASSED: Single-node cluster recovery works correctly!"); 168 | } 169 | 170 | #[tokio::test(flavor = "multi_thread")] 171 | async fn test_single_node_multiple_restart_cycles() { 172 | let _ = env_logger::try_init(); 173 | 174 | let data_path = "/tmp/bifrost_test_single_node_cycles_18001".to_string(); 175 | // Clean up any existing data from previous runs 176 | let _ = std::fs::remove_dir_all(&data_path); 177 | std::fs::create_dir_all(&data_path).unwrap(); 178 | 179 | let address = "127.0.0.1:18001".to_string(); 180 | 181 | // Perform 3 restart cycles 182 | for cycle in 1..=3 { 183 | println!("\n=== Cycle {} ===", cycle); 184 | 185 | let raft_service = RaftService::new(Options { 186 | storage: Storage::DISK(DiskOptions { 187 | path: data_path.clone(), 188 | take_snapshots: true, 189 | append_logs: true, 190 | trim_logs: false, 191 | snapshot_log_threshold: 5, 192 | log_compaction_threshold: 10, 193 | }), 194 | address: address.clone(), 195 | service_id: 
DEFAULT_SERVICE_ID, 196 | }); 197 | 198 | let server = Server::new(&address); 199 | Server::listen_and_resume(&server).await; 200 | server.register_service(&raft_service).await; 201 | 202 | let started = RaftService::start(&raft_service, true).await; 203 | assert!(started, "Cycle {}: Should start", cycle); 204 | 205 | if cycle == 1 { 206 | // First cycle: bootstrap 207 | raft_service.bootstrap().await; 208 | } 209 | 210 | sleep(Duration::from_millis(1000)).await; 211 | 212 | // Should be leader in all cycles 213 | assert!(raft_service.is_leader(), "Cycle {}: Should be leader", cycle); 214 | let leader_id = raft_service.leader_id().await; 215 | assert!(leader_id != 0, "Cycle {}: Should have valid leader ID", cycle); 216 | 217 | // Verify client works and generate some logs 218 | let client = RaftClient::new(&vec![address.clone()], DEFAULT_SERVICE_ID).await; 219 | assert!(client.is_ok(), "Cycle {}: RaftClient should connect", cycle); 220 | 221 | // Generate logs to ensure persistence 222 | if let Ok(ref client) = client { 223 | use bifrost::raft::state_machine::configs::commands; 224 | let dummy_addr = format!("dummy{}:9999", cycle); 225 | let _ = client.execute( 226 | bifrost::raft::state_machine::configs::CONFIG_SM_ID, 227 | commands::new_member_::new(&dummy_addr) 228 | ).await; 229 | let _ = client.execute( 230 | bifrost::raft::state_machine::configs::CONFIG_SM_ID, 231 | commands::del_member_::new(&dummy_addr) 232 | ).await; 233 | sleep(Duration::from_millis(500)).await; // Wait for persistence 234 | } 235 | 236 | let num_logs = raft_service.num_logs().await; 237 | println!("Cycle {}: ✅ Leader elected successfully, {} logs", cycle, num_logs); 238 | 239 | // Shutdown 240 | raft_service.shutdown().await; 241 | server.shutdown().await; 242 | sleep(Duration::from_secs(2)).await; 243 | 244 | // Prevent runtime drop panic in test context 245 | std::mem::forget(raft_service); 246 | std::mem::forget(server); 247 | } 248 | 249 | // Cleanup test directory 250 | let _ = std::fs::remove_dir_all(&data_path); 251 | 252 | println!("\n✅ All 3 restart cycles passed!"); 253 | } 254 | 255 | -------------------------------------------------------------------------------- /src/raft/state_machine/callback/server.rs: -------------------------------------------------------------------------------- 1 | use super::super::OpType; 2 | use super::*; 3 | use crate::raft::{RaftMsg, RaftService}; 4 | use crate::rpc; 5 | use async_std::sync::*; 6 | use bifrost_hasher::{hash_bytes, hash_str}; 7 | use futures::stream::FuturesUnordered; 8 | use serde; 9 | use serde::{Deserialize, Serialize}; 10 | use std::any::Any; 11 | use std::collections::{HashMap, HashSet}; 12 | use std::sync::Arc; 13 | 14 | pub struct Subscriber { 15 | pub session_id: u64, 16 | pub client: Arc, 17 | } 18 | 19 | pub struct Subscriptions { 20 | next_id: u64, 21 | subscribers: HashMap, 22 | suber_subs: HashMap>, //suber_id -> sub_id 23 | subscriptions: HashMap>, // key -> sub_id 24 | sub_suber: HashMap, 25 | sub_to_key: HashMap, //sub_id -> sub_key 26 | } 27 | 28 | impl Subscriptions { 29 | pub fn new() -> Subscriptions { 30 | Subscriptions { 31 | next_id: 0, 32 | subscribers: HashMap::new(), 33 | suber_subs: HashMap::new(), 34 | subscriptions: HashMap::new(), 35 | sub_suber: HashMap::new(), 36 | sub_to_key: HashMap::new(), 37 | } 38 | } 39 | 40 | pub async fn subscribe( 41 | &mut self, 42 | key: SubKey, 43 | address: &String, 44 | session_id: u64, 45 | ) -> Result { 46 | let suber_id = hash_str(address); 47 | let suber_exists = 
self.subscribers.contains_key(&suber_id); 48 | let sub_id = self.next_id; 49 | let (_, _, fn_id, pattern_id) = key; 50 | debug!( 51 | "Subscription {:?} from {}, address {}, fn {}, pattern {}", 52 | key, suber_id, address, fn_id, pattern_id 53 | ); 54 | let require_reload_suber = if suber_exists { 55 | match self.subscribers.get(&suber_id) { 56 | Some(subscriber) => { 57 | let session_match = subscriber.session_id == session_id; 58 | if !session_match { 59 | self.remove_subscriber(suber_id); 60 | true 61 | } else { 62 | false 63 | } 64 | } 65 | None => { 66 | error!("Subscriber {} exists flag is true but not found in map - data inconsistency", suber_id); 67 | // Treat as if subscriber doesn't exist - require reload 68 | true 69 | } 70 | } 71 | } else { 72 | true 73 | }; 74 | if !suber_exists || require_reload_suber { 75 | self.subscribers.insert( 76 | suber_id, 77 | Subscriber { 78 | session_id, 79 | client: { 80 | if let Ok(client) = RPCClient::new_async(address).await { 81 | AsyncServiceClient::new(&client) 82 | } else { 83 | return Err(()); 84 | } 85 | }, 86 | }, 87 | ); 88 | } 89 | self.suber_subs 90 | .entry(suber_id) 91 | .or_insert_with(|| HashSet::new()) 92 | .insert(sub_id); 93 | self.subscriptions 94 | .entry(key) 95 | .or_insert_with(|| HashSet::new()) 96 | .insert(sub_id); 97 | self.sub_to_key.insert(sub_id, key); 98 | self.sub_suber.insert(sub_id, suber_id); 99 | 100 | self.next_id += 1; 101 | Ok(sub_id) 102 | } 103 | 104 | pub fn remove_subscriber(&mut self, suber_id: u64) { 105 | debug!("Removing subscriber {}", suber_id); 106 | let suber_subs = if let Some(sub_ids) = self.suber_subs.get(&suber_id) { 107 | sub_ids.iter().cloned().collect() 108 | } else { 109 | Vec::::new() 110 | }; 111 | for subs_id in suber_subs { 112 | self.remove_subscription(subs_id) 113 | } 114 | self.subscribers.remove(&suber_id); 115 | self.suber_subs.remove(&suber_id); 116 | } 117 | 118 | pub fn remove_subscription(&mut self, id: u64) { 119 | debug!("Removing subscription {}", id); 120 | let sub_key = self.sub_to_key.remove(&id); 121 | if let Some(sub_key) = sub_key { 122 | if let Some(ref mut sub_subers) = self.subscriptions.get_mut(&sub_key) { 123 | sub_subers.remove(&id); 124 | self.sub_suber.remove(&id); 125 | } 126 | } 127 | } 128 | } 129 | 130 | // used for raft services to subscribe directly from state machine instances 131 | pub struct InternalSubscription { 132 | action: Box, 133 | } 134 | 135 | pub struct SMCallback { 136 | pub subscriptions: Arc>, 137 | pub raft_service: Arc, 138 | pub internal_subs: RwLock>>, 139 | pub sm_id: u64, 140 | } 141 | 142 | #[derive(Debug, Clone, Copy, Serialize, Deserialize)] 143 | pub enum NotifyError { 144 | IsNotLeader, 145 | OpTypeNotSubscribe, 146 | CannotFindSubscription, 147 | CannotFindSubscribers, 148 | CannotFindSubscriber, 149 | CannotCastInternalSub, 150 | } 151 | 152 | impl SMCallback { 153 | pub async fn new(state_machine_id: u64, raft_service: Arc) -> SMCallback { 154 | let meta = raft_service.meta.read().await; 155 | let sm = meta.state_machine.read().await; 156 | let subs = sm.configs.subscriptions.clone(); 157 | SMCallback { 158 | subscriptions: subs, 159 | raft_service: raft_service.clone(), 160 | sm_id: state_machine_id, 161 | internal_subs: RwLock::new(HashMap::new()), 162 | } 163 | } 164 | 165 | pub async fn notify( 166 | &self, 167 | msg: M, 168 | message: R, 169 | ) -> Result<(usize, Vec, Vec>), NotifyError> 170 | where 171 | R: serde::Serialize + Send + Sync + Clone + Any + Unpin + 'static, 172 | M: RaftMsg + 'static, 173 | { 174 | if 
!self.raft_service.is_leader() { 175 | debug!( 176 | "Will not send notification from {} because this node is not a leader", 177 | self.raft_service.get_server_id() 178 | ); 179 | return Err(NotifyError::IsNotLeader); 180 | } 181 | let (fn_id, op_type, pattern_data) = msg.encode(); 182 | return match op_type { 183 | OpType::SUBSCRIBE => { 184 | let pattern_id = hash_bytes(&pattern_data.as_slice()); 185 | let raft_sid = self.raft_service.options.service_id; 186 | let sm_id = self.sm_id; 187 | let key = (raft_sid, sm_id, fn_id, pattern_id); 188 | let internal_subs = self.internal_subs.read().await; 189 | let svr_subs = self.subscriptions.read().await; 190 | debug!( 191 | "Sending notification, func {}, op: {:?}, pattern_id {}", 192 | fn_id, op_type, pattern_id 193 | ); 194 | if let Some(internal_subs) = internal_subs.get(&pattern_id) { 195 | for is in internal_subs { 196 | (is.action)(&message) 197 | } 198 | } else { 199 | trace!("Cannot found internal subs {}", pattern_id); 200 | } 201 | if let Some(sub_ids) = svr_subs.subscriptions.get(&key) { 202 | let sub_result_futs: FuturesUnordered<_> = sub_ids 203 | .iter() 204 | .map(|sub_id| { 205 | let message = Pin::new(&message); 206 | async move { 207 | let svr_subs = self.subscriptions.read().await; 208 | if let Some(subscriber_id) = svr_subs.sub_suber.get(&sub_id) { 209 | if let Some(subscriber) = 210 | svr_subs.subscribers.get(&subscriber_id) 211 | { 212 | let data = crate::utils::serde::serialize(&*message); 213 | let client = &subscriber.client; 214 | debug!( 215 | "Sending out callback notification to sub id {}", 216 | sub_id 217 | ); 218 | let client_result = client.notify(key, &data).await; 219 | Ok(client_result) 220 | } else { 221 | Err(NotifyError::CannotFindSubscriber) 222 | } 223 | } else { 224 | Err(NotifyError::CannotFindSubscribers) 225 | } 226 | } 227 | }) 228 | .collect(); 229 | let sub_result: Vec<_> = sub_result_futs.collect().await; 230 | let errors: Vec = sub_result 231 | .iter() 232 | .filter_map(|r| { 233 | if let Err(e) = r { 234 | Some(e.clone()) 235 | } else { 236 | None 237 | } 238 | }) 239 | .collect(); 240 | let response: Vec<_> = sub_result 241 | .into_iter() 242 | .filter_map(|r| { 243 | if let Ok(value) = r { 244 | Some(value) 245 | } else { 246 | None 247 | } 248 | }) 249 | .collect(); 250 | Ok((sub_ids.len(), errors, response)) 251 | } else { 252 | Err(NotifyError::CannotFindSubscription) 253 | } 254 | } 255 | _ => Err(NotifyError::OpTypeNotSubscribe), 256 | }; 257 | } 258 | pub async fn internal_subscribe(&self, msg: M, trigger: F) -> Result<(), NotifyError> 259 | where 260 | M: RaftMsg, 261 | F: Fn(&R) + Sync + Send + 'static, 262 | R: 'static, 263 | { 264 | let (_, op_type, pattern_data) = msg.encode(); 265 | match op_type { 266 | OpType::SUBSCRIBE => { 267 | let pattern_id = hash_bytes(&pattern_data.as_slice()); 268 | let mut internal_subs = self.internal_subs.write().await; 269 | internal_subs 270 | .entry(pattern_id) 271 | .or_insert_with(|| Vec::new()) 272 | .push(InternalSubscription { 273 | action: Box::new(move |any: &dyn Any| match any.downcast_ref::() { 274 | Some(r) => trigger(r), 275 | None => warn!("type mismatch in internal subscription"), 276 | }), 277 | }); 278 | Ok(()) 279 | } 280 | _ => Err(NotifyError::OpTypeNotSubscribe), 281 | } 282 | } 283 | } 284 | 285 | pub async fn notify(callback: &Option, msg: M, data: F) 286 | where 287 | F: FnOnce() -> R, 288 | M: RaftMsg + Send + 'static, 289 | R: serde::Serialize + Send + Sync + Clone + Unpin + Any + 'static, 290 | { 291 | if let Some(ref 
284 | 
285 | pub async fn notify<M, R, F>(callback: &Option<SMCallback>, msg: M, data: F)
286 | where
287 |     F: FnOnce() -> R,
288 |     M: RaftMsg<R> + Send + 'static,
289 |     R: serde::Serialize + Send + Sync + Clone + Unpin + Any + 'static,
290 | {
291 |     if let Some(ref callback) = *callback {
292 |         match callback.notify(msg, data()).await {
293 |             Ok(_) | Err(NotifyError::IsNotLeader) => {}
294 |             Err(e) => warn!(
295 |                 "Cannot send notification, failed after call due to: {:?}",
296 |                 e
297 |             ),
298 |         }
299 |     } else {
300 |         warn!("Cannot send notification, callback handler is empty");
301 |     }
302 | }
303 | 
--------------------------------------------------------------------------------
/SNAPSHOT_GUIDE.md:
--------------------------------------------------------------------------------
1 | # Snapshot, Checkpointing, and Recovery Guide
2 | 
3 | ## Overview
4 | 
5 | Bifrost's Raft implementation now includes production-ready snapshot, checkpointing, and recovery functionality. This prevents unbounded memory growth and enables fast recovery after restarts.
6 | 
7 | ## Features
8 | 
9 | ✅ **Automatic Snapshot Creation**: Triggered by configurable log count thresholds
10 | ✅ **Persistent Storage**: Atomic writes with CRC32 corruption detection
11 | ✅ **Crash Recovery**: Automatically loads snapshots on restart
12 | ✅ **Log Compaction**: Removes old logs from memory after snapshots
13 | ✅ **Follower Catch-up**: Automatically sends snapshots to lagging nodes
14 | ✅ **Corruption Handling**: Graceful fallback when snapshot files are corrupted
15 | 
16 | ## Quick Start
17 | 
18 | ### 1. Basic Setup with Snapshots
19 | 
20 | ```rust
21 | use bifrost::raft::{RaftService, Options, Storage, DEFAULT_SERVICE_ID};
22 | use bifrost::raft::disk::DiskOptions;
23 | use bifrost::rpc::Server;
24 | 
25 | #[tokio::main]
26 | async fn main() {
27 |     // Create Raft service with disk storage
28 |     let service = RaftService::new(Options {
29 |         storage: Storage::DISK(DiskOptions::new("/var/lib/myapp/raft".to_string())),
30 |         address: "127.0.0.1:5000".to_string(),
31 |         service_id: DEFAULT_SERVICE_ID,
32 |     });
33 | 
34 |     let server = Server::new(&"127.0.0.1:5000".to_string());
35 |     server.register_service(&service).await;
36 |     Server::listen_and_resume(&server).await;
37 | 
38 |     // Start automatically recovers from snapshot if it exists!
39 |     RaftService::start(&service, false).await;
40 | 
41 |     // Bootstrap or join cluster
42 |     service.bootstrap().await;
43 |     // OR: service.join(&vec!["existing-node:5000".to_string()]).await;
44 | }
45 | ```
46 | 
47 | ### 2. Custom Configuration
48 | 
49 | ```rust
50 | use bifrost::raft::disk::DiskOptions;
51 | 
52 | let custom_opts = DiskOptions {
53 |     path: "/data/raft".to_string(),
54 |     take_snapshots: true,            // Enable snapshots
55 |     append_logs: true,               // Enable log persistence
56 |     trim_logs: true,                 // Enable log trimming
57 |     snapshot_log_threshold: 5000,    // Snapshot every 5000 applied logs
58 |     log_compaction_threshold: 10000, // Compact when > 10000 logs
59 | };
60 | 
61 | let service = RaftService::new(Options {
62 |     storage: Storage::DISK(custom_opts),
63 |     address: "127.0.0.1:5000".to_string(),
64 |     service_id: DEFAULT_SERVICE_ID,
65 | });
66 | ```
67 | 
68 | ## How It Works
69 | 
70 | ### Automatic Snapshot Creation
71 | 
72 | **When**: After the leader applies `snapshot_log_threshold` logs since the last snapshot
73 | 
74 | **What happens**:
75 | 1. Leader generates snapshot from all state machines
76 | 2. Persists snapshot to disk with CRC32 checksum
77 | 3. Updates snapshot metadata (index, term)
78 | 4. Compacts old logs from memory (if > compaction_threshold)
79 | 
80 | ```rust
81 | // Triggered automatically in try_sync_log_to_followers()
82 | // After successfully committing logs to followers
83 | if should_take_snapshot() {
84 |     take_snapshot(); // Generates, persists, and compacts
85 | }
86 | ```
87 | 
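The trigger itself is just a counter comparison. A minimal sketch of the predicate (names and parameters are illustrative, not bifrost's actual internals):

```rust
// Illustrative only: snapshot once enough logs have been applied
// since the last snapshot was taken.
fn should_take_snapshot(last_applied: u64, last_snapshot_index: u64, threshold: u64) -> bool {
    last_applied.saturating_sub(last_snapshot_index) >= threshold
}
```
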
88 | ### Startup Recovery
89 | 
90 | **When**: Every time `RaftService::start()` is called
91 | 
92 | **What happens**:
93 | 1. Checks if snapshot file exists on disk
94 | 2. Validates CRC32 checksum
95 | 3. Deserializes snapshot data
96 | 4. Calls `state_machine.recover(snapshot_data)`
97 | 5. Updates indices and metadata
98 | 6. Compacts logs already covered by snapshot
99 | 7. Continues normal operation
100 | 
101 | ```rust
102 | // Automatically called in RaftService::start()
103 | load_snapshot_on_startup();
104 | ```
105 | 
106 | ### Follower Catch-up with Snapshots
107 | 
108 | **When**: A follower needs logs that the leader has already compacted
109 | 
110 | **Scenarios**:
111 | - New node joining the cluster
112 | - Node was offline during log compaction
113 | - Node is too slow and fell far behind
114 | 
115 | **What happens**:
116 | 1. Leader detects: `follower.next_index <= leader.last_snapshot_index`
117 | 2. Leader generates snapshot from state machines
118 | 3. Leader sends via `install_snapshot` RPC
119 | 4. Follower receives snapshot
120 | 5. Follower recovers state machine
121 | 6. Follower persists snapshot to disk
122 | 7. Follower compacts old logs
123 | 8. Follower continues with normal log replication
124 | 
125 | ```rust
126 | // In send_follower_heartbeat()
127 | if follower.next_index <= last_snapshot_index {
128 |     // Follower needs compacted logs - send snapshot
129 |     let snapshot = master_sm.snapshot().unwrap();
130 |     rpc.install_snapshot(
131 |         term,
132 |         leader_id,
133 |         last_snapshot_index,
134 |         last_snapshot_term,
135 |         snapshot
136 |     ).await;
137 | }
138 | ```
139 | 
140 | ## Implementing Snapshotable State Machines
141 | 
142 | Your state machines must implement `snapshot()` and `recover()`:
143 | 
144 | ```rust
145 | use bifrost::raft::state_machine::StateMachineCtl;
146 | use futures::future::BoxFuture;
147 | use serde::{Serialize, Deserialize};
148 | use std::collections::HashMap;
149 | 
150 | #[derive(Serialize, Deserialize)]
151 | struct MyState {
152 |     counter: i64,
153 |     data: HashMap<String, String>,
154 | }
155 | 
156 | struct MyStateMachine {
157 |     state: MyState,
158 | }
159 | 
160 | impl StateMachineCtl for MyStateMachine {
161 |     fn id(&self) -> u64 { 42 }
162 | 
163 |     fn snapshot(&self) -> Option<Vec<u8>> {
164 |         // Serialize your entire state
165 |         let data = bincode::serialize(&self.state).ok()?;
166 |         Some(data)
167 |     }
168 | 
169 |     fn recover(&mut self, data: Vec<u8>) -> BoxFuture<()> {
170 |         // Deserialize and restore state
171 |         if !data.is_empty() {
172 |             if let Ok(state) = bincode::deserialize(&data) {
173 |                 self.state = state;
174 |                 println!("State machine recovered: counter={}", self.state.counter);
175 |             }
176 |         }
177 |         Box::pin(async {})
178 |     }
179 | 
180 |     // ... command and query handlers ...
181 | }
182 | ```
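A correctly implemented pair must round-trip: feeding `snapshot()`'s output back through `recover()` rebuilds identical state. A quick check you can adapt (hypothetical test code, reusing the `MyStateMachine` example above):

```rust
#[tokio::test]
async fn snapshot_round_trips() {
    let sm = MyStateMachine { state: MyState { counter: 7, data: HashMap::new() } };
    let snap = sm.snapshot().expect("state should serialize");

    let mut restored = MyStateMachine { state: MyState { counter: 0, data: HashMap::new() } };
    restored.recover(snap).await; // recover() returns a BoxFuture, so await it
    assert_eq!(restored.state.counter, 7);
}
```
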
183 | 
184 | ## File Layout
185 | 
186 | When using disk storage, the following files are created:
187 | 
188 | ```
189 | /var/lib/myapp/raft/
190 | ├── log.dat          # Persisted Raft log entries
191 | ├── snapshot.dat     # Latest snapshot with CRC32
192 | └── snapshot.dat.tmp # Temporary file during writes (atomic)
193 | ```
194 | 
195 | ### Snapshot File Format
196 | 
197 | ```
198 | [4 bytes] CRC32 checksum
199 | [8 bytes] Data length
200 | [N bytes] Serialized SnapshotEntity:
201 |   {
202 |     last_included_index: u64,
203 |     last_included_term: u64,
204 |     snapshot: Vec<u8>  // Serialized state machine data
205 |   }
206 | ```
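All integers are little-endian, matching the writer in `src/raft/disk.rs`. A minimal sketch of parsing this framing (illustrative, not the actual implementation; it uses the same `crc32fast` crate):

```rust
use std::convert::TryInto;

// Returns the serialized SnapshotEntity bytes, or None if the buffer is
// truncated or fails the checksum.
fn parse_snapshot_file(buf: &[u8]) -> Option<&[u8]> {
    let crc = u32::from_le_bytes(buf.get(0..4)?.try_into().ok()?);
    let len = u64::from_le_bytes(buf.get(4..12)?.try_into().ok()?) as usize;
    let data = buf.get(12..12 + len)?;
    (crc32fast::hash(data) == crc).then_some(data)
}
```
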
207 | 
208 | ## Monitoring
209 | 
210 | ### Check Snapshot Status
211 | 
212 | ```rust
213 | let meta = service.read_meta().await;
214 | println!("Last snapshot: index={}, term={}",
215 |     meta.last_snapshot_index,
216 |     meta.last_snapshot_term);
217 | 
218 | let num_logs = service.num_logs().await;
219 | println!("Logs in memory: {}", num_logs);
220 | ```
221 | 
222 | ### Manually Trigger Snapshot (Advanced)
223 | 
224 | ```rust
225 | // Normally automatic, but you can manually trigger:
226 | let mut meta = service.write_meta().await;
227 | service.take_snapshot(&mut meta).await;
228 | ```
229 | 
230 | ## Safety Guarantees
231 | 
232 | ### 1. **Crash Safety**
233 | - Atomic writes using temp file + rename pattern
234 | - If process crashes during snapshot write, old snapshot remains intact
235 | 
236 | ### 2. **Corruption Detection**
237 | - CRC32 checksum on all snapshots
238 | - Corrupted snapshots are detected and ignored
239 | - System falls back to log-based recovery
240 | 
241 | ### 3. **Raft Correctness**
242 | - Snapshots track correct term and index
243 | - No safety violations from compaction
244 | - Follows Raft paper specifications
245 | 
246 | ### 4. **Consistency**
247 | - Followers always get consistent state via snapshots
248 | - State machine recovery is deterministic
249 | - All nodes eventually converge to same state
250 | 
251 | ## Configuration Recommendations
252 | 
253 | ### Small Applications (< 1000 ops/sec)
254 | ```rust
255 | snapshot_log_threshold: 1000,
256 | log_compaction_threshold: 2000,
257 | ```
258 | 
259 | ### Medium Applications (1000-10000 ops/sec)
260 | ```rust
261 | snapshot_log_threshold: 5000,
262 | log_compaction_threshold: 10000,
263 | ```
264 | 
265 | ### Large Applications (> 10000 ops/sec)
266 | ```rust
267 | snapshot_log_threshold: 10000,
268 | log_compaction_threshold: 20000,
269 | ```
270 | 
271 | ### Memory-Constrained Systems
272 | ```rust
273 | snapshot_log_threshold: 500,     // Snapshot more frequently
274 | log_compaction_threshold: 1000,  // Compact aggressively
275 | ```
276 | 
277 | ## Troubleshooting
278 | 
279 | ### Issue: Logs keep growing
280 | **Solution**: Check that `take_snapshots: true` and thresholds are set appropriately
281 | 
282 | ### Issue: Snapshot file not created
283 | **Solution**:
284 | - Verify disk permissions on path
285 | - Ensure state machines implement `snapshot()` correctly
286 | - Check logs for error messages
287 | 
288 | ### Issue: Follower doesn't catch up
289 | **Solution**:
290 | - Check network connectivity
291 | - Verify `install_snapshot` RPC is working
292 | - Check follower logs for error messages
293 | 
294 | ### Issue: Corrupted snapshot detected
295 | **Solution**:
296 | - Delete corrupted file, server will recover from logs
297 | - Investigate disk issues
298 | - Check for process crashes during snapshot writes
299 | 
300 | ## Performance Considerations
301 | 
302 | ### Snapshot Creation Cost
303 | - **Time**: O(state_size) to serialize state
304 | - **Disk I/O**: One sequential write
305 | - **Memory**: Temporary copy of state during serialization
306 | 
307 | ### Log Compaction Cost
308 | - **Time**: O(logs_to_remove) to filter BTreeMap
309 | - **Memory**: Immediate reduction after compaction
310 | 
311 | ### Recovery Cost
312 | - **Time**: O(state_size) to deserialize snapshot + O(remaining_logs)
313 | - **Disk I/O**: One sequential read
314 | 
315 | ## Testing
316 | 
317 | Run all snapshot tests:
318 | 
319 | ```bash
320 | cargo test --lib test_snapshot test_log_compaction test_state_machine_snapshot test_install
321 | ```
322 | 
323 | Individual tests:
324 | - `test_snapshot_write_and_read` - I/O functionality
325 | - `test_snapshot_corruption_detection` - CRC validation
326 | - `test_log_compaction_removes_old_logs` - Memory reduction
327 | - `test_snapshot_threshold_configuration` - Threshold logic
328 | - `test_state_machine_snapshot_and_recovery` - SM serialization
329 | - `test_install_snapshot_compacts_logs` - Follower catch-up
330 | - `snapshot_disk_persistence` - End-to-end persistence
331 | - `snapshot_persistence_and_recovery` - Full recovery cycle
332 | 
333 | ## Example: Multi-Server Deployment
334 | 
335 | ```rust
336 | // server1.rs (Leader)
337 | let service = RaftService::new(Options {
338 |     storage: Storage::DISK(DiskOptions::new("/data/node1".to_string())),
339 |     address: "10.0.1.10:5000".to_string(),
340 |     service_id: DEFAULT_SERVICE_ID,
341 | });
342 | // ... setup ...
342 | service.bootstrap().await; 343 | 344 | // server2.rs (Follower) 345 | let service = RaftService::new(Options { 346 | storage: Storage::DISK(DiskOptions::new("/data/node2".to_string())), 347 | address: "10.0.1.11:5000".to_string(), 348 | service_id: DEFAULT_SERVICE_ID, 349 | }); 350 | // ... setup ... 351 | service.join(&vec!["10.0.1.10:5000".to_string()]).await; 352 | 353 | // server3.rs (New node joining later - will get snapshot automatically!) 354 | let service = RaftService::new(Options { 355 | storage: Storage::DISK(DiskOptions::new("/data/node3".to_string())), 356 | address: "10.0.1.12:5000".to_string(), 357 | service_id: DEFAULT_SERVICE_ID, 358 | }); 359 | // ... setup ... 360 | service.join(&vec!["10.0.1.10:5000".to_string()]).await; 361 | // ✅ Will automatically receive snapshot if logs are compacted! 362 | ``` 363 | 364 | ## Summary 365 | 366 | The Raft framework now has **industrial-grade snapshot and recovery capabilities**: 367 | 368 | - ✅ Automatic snapshot creation based on thresholds 369 | - ✅ Crash-safe atomic writes to disk 370 | - ✅ Automatic recovery on restart 371 | - ✅ Log compaction to prevent memory leaks 372 | - ✅ Automatic snapshot transfer to lagging/new nodes 373 | - ✅ Corruption detection and handling 374 | - ✅ Fully tested with comprehensive test suite 375 | 376 | New nodes joining the cluster automatically receive snapshots if they're too far behind - no manual intervention needed! 377 | 378 | -------------------------------------------------------------------------------- /tests/graceful_shutdown_tests.rs: -------------------------------------------------------------------------------- 1 | /// Tests for graceful shutdown functionality 2 | /// 3 | /// These tests verify that: 4 | /// 1. Servers actually shut down when shutdown() is called 5 | /// 2. Ports/addresses are released and can be reused 6 | /// 3. 
Background tasks stop cleanly 7 | 8 | use bifrost::raft::{RaftService, Options, Storage, DEFAULT_SERVICE_ID}; 9 | use bifrost::rpc::Server; 10 | use bifrost::tcp; 11 | use std::sync::Arc; 12 | use std::time::Duration; 13 | use tokio::net::TcpStream; 14 | use tokio::time::{sleep, timeout}; 15 | 16 | /// Test that TCP server releases the port after shutdown 17 | #[tokio::test(flavor = "multi_thread")] 18 | async fn test_tcp_server_shutdown_releases_port() { 19 | let _ = env_logger::try_init(); 20 | let address = "127.0.0.1:19001".to_string(); 21 | 22 | // Start first TCP server 23 | let tcp_server = Arc::new(tcp::server::Server::new()); 24 | let tcp_server_clone = tcp_server.clone(); 25 | let addr_clone = address.clone(); 26 | 27 | let handle = tokio::spawn(async move { 28 | tcp_server_clone 29 | .listen( 30 | &addr_clone, 31 | Arc::new(|data| { 32 | Box::pin(async move { data }) 33 | }), 34 | ) 35 | .await 36 | .unwrap(); 37 | }); 38 | 39 | // Give it time to bind 40 | sleep(Duration::from_millis(500)).await; 41 | 42 | // Verify server is listening by connecting to it 43 | let connect_result = timeout( 44 | Duration::from_secs(2), 45 | TcpStream::connect(&address) 46 | ).await; 47 | assert!(connect_result.is_ok(), "Should be able to connect to server"); 48 | 49 | // Shutdown the server 50 | tcp_server.shutdown(); 51 | 52 | // Wait for shutdown to complete 53 | let shutdown_result = timeout(Duration::from_secs(5), handle).await; 54 | assert!(shutdown_result.is_ok(), "Server should shut down within 5 seconds"); 55 | 56 | // Give a moment for the OS to release the port 57 | sleep(Duration::from_millis(500)).await; 58 | 59 | // Verify we can start a new server on the same port 60 | let tcp_server2 = Arc::new(tcp::server::Server::new()); 61 | let tcp_server2_clone = tcp_server2.clone(); 62 | let addr_clone2 = address.clone(); 63 | 64 | let handle2 = tokio::spawn(async move { 65 | let result = tcp_server2_clone 66 | .listen( 67 | &addr_clone2, 68 | Arc::new(|data| { 69 | Box::pin(async move { data }) 70 | }), 71 | ) 72 | .await; 73 | assert!(result.is_ok(), "Should be able to bind to the same port after shutdown"); 74 | }); 75 | 76 | // Give it time to bind 77 | sleep(Duration::from_millis(500)).await; 78 | 79 | // Verify second server is listening 80 | let connect_result2 = timeout( 81 | Duration::from_secs(2), 82 | TcpStream::connect(&address) 83 | ).await; 84 | assert!(connect_result2.is_ok(), "Should be able to connect to new server on same port"); 85 | 86 | // Cleanup 87 | tcp_server2.shutdown(); 88 | let _ = timeout(Duration::from_secs(5), handle2).await; 89 | } 90 | 91 | /// Test that RPC server releases the port after shutdown 92 | #[tokio::test(flavor = "multi_thread")] 93 | async fn test_rpc_server_shutdown_releases_port() { 94 | let _ = env_logger::try_init(); 95 | let address = "127.0.0.1:19002".to_string(); 96 | 97 | // Start first RPC server 98 | let server1 = Server::new(&address); 99 | Server::listen_and_resume(&server1).await; 100 | 101 | // Verify server is listening 102 | let connect_result = timeout( 103 | Duration::from_secs(2), 104 | TcpStream::connect(&address) 105 | ).await; 106 | assert!(connect_result.is_ok(), "Should be able to connect to RPC server"); 107 | 108 | // Shutdown the server 109 | server1.shutdown().await; 110 | 111 | // Give time for shutdown to complete and port to be released 112 | sleep(Duration::from_millis(1000)).await; 113 | 114 | // Verify we can start a new server on the same port 115 | let server2 = Server::new(&address); 116 | 
Server::listen_and_resume(&server2).await; 117 | 118 | // Verify second server is listening 119 | let connect_result2 = timeout( 120 | Duration::from_secs(2), 121 | TcpStream::connect(&address) 122 | ).await; 123 | assert!(connect_result2.is_ok(), "Should be able to connect to new RPC server on same port"); 124 | 125 | // Cleanup 126 | server2.shutdown().await; 127 | sleep(Duration::from_millis(500)).await; 128 | } 129 | 130 | /// Test that Raft service stops its background tasks after shutdown 131 | /// Note: Ignored because RaftService contains a nested tokio runtime which cannot 132 | /// be safely dropped within another tokio test runtime context. 133 | /// The full_stack_shutdown test covers Raft shutdown in a working configuration. 134 | #[tokio::test(flavor = "multi_thread")] 135 | #[ignore = "RaftService nested runtime causes drop issues in test context"] 136 | async fn test_raft_service_shutdown_stops_tasks() { 137 | let _ = env_logger::try_init(); 138 | let address = "127.0.0.1:19003".to_string(); 139 | 140 | // Use scope to ensure proper cleanup 141 | { 142 | // Create and start Raft service 143 | let raft_service = RaftService::new(Options { 144 | storage: Storage::MEMORY, 145 | address: address.clone(), 146 | service_id: DEFAULT_SERVICE_ID, 147 | }); 148 | 149 | // Give initialization more time 150 | sleep(Duration::from_millis(100)).await; 151 | 152 | let started = RaftService::start(&raft_service, false).await; 153 | if !started { 154 | println!("Warning: Raft service failed to start, skipping test"); 155 | return; // Skip this test if it fails to start 156 | } 157 | 158 | // Bootstrap the cluster 159 | raft_service.bootstrap().await; 160 | 161 | // Give it time to run and stabilize 162 | sleep(Duration::from_millis(1000)).await; 163 | 164 | // Verify service is running by checking leader status 165 | assert!(raft_service.is_leader(), "Should be leader after bootstrap"); 166 | 167 | // Shutdown the service 168 | let shutdown_start = std::time::Instant::now(); 169 | raft_service.shutdown().await; 170 | let shutdown_duration = shutdown_start.elapsed(); 171 | 172 | // Verify shutdown completed in reasonable time (< 5 seconds) 173 | assert!( 174 | shutdown_duration < Duration::from_secs(5), 175 | "Shutdown should complete within 5 seconds, took {:?}", 176 | shutdown_duration 177 | ); 178 | 179 | // Verify service is no longer leader (membership should be Offline) 180 | assert!(!raft_service.is_leader(), "Should not be leader after shutdown"); 181 | } // raft_service drops here 182 | 183 | println!("Test completed successfully"); 184 | } 185 | 186 | /// Full integration test: Start everything, shutdown, verify port is released 187 | /// Note: This test uses scoped drops to avoid runtime drop issues 188 | #[tokio::test(flavor = "multi_thread")] 189 | async fn test_full_stack_shutdown_releases_port() { 190 | let _ = env_logger::try_init(); 191 | let address = "127.0.0.1:19004".to_string(); 192 | 193 | // Scope 1: Create and start full stack, then shut it down 194 | { 195 | let raft_service = RaftService::new(Options { 196 | storage: Storage::MEMORY, 197 | address: address.clone(), 198 | service_id: DEFAULT_SERVICE_ID, 199 | }); 200 | 201 | let server = Server::new(&address); 202 | Server::listen_and_resume(&server).await; 203 | server.register_service(&raft_service).await; 204 | 205 | let started = RaftService::start(&raft_service, false).await; 206 | assert!(started, "Raft service should start"); 207 | 208 | raft_service.bootstrap().await; 209 | 210 | // Verify everything is 
running 211 | sleep(Duration::from_millis(500)).await; 212 | let connect_result = timeout( 213 | Duration::from_secs(2), 214 | TcpStream::connect(&address) 215 | ).await; 216 | assert!(connect_result.is_ok(), "Should be able to connect to server"); 217 | assert!(raft_service.is_leader(), "Should be leader"); 218 | 219 | // Shutdown in reverse order (service first, then server) 220 | println!("Shutting down Raft service..."); 221 | raft_service.shutdown().await; 222 | 223 | println!("Shutting down RPC server..."); 224 | server.shutdown().await; 225 | 226 | // Give time for everything to shut down 227 | sleep(Duration::from_millis(1000)).await; 228 | } // raft_service and server drop here 229 | 230 | // Give OS time to fully release the port 231 | sleep(Duration::from_millis(500)).await; 232 | 233 | // Scope 2: Start new server on same port to verify it's released 234 | { 235 | println!("Starting new server on same port..."); 236 | let server2 = Server::new(&address); 237 | Server::listen_and_resume(&server2).await; 238 | 239 | // Verify new server is listening 240 | sleep(Duration::from_millis(500)).await; 241 | let connect_result2 = timeout( 242 | Duration::from_secs(2), 243 | TcpStream::connect(&address) 244 | ).await; 245 | assert!(connect_result2.is_ok(), "Should be able to connect to new server on same port"); 246 | 247 | // Cleanup 248 | server2.shutdown().await; 249 | sleep(Duration::from_millis(500)).await; 250 | } // server2 drops here 251 | 252 | println!("Test completed successfully"); 253 | } 254 | 255 | /// Test multiple rapid shutdown/restart cycles 256 | #[tokio::test(flavor = "multi_thread")] 257 | async fn test_rapid_shutdown_restart_cycles() { 258 | let _ = env_logger::try_init(); 259 | let address = "127.0.0.1:19005".to_string(); 260 | 261 | for i in 0..3 { 262 | println!("Cycle {}", i + 1); 263 | 264 | // Start server 265 | let server = Server::new(&address); 266 | Server::listen_and_resume(&server).await; 267 | 268 | // Verify it's listening 269 | sleep(Duration::from_millis(300)).await; 270 | let connect_result = timeout( 271 | Duration::from_secs(2), 272 | TcpStream::connect(&address) 273 | ).await; 274 | assert!( 275 | connect_result.is_ok(), 276 | "Cycle {}: Should be able to connect", 277 | i + 1 278 | ); 279 | 280 | // Shutdown 281 | server.shutdown().await; 282 | sleep(Duration::from_millis(500)).await; 283 | } 284 | 285 | println!("All cycles completed successfully"); 286 | } 287 | 288 | /// Test that connections are closed cleanly during shutdown 289 | #[tokio::test(flavor = "multi_thread")] 290 | async fn test_active_connections_close_on_shutdown() { 291 | let _ = env_logger::try_init(); 292 | let address = "127.0.0.1:19006".to_string(); 293 | 294 | // Start server 295 | let server = Server::new(&address); 296 | Server::listen_and_resume(&server).await; 297 | sleep(Duration::from_millis(300)).await; 298 | 299 | // Open multiple connections 300 | let mut connections = Vec::new(); 301 | for _ in 0..5 { 302 | let stream = TcpStream::connect(&address).await; 303 | assert!(stream.is_ok(), "Should be able to connect"); 304 | connections.push(stream.unwrap()); 305 | } 306 | 307 | println!("Opened {} connections", connections.len()); 308 | 309 | // Shutdown server 310 | server.shutdown().await; 311 | 312 | // Give a moment for shutdown to propagate 313 | sleep(Duration::from_millis(500)).await; 314 | 315 | // Verify we cannot open new connections 316 | let new_connect = timeout( 317 | Duration::from_secs(1), 318 | TcpStream::connect(&address) 319 | ).await; 320 | 
assert!(
321 |         new_connect.is_err() || new_connect.unwrap().is_err(),
322 |         "Should not be able to connect after shutdown"
323 |     );
324 | 
325 |     println!("Verified server is no longer accepting connections");
326 | }
327 | 
328 | /// Test shutdown timeout behavior
329 | #[tokio::test(flavor = "multi_thread")]
330 | async fn test_shutdown_completes_within_timeout() {
331 |     let _ = env_logger::try_init();
332 |     let address = "127.0.0.1:19007".to_string();
333 | 
334 |     // Create full stack
335 |     let raft_service = RaftService::new(Options {
336 |         storage: Storage::MEMORY,
337 |         address: address.clone(),
338 |         service_id: DEFAULT_SERVICE_ID,
339 |     });
340 | 
341 |     let server = Server::new(&address);
342 |     Server::listen_and_resume(&server).await;
343 |     server.register_service(&raft_service).await;
344 |     RaftService::start(&raft_service, false).await;
345 |     raft_service.bootstrap().await;
346 | 
347 |     sleep(Duration::from_millis(500)).await;
348 | 
349 |     // Shutdown with timeout
350 |     let shutdown_result = timeout(Duration::from_secs(10), async {
351 |         raft_service.shutdown().await;
352 |         server.shutdown().await;
353 |     }).await;
354 | 
355 |     assert!(
356 |         shutdown_result.is_ok(),
357 |         "Shutdown should complete within 10 seconds"
358 |     );
359 | }
360 | 
361 | 
--------------------------------------------------------------------------------
/src/vector_clock/mod.rs:
--------------------------------------------------------------------------------
1 | use bifrost_hasher::hash_str;
2 | use parking_lot::RwLock;
3 | use serde::{Deserialize, Serialize};
4 | use std::cmp::Ordering;
5 | 
6 | #[derive(Serialize, Deserialize, Debug, Clone, Eq, PartialEq, Hash)]
7 | pub enum Relation {
8 |     Equal,
9 |     Before,
10 |     After,
11 |     Concurrent,
12 | }
13 | 
14 | #[derive(Serialize, Deserialize, Debug, Clone, Eq, Hash)]
15 | pub struct VectorClock<S: Ord + Copy> {
16 |     map: Vec<(S, u64)>,
17 | }
18 | 
19 | impl<S: Ord + Copy> PartialOrd for VectorClock<S> {
20 |     fn partial_cmp(&self, other: &VectorClock<S>) -> Option<Ordering> {
21 |         let rel = self.relation(other);
22 |         match rel {
23 |             Relation::Before => Some(Ordering::Less),
24 |             Relation::After => Some(Ordering::Greater),
25 |             Relation::Equal => Some(Ordering::Equal),
26 |             Relation::Concurrent => None,
27 |         }
28 |     }
29 | }
30 | 
31 | impl<S: Ord + Copy> Ord for VectorClock<S> {
32 |     fn cmp(&self, other: &Self) -> Ordering {
33 |         let rel = self.relation(other);
34 |         match rel {
35 |             Relation::Before => Ordering::Less,
36 |             Relation::After => Ordering::Greater,
37 |             _ => Ordering::Equal, // not justified, but sufficient for BTreeSet data structure
38 |         }
39 |     }
40 | }
41 | 
42 | impl<S: Ord + Copy> PartialEq for VectorClock<S> {
43 |     fn eq(&self, other: &VectorClock<S>) -> bool {
44 |         let rel = self.relation(other);
45 |         rel == Relation::Equal
46 |     }
47 | }
48 | 
49 | impl<S: Ord + Copy> VectorClock<S> {
50 |     pub fn new() -> VectorClock<S> {
51 |         VectorClock { map: vec![] }
52 |     }
53 | 
54 |     pub fn from_vec(vec: Vec<(S, u64)>) -> Self {
55 |         Self { map: vec } // assumes `vec` is already sorted by key; `inc` relies on binary search
56 |     }
57 | 
58 |     pub fn inc(&mut self, server: S) {
59 |         let idx = self.map.binary_search_by_key(&server, |(k, _)| *k);
60 |         match idx {
61 |             Ok(idx) => {
62 |                 self.map[idx].1 += 1;
63 |             }
64 |             Err(idx) => {
65 |                 self.map.insert(idx, (server, 1));
66 |             }
67 |         }
68 |     }
69 | 
70 |     pub fn happened_before(&self, clock_b: &VectorClock<S>) -> bool {
71 |         let mut ai = 0;
72 |         let mut bi = 0;
73 |         let al = self.map.len();
74 |         let bl = clock_b.map.len();
75 |         if al == 0 {
76 |             return clock_b.map.iter().any(|(_, n)| *n > 0);
77 |         }
78 |         if bl == 0 {
79 |             return false;
80 |         }
81 |         let mut a_lt_b = false;
82 |         let mut b_lt_a = false;
83 |         while ai < al && bi < bl {
84 |             let (ak, an) = &self.map[ai];
85 |             let (bk, bn) = &clock_b.map[bi];
86 |             if ak == bk {
87 |                 // Both clocks have this key, compare the counters
88 |                 ai += 1;
89 |                 bi += 1;
90 |                 if *an < *bn {
91 |                     a_lt_b = true;
92 |                 } else if *an > *bn {
93 |                     b_lt_a = true;
94 |                 }
95 |             } else if ak > bk {
96 |                 // Clock b has a server that a does not have
97 |                 bi += 1;
98 |             } else if ak < bk {
99 |                 // Clock a has a server that b does not have
100 |                 ai += 1;
101 |             } else {
102 |                 unreachable!();
103 |             }
104 |         }
105 |         return a_lt_b && !b_lt_a;
106 |     }
107 | 
108 |     pub fn equals(&self, clock_b: &VectorClock<S>) -> bool {
109 |         let al = self.map.len();
110 |         let bl = clock_b.map.len();
111 |         if al == 0 && al == bl {
112 |             return true;
113 |         }
114 |         if al != bl {
115 |             if al == 0 {
116 |                 return clock_b.map.iter().all(|(_, n)| *n == 0);
117 |             }
118 |             if bl == 0 {
119 |                 return self.map.iter().all(|(_, n)| *n == 0);
120 |             }
121 |         }
122 |         let mut ai = 0;
123 |         let mut bi = 0;
124 |         let mut a_eq_b = false;
125 |         while ai < al && bi < bl {
126 |             let (ak, an) = &self.map[ai];
127 |             let (bk, bn) = &clock_b.map[bi];
128 |             if ak == bk {
129 |                 // Both clocks have this key, compare the counters
130 |                 if an != bn {
131 |                     return false;
132 |                 }
133 |                 a_eq_b = true;
134 |                 ai += 1;
135 |                 bi += 1;
136 |             } else if ak > bk {
137 |                 // Clock b has a server that a does not have;
138 |                 // b must be equal to or happen after a
139 |                 bi += 1;
140 |             } else if ak < bk {
141 |                 // Clock a has a server that b does not have
142 |                 ai += 1;
143 |             } else {
144 |                 unreachable!();
145 |             }
146 |         }
147 |         return a_eq_b;
148 |     }
149 | 
150 |     pub fn relation(&self, clock_b: &VectorClock<S>) -> Relation {
151 |         if self.equals(clock_b) {
152 |             return Relation::Equal;
153 |         }
154 |         if self.happened_before(clock_b) {
155 |             return Relation::Before;
156 |         }
157 |         if clock_b.happened_before(self) {
158 |             return Relation::After;
159 |         }
160 |         return Relation::Concurrent;
161 |     }
162 | 
163 |     pub fn merge_with(&mut self, clock_b: &VectorClock<S>) {
164 |         // merge_with advances this clock to the element-wise maximum of both clocks
165 |         let mut ai = 0;
166 |         let mut bi = 0;
167 |         let al = self.map.len();
168 |         let bl = clock_b.map.len();
169 |         if bl == 0 {
170 |             return;
171 |         }
172 |         if al == 0 {
173 |             self.map = clock_b.map.clone();
174 |             return;
175 |         }
176 |         let mut new_map = Vec::with_capacity(self.map.len() + clock_b.map.len());
177 |         while ai < al || bi < bl {
178 |             if ai >= al {
179 |                 new_map.push(clock_b.map[bi]); bi += 1; continue; // a exhausted: take the rest of b
180 |             }
181 |             if bi >= bl {
182 |                 new_map.push(self.map[ai]); ai += 1; continue; // b exhausted: keep the rest of a
183 |             }
184 |             let (ak, an) = &self.map[ai];
185 |             let (bk, bn) = &clock_b.map[bi];
186 |             if ak == bk {
187 |                 // Both clocks have this key, keep the larger counter
188 |                 if an < bn {
189 |                     new_map.push((*ak, *bn));
190 |                 } else {
191 |                     new_map.push((*ak, *an));
192 |                 }
193 |                 ai += 1;
194 |                 bi += 1;
195 |             } else if ak > bk {
196 |                 // Clock b has a server that a does not have
197 |                 new_map.push((*bk, *bn));
198 |                 bi += 1;
199 |             } else if ak < bk {
200 |                 // Clock a has a server that b does not have
201 |                 new_map.push((*ak, *an));
202 |                 ai += 1;
203 |             } else {
204 |                 unreachable!();
205 |             }
206 |         }
207 |         self.map = new_map;
208 |     }
209 | 
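    /// Unlike `merge_with`, which advances shared counters to the element-wise
    /// maximum, `learn_from` keeps every existing counter untouched and only
    /// copies entries for servers this clock has never seen. For example,
    /// learning {a:5, b:2} into {a:1} yields {a:1, b:2}, whereas merging would
    /// yield {a:5, b:2}.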
210 |     pub fn learn_from(&mut self, clock_b: &VectorClock<S>) {
211 |         // learn_from only inserts missing servers into the clock
212 |         let mut ai = 0;
213 |         let mut bi = 0;
214 |         let al = self.map.len();
215 |         let bl = clock_b.map.len();
216 |         if bl == 0 {
217 |             return;
218 |         }
219 |         if al == 0 {
220 |             self.map = clock_b.map.clone();
221 |             return;
222 |         }
223 |         let mut new_map = Vec::with_capacity(self.map.len() + clock_b.map.len());
224 |         while ai < al || bi < bl {
225 |             if ai >= al {
226 |                 new_map.push(clock_b.map[bi]); bi += 1; continue; // a exhausted: adopt the rest of b
227 |             }
228 |             if bi >= bl {
229 |                 new_map.push(self.map[ai]); ai += 1; continue; // b exhausted: keep the rest of a
230 |             }
231 |             let (ak, an) = &self.map[ai];
232 |             let (bk, bn) = &clock_b.map[bi];
233 |             if ak == bk {
234 |                 // Both clocks have this key, keep the local counter
235 |                 ai += 1;
236 |                 bi += 1;
237 |                 new_map.push((*ak, *an));
238 |             } else if ak > bk {
239 |                 // Clock b has a server that a does not have
240 |                 new_map.push((*bk, *bn));
241 |                 bi += 1;
242 |             } else if ak < bk {
243 |                 // Clock a has a server that b does not have
244 |                 new_map.push((*ak, *an));
245 |                 ai += 1;
246 |             } else {
247 |                 unreachable!();
248 |             }
249 |         }
250 |         self.map = new_map;
251 |     }
252 | }
253 | 
254 | pub struct ServerVectorClock {
255 |     server: u64,
256 |     clock: RwLock<StandardVectorClock>,
257 | }
258 | 
259 | impl ServerVectorClock {
260 |     pub fn new(server_address: &String) -> ServerVectorClock {
261 |         ServerVectorClock {
262 |             server: hash_str(server_address),
263 |             clock: RwLock::new(VectorClock::new()),
264 |         }
265 |     }
266 |     pub fn inc(&self) -> StandardVectorClock {
267 |         let mut clock = self.clock.write();
268 |         clock.inc(self.server);
269 |         clock.clone()
270 |     }
271 | 
272 |     pub fn happened_before(&self, clock_b: &StandardVectorClock) -> bool {
273 |         let clock = self.clock.read();
274 |         clock.happened_before(clock_b)
275 |     }
276 |     pub fn equals(&self, clock_b: &StandardVectorClock) -> bool {
277 |         let clock = self.clock.read();
278 |         clock.equals(clock_b)
279 |     }
280 |     pub fn relation(&self, clock_b: &StandardVectorClock) -> Relation {
281 |         let clock = self.clock.read();
282 |         clock.relation(clock_b)
283 |     }
284 |     pub fn merge_with(&self, clock_b: &StandardVectorClock) {
285 |         let mut clock = self.clock.write();
286 |         clock.merge_with(clock_b)
287 |     }
288 |     pub fn learn_from(&self, clock_b: &StandardVectorClock) {
289 |         let mut clock = self.clock.write();
290 |         clock.learn_from(clock_b)
291 |     }
292 |     pub fn to_clock(&self) -> StandardVectorClock {
293 |         let clock = self.clock.read();
294 |         clock.clone()
295 |     }
296 | }
297 | 
298 | pub type StandardVectorClock = VectorClock<u64>;
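// Typical wiring (illustrative usage, not an API contract): each node keeps one
// ServerVectorClock keyed by the hash of its own address and stamps outgoing
// messages with `inc()`:
//
//     let local = ServerVectorClock::new(&"10.0.1.10:5000".to_string());
//     let stamp: StandardVectorClock = local.inc(); // attach to an outgoing message
//     // on receiving a remote stamp:
//     //     local.merge_with(&remote_stamp);
//     //     if local.happened_before(&remote_stamp) { /* this node is behind */ }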
299 | 
300 | #[cfg(test)]
301 | mod test {
302 |     use crate::vector_clock::{Relation, StandardVectorClock};
303 | 
304 |     #[test]
305 |     fn general() {
306 |         let _ = env_logger::try_init();
307 |         let mut clock = StandardVectorClock::new();
308 |         let blank_clock = StandardVectorClock::new();
309 |         clock.inc(1);
310 |         clock.inc(3);
311 |         let old_clock = clock.clone();
312 |         clock.inc(1);
313 |         clock.inc(2);
314 |         info!("{:?}", clock.relation(&blank_clock));
315 |         assert!(clock > blank_clock);
316 |         assert!(blank_clock < clock);
317 |         assert!(blank_clock != clock);
318 |         assert!(
319 |             old_clock.happened_before(&clock),
320 |             "old {:?}, new {:?}",
321 |             old_clock,
322 |             clock
323 |         );
324 |         assert!(
325 |             !clock.happened_before(&old_clock),
326 |             "old {:?}, new {:?}",
327 |             old_clock,
328 |             clock
329 |         );
330 |         assert!(
331 |             !clock.equals(&old_clock),
332 |             "old {:?}, new {:?}",
333 |             old_clock,
334 |             clock
335 |         );
336 |         assert_eq!(
337 |             clock.relation(&old_clock),
338 |             Relation::After,
339 |             "old {:?}, new {:?}",
340 |             old_clock,
341 |             clock
342 |         );
343 |         assert_eq!(
344 |             old_clock.relation(&clock),
345 |             Relation::Before,
346 |             "old {:?}, new {:?}",
347 |             old_clock,
348 |             clock
349 |         );
350 |         let blank_clock_2 = StandardVectorClock::new();
351 |         assert!(blank_clock == blank_clock_2);
352 |     }
353 | 
354 |     #[test]
355 |     fn unaligned_clock_eq() {
356 |         let _ = env_logger::try_init();
357 |         let clock_a = StandardVectorClock::from_vec(vec![(1, 2), (2, 3), (3, 4), (4, 5), (5, 6)]);
358 |         let clock_b = StandardVectorClock::from_vec(vec![(2, 3), (4, 5)]);
359 |         assert!(clock_a.equals(&clock_b));
360 |         assert!(clock_b.equals(&clock_a));
361 |         assert!(!clock_a.happened_before(&clock_b));
362 |         assert!(!clock_b.happened_before(&clock_a));
363 |         assert_eq!(clock_a.relation(&clock_b), Relation::Equal);
364 |     }
365 | 
366 |     #[test]
367 |     fn unaligned_clock_rel_disjoint_concurrent() {
368 |         let _ = env_logger::try_init();
369 |         let clock_a = StandardVectorClock::from_vec(vec![(1, 2), (3, 4), (5, 6)]);
370 |         let clock_b = StandardVectorClock::from_vec(vec![(0, 1), (2, 3), (7, 8), (9, 10)]);
371 |         assert!(!clock_a.equals(&clock_b));
372 |         assert!(!clock_b.equals(&clock_a));
373 |         assert!(!clock_a.happened_before(&clock_b));
374 |         assert!(!clock_b.happened_before(&clock_a));
375 |         assert_eq!(clock_a.relation(&clock_b), Relation::Concurrent);
376 |     }
377 | 
378 |     #[test]
379 |     fn unaligned_clock_rel_joint_concurrent() {
380 |         let _ = env_logger::try_init();
381 |         let clock_a = StandardVectorClock::from_vec(vec![(1, 2), (3, 4)]);
382 |         let clock_b = StandardVectorClock::from_vec(vec![(1, 3), (3, 3)]);
383 |         assert!(!clock_a.equals(&clock_b));
384 |         assert!(!clock_b.equals(&clock_a));
385 |         assert!(!clock_a.happened_before(&clock_b));
386 |         assert!(!clock_b.happened_before(&clock_a));
387 |         assert_eq!(clock_a.relation(&clock_b), Relation::Concurrent);
388 |     }
389 | }
--------------------------------------------------------------------------------
/src/raft/disk.rs:
--------------------------------------------------------------------------------
1 | // Now only offers log persistence
2 | 
3 | use crate::raft::{LogEntry, LogsMap, Options, RaftMeta, SnapshotEntity, Storage};
4 | use async_std::sync::*;
5 | use serde::{Deserialize, Serialize};
6 | 
7 | use std::convert::TryInto;
8 | use std::fs::OpenOptions;
9 | use std::io;
10 | use std::io::Read;
11 | use std::ops::Bound::*;
12 | use std::path::{Path, PathBuf};
13 | use tokio::fs::*;
14 | use tokio::io::*;
15 | 
16 | // const MAX_LOG_CAPACITY: usize = 10;
17 | 
18 | #[derive(Clone)]
19 | pub struct DiskOptions {
20 |     pub path: String,
21 |     pub take_snapshots: bool,
22 |     pub append_logs: bool,
23 |     pub trim_logs: bool,
24 |     // Snapshot configuration
25 |     pub snapshot_log_threshold: u64,   // Trigger snapshot after N logs
26 |     pub log_compaction_threshold: u64, // Compact when logs exceed this
27 | }
28 | 
29 | impl DiskOptions {
30 |     pub fn new(path: String) -> Self {
31 |         Self {
32 |             path,
33 |             take_snapshots: true,
34 |             append_logs: true,
35 |             trim_logs: true,
36 |             snapshot_log_threshold: 1000,
37 |             log_compaction_threshold: 2000,
38 |         }
39 |     }
40 | }
41 | 
42 | pub struct StorageEntity {
43 |     pub logs: Option<File>,
44 |     pub snapshot: Option<File>,
45 |     pub last_term: u64,
46 |     pub base_path: PathBuf,
47 | }
48 | 
49 | pub struct DiskLogEntry {
50 |     pub term: u64,
51 |     pub commit_index: u64,
52 |     pub last_applied: u64,
53 |     pub log: LogEntry,
54 | }
55 | 
56 | impl DiskLogEntry {
57 |     /// Encode to deterministic binary format
58 |     /// Format:
59 |     /// [8 bytes] term
60 |     /// [8 bytes] commit_index
61 |     /// [8 bytes] last_applied
62 |     /// [8 bytes] log.id
63 |     /// [8 bytes] log.term
64 |     /// [8 bytes] log.sm_id
65 |     /// [8 bytes] log.fn_id
66 |     /// [8 bytes] log.data.len()
67 |     /// [N bytes] log.data
68 |     pub fn encode(&self) -> Vec<u8> {
69 |         let data_len = self.log.data.len();
70 |         let total_size = 8 * 8 + data_len; // 8 u64 fields + data
71 |         let mut buf = Vec::with_capacity(total_size);
72 | 
73 |         // Write fixed-size fields in little-endian
74 |         buf.extend_from_slice(&self.term.to_le_bytes());
75 |         buf.extend_from_slice(&self.commit_index.to_le_bytes());
76 |         buf.extend_from_slice(&self.last_applied.to_le_bytes());
77 |         buf.extend_from_slice(&self.log.id.to_le_bytes());
78 |         buf.extend_from_slice(&self.log.term.to_le_bytes());
79 |         buf.extend_from_slice(&self.log.sm_id.to_le_bytes());
80 |         buf.extend_from_slice(&self.log.fn_id.to_le_bytes());
81 |         buf.extend_from_slice(&(data_len as u64).to_le_bytes());
82 | 
83 |         // Write variable-length data
84 |         buf.extend_from_slice(&self.log.data);
85 | 
86 |         buf
87 |     }
88 | 
89 |     /// Decode from deterministic binary format
90 |     pub fn decode(data: &[u8]) -> io::Result<Self> {
91 |         if data.len() < 64 {
92 |             return Err(io::Error::new(
93 |                 io::ErrorKind::InvalidData,
94 |                 "DiskLogEntry too short"
95 |             ));
96 |         }
97 | 
98 |         // Read fixed-size fields
99 |         let term = u64::from_le_bytes(data[0..8].try_into().unwrap());
100 |         let commit_index = u64::from_le_bytes(data[8..16].try_into().unwrap());
101 |         let last_applied = u64::from_le_bytes(data[16..24].try_into().unwrap());
102 |         let log_id = u64::from_le_bytes(data[24..32].try_into().unwrap());
103 |         let log_term = u64::from_le_bytes(data[32..40].try_into().unwrap());
104 |         let log_sm_id = u64::from_le_bytes(data[40..48].try_into().unwrap());
105 |         let log_fn_id = u64::from_le_bytes(data[48..56].try_into().unwrap());
106 |         let data_len = u64::from_le_bytes(data[56..64].try_into().unwrap()) as usize;
107 | 
108 |         // Validate data length
109 |         if data.len() < 64 + data_len {
110 |             return Err(io::Error::new(
111 |                 io::ErrorKind::InvalidData,
112 |                 format!("DiskLogEntry data truncated: expected {}, got {}", 64 + data_len, data.len())
113 |             ));
114 |         }
115 | 
116 |         // Read variable-length data
117 |         let log_data = data[64..64 + data_len].to_vec();
118 | 
119 |         Ok(DiskLogEntry {
120 |             term,
121 |             commit_index,
122 |             last_applied,
123 |             log: LogEntry {
124 |                 id: log_id,
125 |                 term: log_term,
126 |                 sm_id: log_sm_id,
127 |                 fn_id: log_fn_id,
128 |                 data: log_data,
129 |             },
130 |         })
131 |     }
132 | }
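// Illustrative round-trip check for the codec above (added here as a sketch,
// not from the original source); the field values are arbitrary.
#[cfg(test)]
mod disk_log_entry_codec_test {
    use super::*;

    #[test]
    fn encode_decode_round_trip() {
        let entry = DiskLogEntry {
            term: 7,
            commit_index: 42,
            last_applied: 40,
            log: LogEntry {
                id: 43,
                term: 7,
                sm_id: 1,
                fn_id: 2,
                data: vec![1, 2, 3, 4, 5],
            },
        };
        let encoded = entry.encode();
        assert_eq!(encoded.len(), 64 + 5); // 64-byte fixed header + payload
        let decoded = DiskLogEntry::decode(&encoded).unwrap();
        assert_eq!(decoded.term, 7);
        assert_eq!(decoded.log.id, 43);
        assert_eq!(decoded.log.data, vec![1, 2, 3, 4, 5]);
    }
}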
133 | 
134 | impl StorageEntity {
135 |     pub fn new_with_options(
136 |         opts: &Options,
137 |         term: &mut u64,
138 |         commit_index: &mut u64,
139 |         last_applied: &mut u64,
140 |         logs: &mut LogsMap,
141 |     ) -> io::Result<Option<Self>> {
142 |         Ok(match &opts.storage {
143 |             &Storage::DISK(ref options) => {
144 |                 let base_path = Path::new(&options.path);
145 |                 let _ = std::fs::create_dir_all(base_path);
146 |                 let log_path = base_path.join("log.dat");
147 |                 let snapshot_path = base_path.join("snapshot.dat");
148 |                 let mut open_opts = OpenOptions::new();
149 |                 open_opts
150 |                     .write(true)
151 |                     .create(true)
152 |                     .read(true)
153 |                     .truncate(false);
154 |                 let mut storage = Self {
155 |                     logs: if options.append_logs {
156 |                         let mut log_file = open_opts.open(log_path.as_path())?;
157 |                         let mut len_buf = [0u8; 8];
158 |                         let mut counter = 0;
159 |                         loop {
160 |                             if log_file.read_exact(&mut len_buf).is_err() {
161 |                                 break;
162 |                             }
163 |                             let len = u64::from_le_bytes(len_buf);
164 |                             let mut data_buf = vec![0u8; len as usize];
165 |                             if log_file.read_exact(&mut data_buf).is_err() {
166 |                                 break;
167 |                             }
168 |                             let entry = DiskLogEntry::decode(&data_buf)
169 |                                 .expect("Failed to decode log entry from disk");
170 |                             *term = entry.term;
171 |                             // Do not trust commit/last_applied embedded in the WAL for SM reconstruction.
172 |                             // We derive commit_index from commit.idx and force replay from last_applied = 0.
173 |                             logs.insert(entry.log.id, entry.log);
174 |                             counter += 1;
175 |                         }
176 |                         debug!("Recovered {} raft logs", counter);
177 |                         Some(File::from_std(log_file))
178 |                     } else {
179 |                         None
180 |                     },
181 |                     snapshot: if options.take_snapshots {
182 |                         Some(File::from_std(open_opts.open(snapshot_path.as_path())?))
183 |                     } else {
184 |                         None
185 |                     },
186 |                     last_term: 0,
187 |                     base_path: base_path.to_path_buf(),
188 |                 };
189 | 
190 |                 // If the commit progress side file exists, load it to ensure accurate indices.
191 |                 // Force full replay by resetting last_applied to 0 on startup.
192 |                 *last_applied = 0;
193 |                 if let Ok(Some((ci, _la))) = futures::executor::block_on(storage.read_commit_progress()) {
194 |                     *commit_index = ci;
195 |                     debug!("Recovered commit progress: commit_index={} (will replay to rebuild state)", ci);
196 |                 } else {
197 |                     // If no commit progress found, default to 0 to avoid partial state
198 |                     *commit_index = 0;
199 |                 }
200 | 
201 |                 Some(storage)
202 |             }
203 |             _ => None,
204 |         })
205 |     }
206 | 
207 |     pub async fn append_logs<'a>(
208 |         &mut self,
209 |         meta: &'a RwLockWriteGuard<'a, RaftMeta>,
210 |         logs: &'a RwLockWriteGuard<'a, LogsMap>,
211 |     ) -> io::Result<()> {
212 |         if let Some(f) = &mut self.logs {
213 |             let was_last_term = self.last_term;
214 |             let mut counter = 0;
215 |             let mut terms_appended = vec![];
216 |             let master = meta.state_machine.read().await;
217 |             for (term, log) in logs.range((Excluded(self.last_term), Unbounded)) {
218 |                 // Skip non-recoverable state machines
219 |                 if !master.is_recoverable(log.sm_id) {
220 |                     continue;
221 |                 }
222 |                 let entry = DiskLogEntry {
223 |                     term: *term,
224 |                     commit_index: meta.commit_index,
225 |                     last_applied: meta.last_applied,
226 |                     log: log.clone(),
227 |                 };
228 |                 let entry_data = entry.encode(); // Use deterministic encoding
229 |                 f.write_all(&(entry_data.len() as u64).to_le_bytes()).await?;
230 |                 f.write_all(entry_data.as_slice()).await?;
231 |                 self.last_term = *term;
232 |                 terms_appended.push(self.last_term);
233 |                 counter += 1;
234 |             }
235 |             if counter > 0 {
236 |                 f.sync_all().await?;
237 |                 debug!(
238 |                     "Appended and persisted {} logs, was {}, appended {:?}",
239 |                     counter, was_last_term, terms_appended
240 |                 );
241 |             }
242 |         }
243 |         Ok(())
244 |     }
245 | 
246 |     pub async fn post_processing<'a>(
247 |         &mut self,
248 |         meta: &RwLockWriteGuard<'a, RaftMeta>,
249 |         logs: RwLockWriteGuard<'a, LogsMap>,
250 |     ) -> io::Result<()> {
251 |         // TODO: trim logs in memory
252 |         // TODO: trim logs on disk
253 |         self.append_logs(meta, &logs).await?;
254 | 
255 |         Ok(())
256 | 
257 |         // let (last_log_id, _) = get_last_log_info!(self, logs);
258 |         // let expecting_oldest_log = if last_log_id > MAX_LOG_CAPACITY as u64 {
259 |         //     last_log_id - MAX_LOG_CAPACITY as u64
260 |         // } else {
261 |         //     0
262 |         // };
263 |         // let double_cap = MAX_LOG_CAPACITY << 1;
264 |         // if logs.len() > double_cap && meta.last_applied > expecting_oldest_log {
265 |         //     debug!("trim logs");
266 |         //     while logs.len() > MAX_LOG_CAPACITY {
267 |         //         let first_key = *logs.iter().next().unwrap().0;
268 |         //         logs.remove(&first_key).unwrap();
269 |         //     }
270 |         //     if let Some(ref storage) = meta.storage {
271 |         //         let mut storage = storage.write().await;
272 |         //         let snapshot = SnapshotEntity {
273 |         //             term: meta.term,
274 |         //             commit_index: meta.commit_index,
275 |         //             last_applied: meta.last_applied,
276 |         //             snapshot: meta.state_machine.read().await.snapshot().unwrap(),
277 |         //         };
278 |         //         storage
279 |         //             .snapshot
280 |         //             .write_all(crate::utils::serde::serialize(&snapshot).as_slice())?;
281 |         //         storage.snapshot.sync_all().unwrap();
282 |         //     }
283 |         // }
284 |         // if let Some(ref storage) = meta.storage {
285 |         //     let mut storage = storage.write().await;
286 |         //     let logs_data = crate::utils::serde::serialize(&*meta.logs.read().await);
287 |         //     // TODO: async file system calls
288 |         //     storage.logs.write_all(logs_data.as_slice())?;
289 |         //     storage.logs.sync_all().unwrap();
290 |         // }
291 |     }
292 | 
293 |     /// Ensure WAL file is fully synced to disk.
294 |     pub async fn flush_wal(&mut self) -> io::Result<()> {
295 |         if let Some(f) = &mut self.logs {
296 |             info!("WAL fsync: syncing log.dat to disk");
297 |             f.sync_all().await?;
298 |             info!("WAL fsync: completed");
299 |         }
300 |         Ok(())
301 |     }
302 | 
303 |     /// Persist commit progress atomically to a side file (commit.idx)
304 |     pub async fn write_commit_progress(&mut self, commit_index: u64, last_applied: u64) -> io::Result<()> {
305 |         let commit_path = self.base_path.join("commit.idx");
306 |         let temp_path = self.base_path.join("commit.idx.tmp");
307 |         let mut f = File::create(&temp_path).await?;
308 |         f.write_all(&commit_index.to_le_bytes()).await?;
309 |         f.write_all(&last_applied.to_le_bytes()).await?;
310 |         f.sync_all().await?;
311 |         drop(f);
312 |         std::fs::rename(&temp_path, &commit_path)?;
313 |         Ok(())
314 |     }
315 | 
316 |     /// Read commit progress if available
317 |     pub async fn read_commit_progress(&self) -> io::Result<Option<(u64, u64)>> {
318 |         let commit_path = self.base_path.join("commit.idx");
319 |         if !commit_path.exists() { return Ok(None); }
320 |         let mut f = File::open(&commit_path).await?;
321 |         let mut buf = [0u8; 16];
322 |         if f.read_exact(&mut buf).await.is_err() { return Ok(None); }
323 |         let commit_index = u64::from_le_bytes(buf[0..8].try_into().unwrap());
324 |         let last_applied = u64::from_le_bytes(buf[8..16].try_into().unwrap());
325 |         Ok(Some((commit_index, last_applied)))
326 |     }
327 | 
328 |     /// Write snapshot to disk using atomic write pattern (temp file + rename)
329 |     pub async fn write_snapshot(&mut self, snapshot: &SnapshotEntity) -> io::Result<()> {
330 |         let snapshot_path = self.base_path.join("snapshot.dat");
331 |         let temp_path = self.base_path.join("snapshot.dat.tmp");
332 | 
333 |         // Serialize snapshot
334 |         let snapshot_data = crate::utils::serde::serialize(snapshot);
335 | 
336 |         // Calculate CRC32 checksum
337 |         let checksum = crc32fast::hash(&snapshot_data);
338 | 
339 |         // Write to temp file
340 |         let mut temp_file = File::create(&temp_path).await?;
341 | 
342 |         // Write checksum first
343 |         temp_file.write_all(&checksum.to_le_bytes()).await?;
344 | 
345 |         // Write length
346 |         temp_file.write_all(&(snapshot_data.len() as u64).to_le_bytes()).await?;
347 | 
348 |         // Write data
349 |         temp_file.write_all(&snapshot_data).await?;
350 | 
351 |         // Sync to disk
352 |         temp_file.sync_all().await?;
353 |         drop(temp_file);
354 | 
355 |         // Atomic rename
356 |         std::fs::rename(&temp_path, &snapshot_path)?;
357 | 
358 |         info!(
359 |             "Snapshot persisted to disk: index={}, term={}, size={} bytes",
360 |             snapshot.last_included_index,
361 |             snapshot.last_included_term,
362 |             snapshot_data.len()
363 |         );
364 | 
365 |         Ok(())
366 |     }
367 | 
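    // Note on the error shape of `read_snapshot` below: a missing, truncated, or
    // checksum-mismatched file deliberately yields Ok(None) rather than Err, so
    // startup can fall back to plain log replay instead of aborting.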
368 |     /// Read and validate snapshot from disk
369 |     pub async fn read_snapshot(&self) -> io::Result<Option<SnapshotEntity>> {
370 |         let snapshot_path = self.base_path.join("snapshot.dat");
371 | 
372 |         // Check if snapshot file exists
373 |         if !snapshot_path.exists() {
374 |             debug!("No snapshot file found at {:?}", snapshot_path);
375 |             return Ok(None);
376 |         }
377 | 
378 |         let mut file = File::open(&snapshot_path).await?;
379 | 
380 |         // Read checksum
381 |         let mut checksum_buf = [0u8; 4];
382 |         if file.read_exact(&mut checksum_buf).await.is_err() {
383 |             warn!("Failed to read snapshot checksum, file may be corrupted");
384 |             return Ok(None);
385 |         }
386 |         let expected_checksum = u32::from_le_bytes(checksum_buf);
387 | 
388 |         // Read length
389 |         let mut len_buf = [0u8; 8];
390 |         if file.read_exact(&mut len_buf).await.is_err() {
391 |             warn!("Failed to read snapshot length, file may be corrupted");
392 |             return Ok(None);
393 |         }
394 |         let len = u64::from_le_bytes(len_buf);
395 | 
396 |         // Read data
397 |         let mut data_buf = vec![0u8; len as usize];
398 |         if file.read_exact(&mut data_buf).await.is_err() {
399 |             warn!("Failed to read snapshot data, file may be corrupted");
400 |             return Ok(None);
401 |         }
402 | 
403 |         // Verify checksum
404 |         let actual_checksum = crc32fast::hash(&data_buf);
405 |         if actual_checksum != expected_checksum {
406 |             error!(
407 |                 "Snapshot checksum mismatch! Expected: {}, Got: {}. File is corrupted.",
408 |                 expected_checksum, actual_checksum
409 |             );
410 |             return Ok(None);
411 |         }
412 | 
413 |         // Deserialize
414 |         let snapshot = crate::utils::serde::deserialize::<SnapshotEntity>(&data_buf).unwrap();
415 | 
416 |         info!(
417 |             "Snapshot loaded from disk: index={}, term={}, size={} bytes",
418 |             snapshot.last_included_index,
419 |             snapshot.last_included_term,
420 |             data_buf.len()
421 |         );
422 |         Ok(Some(snapshot))
423 |     }
424 | }
--------------------------------------------------------------------------------
/src/membership/mod.rs:
--------------------------------------------------------------------------------
1 | // Group membership manager, independent of the actual raft members
2 | 
3 | pub mod client;
4 | pub mod member;
5 | pub mod server;
6 | 
7 | use crate::membership::client::Member as ClientMember;
8 | use bifrost_plugins::hash_ident;
9 | 
10 | pub static DEFAULT_SERVICE_ID: u64 = hash_ident!(BIFROST_MEMBERSHIP_SERVICE) as u64;
11 | 
12 | pub mod raft {
13 |     use super::server::MemberGroup;
14 |     use super::*;
15 |     use std::collections::BTreeMap;
16 | 
17 |     raft_state_machine! {
18 |         def cmd hb_online_changed(online: Vec<u64>, offline: Vec<u64>);
19 |         def cmd join(address: String) -> Option<u64>;
20 |         def cmd leave(id: u64) -> bool;
21 |         def cmd join_group(group_name: String, id: u64) -> bool;
22 |         def cmd leave_group(group: u64, id: u64) -> bool;
23 |         def cmd new_group(name: String) -> Result<u64, u64>;
24 |         def cmd del_group(id: u64) -> bool;
25 |         def qry group_leader(group: u64) -> Option<(Option<ClientMember>, u64)>;
26 |         def qry group_members(group: u64, online_only: bool) -> Option<(Vec<ClientMember>, u64)>;
27 |         def qry all_members(online_only: bool) -> (Vec<ClientMember>, u64);
28 |         def qry all_groups() -> BTreeMap<u64, MemberGroup>;
29 |         def sub on_group_member_offline(group: u64) -> (ClientMember, u64);
30 |         def sub on_any_member_offline() -> (ClientMember, u64);
31 |         def sub on_group_member_online(group: u64) -> (ClientMember, u64);
32 |         def sub on_any_member_online() -> (ClientMember, u64);
33 |         def sub on_group_member_joined(group: u64) -> (ClientMember, u64);
34 |         def sub on_any_member_joined() -> (ClientMember, u64);
35 |         def sub on_group_member_left(group: u64) -> (ClientMember, u64);
36 |         def sub on_any_member_left() -> (ClientMember, u64);
37 |         def sub on_group_leader_changed(group: u64) -> (Option<ClientMember>, Option<ClientMember>, u64);
38 |     }
39 | }
40 | 
41 | // This service is only responsible for receiving heartbeats and
42 | // updating the last-updated time.
43 | // An expired update time will trigger a timeout in the raft state machine.
44 | mod heartbeat_rpc {
45 |     service!
{ 46 | rpc ping(id: u64); 47 | } 48 | } 49 | 50 | #[cfg(test)] 51 | mod test { 52 | use crate::membership::client::ObserverClient; 53 | use crate::membership::member::MemberService; 54 | use crate::membership::server::Membership; 55 | use crate::raft::client::RaftClient; 56 | use crate::raft::{Options, RaftService, Storage, DEFAULT_SERVICE_ID}; 57 | use crate::rpc::Server; 58 | use crate::utils::time::async_wait_secs; 59 | use futures::prelude::*; 60 | use std::sync::atomic::*; 61 | use std::sync::Arc; 62 | 63 | #[tokio::test(flavor = "multi_thread")] 64 | async fn primary() { 65 | let _ = env_logger::builder().format_timestamp(None).try_init(); 66 | let addr = String::from("127.0.0.1:2100"); 67 | let raft_service = RaftService::new(Options { 68 | storage: Storage::default(), 69 | address: addr.clone(), 70 | service_id: DEFAULT_SERVICE_ID, 71 | }); 72 | info!("Creating server"); 73 | let server = Server::new(&addr); 74 | info!("Register service"); 75 | server.register_service(&raft_service).await; 76 | info!("Server listen and resume"); 77 | Server::listen_and_resume(&server).await; 78 | info!("Start raft service"); 79 | RaftService::start(&raft_service, false).await; 80 | info!("Bootstrap raft service"); 81 | raft_service.bootstrap().await; 82 | info!("Creating membership service"); 83 | Membership::new(&server, &raft_service).await; 84 | 85 | let group_1 = String::from("test_group_1"); 86 | let group_2 = String::from("test_group_2"); 87 | let group_3 = String::from("test_group_3"); 88 | 89 | info!("Creating raft client"); 90 | let wild_raft_client = RaftClient::new(&vec![addr.clone()], DEFAULT_SERVICE_ID) 91 | .await 92 | .unwrap(); 93 | 94 | info!("Create observer"); 95 | let client = ObserverClient::new(&wild_raft_client); 96 | 97 | info!("Prepare subscription"); 98 | RaftClient::prepare_subscription(&server).await; 99 | 100 | info!("Creating new group: {}", group_1); 101 | client.new_group(&group_1).await.unwrap().unwrap(); 102 | info!("Creating new group {}", group_2); 103 | client.new_group(&group_2).await.unwrap().unwrap(); 104 | info!("Creating new group {}", group_3); 105 | client.new_group(&group_3).await.unwrap().unwrap(); 106 | 107 | let any_member_joined_count = Arc::new(AtomicUsize::new(0)); 108 | let any_member_left_count = Arc::new(AtomicUsize::new(0)); 109 | let any_member_offline_count = Arc::new(AtomicUsize::new(0)); 110 | let any_member_online_count = Arc::new(AtomicUsize::new(0)); 111 | let group_leader_changed_count = Arc::new(AtomicUsize::new(0)); 112 | let group_member_joined_count = Arc::new(AtomicUsize::new(0)); 113 | let group_member_left_count = Arc::new(AtomicUsize::new(0)); 114 | let group_member_online_count = Arc::new(AtomicUsize::new(0)); 115 | let group_member_offline_count = Arc::new(AtomicUsize::new(0)); 116 | 117 | let any_member_joined_count_clone = any_member_joined_count.clone(); 118 | let any_member_left_count_clone = any_member_left_count.clone(); 119 | let any_member_offline_count_clone = any_member_offline_count.clone(); 120 | let any_member_online_count_clone = any_member_online_count.clone(); 121 | let group_leader_changed_count_clone = group_leader_changed_count.clone(); 122 | let group_member_joined_count_clone = group_member_joined_count.clone(); 123 | let group_member_left_count_clone = group_member_left_count.clone(); 124 | let group_member_online_count_clone = group_member_online_count.clone(); 125 | let group_member_offline_count_clone = group_member_offline_count.clone(); 126 | 127 | info!("Subscribe on_any_member_joined"); 128 | 
client 129 | .on_any_member_joined(move |_| { 130 | any_member_joined_count_clone.fetch_add(1, Ordering::Relaxed); 131 | future::ready(()).boxed() 132 | }) 133 | .await 134 | .unwrap() 135 | .unwrap(); 136 | 137 | info!("Subscribe on_any_member_left"); 138 | client 139 | .on_any_member_left(move |_| { 140 | any_member_left_count_clone.fetch_add(1, Ordering::Relaxed); 141 | future::ready(()).boxed() 142 | }) 143 | .await 144 | .unwrap() 145 | .unwrap(); 146 | 147 | info!("Subscribe on_any_member_offline"); 148 | client 149 | .on_any_member_offline(move |_| { 150 | any_member_offline_count_clone.fetch_add(1, Ordering::Relaxed); 151 | future::ready(()).boxed() 152 | }) 153 | .await 154 | .unwrap() 155 | .unwrap(); 156 | 157 | info!("Subscribe on_any_member_online"); 158 | client 159 | .on_any_member_online(move |_| { 160 | any_member_online_count_clone.fetch_add(1, Ordering::Relaxed); 161 | future::ready(()).boxed() 162 | }) 163 | .await 164 | .unwrap() 165 | .unwrap(); 166 | 167 | info!("Subscribe on_group_leader_changed"); 168 | client 169 | .on_group_leader_changed( 170 | move |_| { 171 | group_leader_changed_count_clone.fetch_add(1, Ordering::Relaxed); 172 | future::ready(()).boxed() 173 | }, 174 | &group_1, 175 | ) 176 | .await 177 | .unwrap() 178 | .unwrap(); 179 | 180 | info!("Subscribe on_group_member_joined"); 181 | client 182 | .on_group_member_joined( 183 | move |_| { 184 | group_member_joined_count_clone.fetch_add(1, Ordering::Relaxed); 185 | future::ready(()).boxed() 186 | }, 187 | &group_1, 188 | ) 189 | .await 190 | .unwrap() 191 | .unwrap(); 192 | 193 | info!("Subscribe on_group_member_left"); 194 | client 195 | .on_group_member_left( 196 | move |_| { 197 | group_member_left_count_clone.fetch_add(1, Ordering::Relaxed); 198 | future::ready(()).boxed() 199 | }, 200 | &group_1, 201 | ) 202 | .await 203 | .unwrap() 204 | .unwrap(); 205 | 206 | info!("Subscribe on_group_member_online"); 207 | client 208 | .on_group_member_online( 209 | move |_| { 210 | group_member_online_count_clone.fetch_add(1, Ordering::Relaxed); 211 | future::ready(()).boxed() 212 | }, 213 | &group_1, 214 | ) 215 | .await 216 | .unwrap() 217 | .unwrap(); 218 | 219 | info!("Subscribe on_group_member_offline"); 220 | client 221 | .on_group_member_offline( 222 | move |_| { 223 | group_member_offline_count_clone.fetch_add(1, Ordering::Relaxed); 224 | future::ready(()).boxed() 225 | }, 226 | &group_1, 227 | ) 228 | .await 229 | .unwrap() 230 | .unwrap(); 231 | 232 | info!("New member1_raft_client"); 233 | let member1_raft_client = RaftClient::new(&vec![addr.clone()], DEFAULT_SERVICE_ID) 234 | .await 235 | .unwrap(); 236 | let member1_addr = String::from("server1"); 237 | info!("New member service {}", member1_addr); 238 | let member1_svr = 239 | MemberService::new(&member1_addr, &member1_raft_client, &raft_service).await; 240 | 241 | info!("New member2_raft_client"); 242 | let member2_raft_client = RaftClient::new(&vec![addr.clone()], DEFAULT_SERVICE_ID) 243 | .await 244 | .unwrap(); 245 | let member2_addr = String::from("server2"); 246 | info!("New member service {}", member2_addr); 247 | let member2_svr = 248 | MemberService::new(&member2_addr, &member2_raft_client, &raft_service).await; 249 | 250 | info!("New member3_raft_client"); 251 | let member3_raft_client = RaftClient::new(&vec![addr.clone()], DEFAULT_SERVICE_ID) 252 | .await 253 | .unwrap(); 254 | let member3_addr = String::from("server3"); 255 | info!("New member service {}", member3_addr); 256 | let member3_svr = 257 | MemberService::new(&member3_addr, 
257 |             MemberService::new(&member3_addr, &member3_raft_client, &raft_service).await;
258 | 
259 |         info!("Member 1 join group 1");
260 |         member1_svr.join_group(&group_1).await.unwrap();
261 |         info!("Member 2 join group 1");
262 |         member2_svr.join_group(&group_1).await.unwrap();
263 |         info!("Member 3 join group 1");
264 |         member3_svr.join_group(&group_1).await.unwrap();
265 | 
266 |         info!("Member 1 join group 2");
267 |         member1_svr.join_group(&group_2).await.unwrap();
268 |         info!("Member 2 join group 2");
269 |         member2_svr.join_group(&group_2).await.unwrap();
270 | 
271 |         info!("Member 1 join group 3");
272 |         member1_svr.join_group(&group_3).await.unwrap();
273 | 
274 |         info!("Checking group members after join");
275 |         assert_eq!(
276 |             member1_svr
277 |                 .client()
278 |                 .all_members(false)
279 |                 .await
280 |                 .unwrap()
281 |                 .0
282 |                 .len(),
283 |             3
284 |         );
285 |         assert_eq!(
286 |             member1_svr
287 |                 .client()
288 |                 .all_members(true)
289 |                 .await
290 |                 .unwrap()
291 |                 .0
292 |                 .len(),
293 |             3
294 |         );
295 | 
296 |         assert_eq!(
297 |             member1_svr
298 |                 .client()
299 |                 .group_members(&group_1, false)
300 |                 .await
301 |                 .unwrap()
302 |                 .unwrap()
303 |                 .0
304 |                 .len(),
305 |             3
306 |         );
307 |         assert_eq!(
308 |             member1_svr
309 |                 .client()
310 |                 .group_members(&group_1, true)
311 |                 .await
312 |                 .unwrap()
313 |                 .unwrap()
314 |                 .0
315 |                 .len(),
316 |             3
317 |         );
318 | 
319 |         assert_eq!(
320 |             member1_svr
321 |                 .client()
322 |                 .group_members(&group_2, false)
323 |                 .await
324 |                 .unwrap()
325 |                 .unwrap()
326 |                 .0
327 |                 .len(),
328 |             2
329 |         );
330 |         assert_eq!(
331 |             member1_svr
332 |                 .client()
333 |                 .group_members(&group_2, true)
334 |                 .await
335 |                 .unwrap()
336 |                 .unwrap()
337 |                 .0
338 |                 .len(),
339 |             2
340 |         );
341 | 
342 |         assert_eq!(
343 |             member1_svr
344 |                 .client()
345 |                 .group_members(&group_3, false)
346 |                 .await
347 |                 .unwrap()
348 |                 .unwrap()
349 |                 .0
350 |                 .len(),
351 |             1
352 |         );
353 |         assert_eq!(
354 |             member1_svr
355 |                 .client()
356 |                 .group_members(&group_3, true)
357 |                 .await
358 |                 .unwrap()
359 |                 .unwrap()
360 |                 .0
361 |                 .len(),
362 |             1
363 |         );
364 | 
365 |         member1_svr.close(); // close() only ends the heartbeat thread; the member stays registered but will be marked offline
366 | 
367 |         info!("############### Waiting for membership changes ###############");
368 |         for _ in 0..10 {
369 |             async_wait_secs().await; // 2 seconds per wait, ~20 seconds in total
370 |         }
371 |         info!("*************** Checking members ***************");
372 | 
373 |         assert_eq!(
374 |             member1_svr
375 |                 .client()
376 |                 .all_members(false)
377 |                 .await
378 |                 .unwrap()
379 |                 .0
380 |                 .len(),
381 |             3
382 |         );
383 |         assert_eq!(
384 |             member1_svr
385 |                 .client()
386 |                 .all_members(true)
387 |                 .await
388 |                 .unwrap()
389 |                 .0
390 |                 .len(),
391 |             2
392 |         );
393 | 
394 |         assert_eq!(
395 |             member1_svr
396 |                 .client()
397 |                 .group_members(&group_1, false)
398 |                 .await
399 |                 .unwrap()
400 |                 .unwrap()
401 |                 .0
402 |                 .len(),
403 |             3
404 |         );
405 |         assert_eq!(
406 |             member1_svr
407 |                 .client()
408 |                 .group_members(&group_1, true)
409 |                 .await
410 |                 .unwrap()
411 |                 .unwrap()
412 |                 .0
413 |                 .len(),
414 |             2
415 |         );
416 | 
417 |         assert_eq!(
418 |             member1_svr
419 |                 .client()
420 |                 .group_members(&group_2, false)
421 |                 .await
422 |                 .unwrap()
423 |                 .unwrap()
424 |                 .0
425 |                 .len(),
426 |             2
427 |         );
428 |         assert_eq!(
429 |             member1_svr
430 |                 .client()
431 |                 .group_members(&group_2, true)
432 |                 .await
433 |                 .unwrap()
434 |                 .unwrap()
435 |                 .0
436 |                 .len(),
437 |             1
438 |         );
439 | 
440 |         assert_eq!(
441 |             member1_svr
442 |                 .client()
443 |                 .group_members(&group_3, false)
444 |                 .await
445 |                 .unwrap()
446 |                 .unwrap()
447 |                 .0
448 |                 .len(),
449 |             1
450 |         );
451 |         assert_eq!(
452 |             member1_svr
453 |                 .client()
454 |                 .group_members(&group_3, true)
455 |                 .await
456 |                 .unwrap()
457 |                 .unwrap()
458 |                 .0
459 |                 .len(),
460 |             0
461 |         );
462 | 
463 |         member2_svr.leave().await.unwrap(); // leave() reports to the raft servers, which remove this member from the list
464 |         assert_eq!(
465 |             member1_svr
466 |                 .client()
467 |                 .all_members(false)
468 |                 .await
469 |                 .unwrap()
470 |                 .0
471 |                 .len(),
472 |             2
473 |         );
474 |         assert_eq!(
475 |             member1_svr
476 |                 .client()
477 |                 .all_members(true)
478 |                 .await
479 |                 .unwrap()
480 |                 .0
481 |                 .len(),
482 |             1
483 |         );
484 | 
485 |         assert_eq!(
486 |             member1_svr
487 |                 .client()
488 |                 .group_members(&group_1, false)
489 |                 .await
490 |                 .unwrap()
491 |                 .unwrap()
492 |                 .0
493 |                 .len(),
494 |             2
495 |         );
496 |         assert_eq!(
497 |             member1_svr
498 |                 .client()
499 |                 .group_members(&group_1, true)
500 |                 .await
501 |                 .unwrap()
502 |                 .unwrap()
503 |                 .0
504 |                 .len(),
505 |             1
506 |         );
507 | 
508 |         assert_eq!(
509 |             member1_svr
510 |                 .client()
511 |                 .group_members(&group_2, false)
512 |                 .await
513 |                 .unwrap()
514 |                 .unwrap()
515 |                 .0
516 |                 .len(),
517 |             1
518 |         );
519 |         assert_eq!(
520 |             member1_svr
521 |                 .client()
522 |                 .group_members(&group_2, true)
523 |                 .await
524 |                 .unwrap()
525 |                 .unwrap()
526 |                 .0
527 |                 .len(),
528 |             0
529 |         );
530 | 
531 |         assert_eq!(
532 |             member1_svr
533 |                 .client()
534 |                 .group_members(&group_3, false)
535 |                 .await
536 |                 .unwrap()
537 |                 .unwrap()
538 |                 .0
539 |                 .len(),
540 |             1
541 |         );
542 |         assert_eq!(
543 |             member1_svr
544 |                 .client()
545 |                 .group_members(&group_3, true)
546 |                 .await
547 |                 .unwrap()
548 |                 .unwrap()
549 |                 .0
550 |                 .len(),
551 |             0
552 |         );
553 | 
554 |         async_wait_secs().await;
555 | 
556 |         info!("=========== Checking event trigger ===========");
557 |         assert_eq!(any_member_joined_count.load(Ordering::Relaxed), 3);
558 |         assert_eq!(any_member_left_count.load(Ordering::Relaxed), 1);
559 |         assert_eq!(any_member_offline_count.load(Ordering::Relaxed), 1);
560 |         assert_eq!(any_member_online_count.load(Ordering::Relaxed), 0); // no member came back online from offline
561 |         assert!(group_leader_changed_count.load(Ordering::Relaxed) > 0); // exact count depends on hashing
562 |         assert_eq!(group_member_joined_count.load(Ordering::Relaxed), 3);
563 |         // assert_eq!(group_member_left_count.load(Ordering::Relaxed), 2); // timing-dependent; this assertion is unstable
564 |         assert_eq!(group_member_online_count.load(Ordering::Relaxed), 0);
565 |         assert_eq!(group_member_offline_count.load(Ordering::Relaxed), 1);
566 |     }
567 | }
568 | 
--------------------------------------------------------------------------------
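For readers skimming the test above, here is a minimal sketch (not part of the crate) of the subscription pattern it exercises: wrap a `RaftClient` in an `ObserverClient`, then register a callback that bumps a shared counter and returns a boxed ready future. The `watch_group_joins` helper is a hypothetical name invented for illustration; the `on_group_member_joined` call, the callback shape, and the double `unwrap()` are copied directly from the test.

```rust
use std::sync::atomic::{AtomicUsize, Ordering};
use std::sync::Arc;

use futures::prelude::*;

use crate::membership::client::ObserverClient;

// Hypothetical helper (illustration only): subscribe to join events for
// `group` and return a counter that the callback increments on each event.
async fn watch_group_joins(client: &ObserverClient, group: &String) -> Arc<AtomicUsize> {
    let joins = Arc::new(AtomicUsize::new(0));
    let joins_clone = joins.clone();
    client
        .on_group_member_joined(
            move |_| {
                joins_clone.fetch_add(1, Ordering::Relaxed);
                // Callbacks return a boxed future so they may do async work.
                future::ready(()).boxed()
            },
            group,
        )
        .await
        .unwrap() // two Result layers, unwrapped exactly as in the test above
        .unwrap();
    joins
}
```

The counter can then be polled after membership changes, as the test does with its `*_count` atomics once the heartbeat waits have elapsed.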