├── .gitignore ├── Cargo.toml ├── LICENSE ├── README.md ├── build.rs ├── examples └── rmqttraft-warp-memstore │ ├── Cargo.toml │ └── src │ └── main.rs ├── proto └── raft_service.proto └── src ├── error.rs ├── lib.rs ├── message.rs ├── raft.rs ├── raft_node.rs ├── raft_server.rs ├── raft_service.rs └── storage.rs /.gitignore: -------------------------------------------------------------------------------- 1 | # Default ignored files 2 | Cargo.lock 3 | /target 4 | /.git 5 | /.idea 6 | /.cargo 7 | /examples/rmqttraft-warp-memstore/target 8 | /examples/rmqttraft-warp-memstore/Cargo.lock 9 | /examples/rmqttraft-warp-memstore/.idea/.gitignore 10 | /examples/rmqttraft-warp-memstore/.idea/encodings.xml 11 | /examples/rmqttraft-warp-memstore/.idea/modules.xml 12 | /examples/rmqttraft-warp-memstore/.idea/rmqttraft-warp-memstore.iml 13 | /examples/rmqttraft-warp-memstore/.idea/vcs.xml 14 | -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "rmqtt-raft" 3 | version = "0.4.5" 4 | authors = ["rmqtt "] 5 | edition = "2021" 6 | license = "MIT OR Apache-2.0" 7 | repository = "https://github.com/rmqtt/rmqtt-raft" 8 | homepage = "https://github.com/rmqtt/rmqtt-raft" 9 | description = "rmqtt-raft - A raft framework, for regular people" 10 | keywords = ["raft", "distributed-systems"] 11 | categories = ["algorithms"] 12 | exclude = ["examples", ".gitignore", ".cargo/config"] 13 | 14 | [package.metadata.docs.rs] 15 | all-features = true 16 | 17 | [features] 18 | default = [] 19 | reuse = ["reuseport", "reuseaddr"] 20 | reuseport = ["socket2", "tokio-stream"] 21 | reuseaddr = ["socket2", "tokio-stream"] 22 | 23 | [dependencies] 24 | tikv-raft = { package = "raft", version = "0.7", features = [ 25 | "prost-codec", 26 | ], default-features = false } 27 | tokio = { version = "1", default-features = false, features = ["macros"] } 28 | socket2 = { 
version = "0.5", features = ["all"], optional = true } 29 | tokio-stream = { version = "0.1", features = ["net"], optional = true } 30 | tonic = "0.9" 31 | prost = "0.11" 32 | futures = "0.3" 33 | async-trait = "0.1" 34 | bincode = "1.3" 35 | serde = { version = "1.0", features = ["derive"] } 36 | log = { version = "0.4", features = ["std"] } 37 | slog = "2" 38 | thiserror = "1.0" 39 | dashmap = "6.1" 40 | ahash = "0.8" 41 | chrono = { version = "0.4", default-features = false, features = ["clock"] } 42 | anyhow = "1.0" 43 | once_cell = "1" 44 | bytestring = { version = "1.3", features = ["serde"] } 45 | 46 | [build-dependencies] 47 | tonic-build = "0.9" 48 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 rmqtt-rs 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # RmqttRaft - A raft framework, for regular people 2 | 3 | GitHub Release 4 | crates.io 5 | Documentation 6 | 7 | This is an attempt to create a layer on top of 8 | [tikv/raft-rs](https://github.com/tikv/raft-rs), that is easier to use and implement. This is not supposed to be the 9 | most featureful raft, but instead a convenient interface to get started quickly, and have a working raft in no time. 10 | 11 | The interface is strongly inspired by the one used by [canonical/raft](https://github.com/canonical/raft). 12 | 13 | ## Usage 14 | 15 | Add this to your `Cargo.toml`: 16 | 17 | ```toml 18 | [dependencies] 19 | rmqtt-raft = "0.4" 20 | ``` 21 | 22 | ## Getting started 23 | 24 | In order to "raft" storage, we need to implement the `Storage` trait for it. Bellow is an example with `HashStore`, 25 | which is a thread-safe wrapper around an 26 | `HashMap`: 27 | 28 | ```rust 29 | #[derive(Serialize, Deserialize)] 30 | pub enum Message { 31 | Insert { key: String, value: String }, 32 | Get { key: String }, 33 | } 34 | 35 | #[derive(Clone)] 36 | struct HashStore(Arc>>); 37 | 38 | impl HashStore { 39 | fn new() -> Self { 40 | Self(Arc::new(RwLock::new(HashMap::new()))) 41 | } 42 | fn get(&self, key: &str) -> Option { 43 | self.0.read().unwrap().get(key).cloned() 44 | } 45 | } 46 | 47 | #[async_trait] 48 | impl Store for HashStore { 49 | async fn apply(&mut self, message: &[u8]) -> RaftResult> { 50 | let message: Message = deserialize(message).unwrap(); 51 | let message: Vec = match message { 52 | Message::Insert { key, value } => { 53 | let mut db = self.0.write().unwrap(); 54 | let v = serialize(&value).unwrap(); 55 | db.insert(key, value); 56 | v 57 | } 58 | _ => Vec::new(), 59 | }; 60 | Ok(message) 61 | } 62 | 63 | async fn query(&self, query: &[u8]) -> RaftResult> { 64 | let query: 
Message = deserialize(query).unwrap(); 65 | let data: Vec = match query { 66 | Message::Get { key } => { 67 | if let Some(val) = self.get(&key) { 68 | serialize(&val).unwrap() 69 | } else { 70 | Vec::new() 71 | } 72 | } 73 | _ => Vec::new(), 74 | }; 75 | Ok(data) 76 | } 77 | 78 | async fn snapshot(&self) -> RaftResult> { 79 | Ok(serialize(&self.0.read().unwrap().clone())?) 80 | } 81 | 82 | async fn restore(&mut self, snapshot: &[u8]) -> RaftResult<()> { 83 | let new: HashMap = deserialize(snapshot).unwrap(); 84 | let mut db = self.0.write().unwrap(); 85 | let _ = std::mem::replace(&mut *db, new); 86 | Ok(()) 87 | } 88 | } 89 | 90 | ``` 91 | 92 | Only 4 methods need to be implemented for the Store: 93 | 94 | - `Store::apply`: applies a commited entry to the store. 95 | - `Store::query` query a entry from the store; 96 | - `Store::snapshot`: returns snapshot data for the store. 97 | - `Store::restore`: applies the snapshot passed as argument. 98 | 99 | ### running the raft 100 | 101 | ```rust 102 | #[tokio::main] 103 | async fn main() -> std::result::Result<(), Box> { 104 | let decorator = slog_term::TermDecorator::new().build(); 105 | let drain = slog_term::FullFormat::new(decorator).build().fuse(); 106 | let drain = slog_async::Async::new(drain).build().fuse(); 107 | let logger = slog::Logger::root(drain, slog_o!("version" => env!("CARGO_PKG_VERSION"))); 108 | 109 | // converts log to slog 110 | #[allow(clippy::let_unit_value)] 111 | let _log_guard = slog_stdlog::init().unwrap(); 112 | 113 | let options = Options::from_args(); 114 | let store = HashStore::new(); 115 | info!(logger, "peer_addrs: {:?}", options.peer_addrs); 116 | let cfg = Config { 117 | reuseaddr: true, 118 | reuseport: true, 119 | // grpc_message_size: 50 * 1024 * 1024, 120 | ..Default::default() 121 | }; 122 | let raft = Raft::new( 123 | options.raft_laddr.clone(), 124 | store.clone(), 125 | logger.clone(), 126 | cfg, 127 | )?; 128 | let leader_info = 
raft.find_leader_info(options.peer_addrs).await?; 129 | info!(logger, "leader_info: {:?}", leader_info); 130 | 131 | let mailbox = Arc::new(raft.mailbox()); 132 | let (raft_handle, mailbox) = match leader_info { 133 | Some((leader_id, leader_addr)) => { 134 | info!(logger, "running in follower mode"); 135 | let handle = tokio::spawn(raft.join( 136 | options.id, 137 | options.raft_laddr, 138 | Some(leader_id), 139 | leader_addr, 140 | )); 141 | (handle, mailbox) 142 | } 143 | None => { 144 | info!(logger, "running in leader mode"); 145 | let handle = tokio::spawn(raft.lead(options.id)); 146 | (handle, mailbox) 147 | } 148 | }; 149 | 150 | tokio::try_join!(raft_handle)?.0?; 151 | Ok(()) 152 | } 153 | ``` 154 | 155 | The `mailbox` gives you a way to interact with the raft, for sending a message, or leaving the cluster for example. 156 | 157 | ## Credit 158 | 159 | This work is based on [riteraft](https://github.com/ritelabs/riteraft), but more adjustments and improvements have been 160 | made to the code . 161 | 162 | ## License 163 | 164 | This library is licensed under either of: 165 | 166 | * MIT license [LICENSE-MIT](LICENSE-MIT) or http://opensource.org/licenses/MIT 167 | * Apache License 2.0 [LICENSE-APACHE](LICENSE-APACHE) or https://opensource.org/licenses/Apache-2.0 168 | 169 | at your option. 170 | 171 | -------------------------------------------------------------------------------- /build.rs: -------------------------------------------------------------------------------- 1 | fn main() -> Result<(), Box> { 2 | let out = std::env::var("OUT_DIR").unwrap(); 3 | println!("out: {}", out); 4 | let build_res = tonic_build::configure() 5 | .out_dir(out) 6 | .compile(&["raft_service.proto"], &["proto/"]); 7 | println!("compile proto result! 
{:?}", build_res); 8 | build_res.unwrap(); 9 | Ok(()) 10 | } 11 | -------------------------------------------------------------------------------- /examples/rmqttraft-warp-memstore/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "rmqttraft-warp-memstore" 3 | version = "0.1.0" 4 | authors = ["rmqtt-rs "] 5 | edition = "2018" 6 | 7 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html 8 | 9 | [dependencies] 10 | async-trait = "0.1.48" 11 | bincode = "1.3" 12 | log = "0.4" 13 | rmqtt-raft = { path = "../..", features = ["reuse"] } 14 | serde = "1.0" 15 | slog-async = "2" 16 | slog-term = "2" 17 | slog = "2" 18 | slog-stdlog = "4" 19 | slog-scope = "4" 20 | structopt = "0.3" 21 | tokio = { version = "1", features = ["rt-multi-thread", "macros"] , default-features = false} 22 | warp = "0.3" 23 | chrono = { version = "0.4", default-features = false, features = ["clock"] } 24 | once_cell = "1.7.2" 25 | -------------------------------------------------------------------------------- /examples/rmqttraft-warp-memstore/src/main.rs: -------------------------------------------------------------------------------- 1 | #[macro_use] 2 | extern crate slog; 3 | extern crate slog_async; 4 | extern crate slog_term; 5 | 6 | use async_trait::async_trait; 7 | use bincode::{deserialize, serialize}; 8 | use rmqtt_raft::{Config, Mailbox, Raft, Result as RaftResult, Store}; 9 | use serde::{Deserialize, Serialize}; 10 | use slog::info; 11 | use slog::Drain; 12 | use std::collections::HashMap; 13 | use std::convert::From; 14 | use std::convert::Infallible; 15 | use std::net::SocketAddr; 16 | use std::str::FromStr; 17 | use std::sync::{Arc, RwLock}; 18 | use structopt::StructOpt; 19 | use warp::{reply, Filter}; 20 | 21 | #[derive(Debug, StructOpt)] 22 | struct Options { 23 | #[structopt(long)] 24 | id: u64, 25 | #[structopt(long)] 26 | raft_laddr: String, 27 | #[structopt(name = 
"peer-addr", long)] 28 | peer_addrs: Vec, 29 | #[structopt(long)] 30 | web_server: Option, 31 | } 32 | 33 | #[derive(Serialize, Deserialize)] 34 | pub enum Message { 35 | Insert { key: String, value: String }, 36 | Get { key: String }, 37 | } 38 | 39 | #[derive(Clone)] 40 | struct HashStore(Arc>>); 41 | 42 | impl HashStore { 43 | fn new() -> Self { 44 | Self(Arc::new(RwLock::new(HashMap::new()))) 45 | } 46 | fn get(&self, key: &str) -> Option { 47 | self.0.read().unwrap().get(key).cloned() 48 | } 49 | } 50 | 51 | #[async_trait] 52 | impl Store for HashStore { 53 | async fn apply(&mut self, message: &[u8]) -> RaftResult> { 54 | let message: Message = deserialize(message).unwrap(); 55 | let message: Vec = match message { 56 | Message::Insert { key, value } => { 57 | let mut db = self.0.write().unwrap(); 58 | let v = serialize(&value).unwrap(); 59 | db.insert(key, value); 60 | v 61 | } 62 | _ => Vec::new(), 63 | }; 64 | Ok(message) 65 | } 66 | 67 | async fn query(&self, query: &[u8]) -> RaftResult> { 68 | let query: Message = deserialize(query).unwrap(); 69 | let data: Vec = match query { 70 | Message::Get { key } => { 71 | if let Some(val) = self.get(&key) { 72 | serialize(&val).unwrap() 73 | } else { 74 | Vec::new() 75 | } 76 | } 77 | _ => Vec::new(), 78 | }; 79 | Ok(data) 80 | } 81 | 82 | async fn snapshot(&self) -> RaftResult> { 83 | Ok(serialize(&self.0.read().unwrap().clone())?) 
84 | } 85 | 86 | async fn restore(&mut self, snapshot: &[u8]) -> RaftResult<()> { 87 | let new: HashMap = deserialize(snapshot).unwrap(); 88 | let mut db = self.0.write().unwrap(); 89 | let _ = std::mem::replace(&mut *db, new); 90 | Ok(()) 91 | } 92 | } 93 | 94 | fn with_mailbox( 95 | mailbox: Arc, 96 | ) -> impl Filter,), Error = Infallible> + Clone { 97 | warp::any().map(move || mailbox.clone()) 98 | } 99 | 100 | fn with_store(store: HashStore) -> impl Filter + Clone { 101 | warp::any().map(move || store.clone()) 102 | } 103 | 104 | async fn put( 105 | mailbox: Arc, 106 | key: String, 107 | value: String, 108 | ) -> Result { 109 | let message = Message::Insert { key, value }; 110 | let message = serialize(&message).unwrap(); 111 | let result = mailbox.send_proposal(message).await; 112 | match result { 113 | Ok(r) => { 114 | let result: String = deserialize(&r).unwrap(); 115 | Ok(reply::json(&result)) 116 | } 117 | Err(e) => Ok(reply::json(&format!("put error, {:?}", e))), 118 | } 119 | } 120 | 121 | async fn get(store: HashStore, key: String) -> Result { 122 | let response = store.get(&key); 123 | Ok(reply::json(&response)) 124 | } 125 | 126 | async fn leave(mailbox: Arc) -> Result { 127 | mailbox.leave().await.unwrap(); 128 | Ok(reply::json(&"OK".to_string())) 129 | } 130 | 131 | async fn status(mailbox: Arc) -> Result { 132 | match mailbox.status().await { 133 | Err(e) => Ok(reply::json(&e.to_string())), 134 | Ok(response) => Ok(reply::json(&response)), 135 | } 136 | } 137 | 138 | //target\release\rmqttraft-warp-memstore.exe --id 1 --raft-laddr "127.0.0.1:5001" --peer-addr "127.0.0.1:5002" --peer-addr "127.0.0.1:5003" --web-server "0.0.0.0:8081" 139 | //target\release\rmqttraft-warp-memstore.exe --id 2 --raft-laddr "127.0.0.1:5002" --peer-addr "127.0.0.1:5001" --peer-addr "127.0.0.1:5003" --web-server "0.0.0.0:8082" 140 | //target\release\rmqttraft-warp-memstore.exe --id 3 --raft-laddr "127.0.0.1:5003" --peer-addr "127.0.0.1:5001" --peer-addr "127.0.0.1:5002" 
--web-server "0.0.0.0:8083" 141 | //target\release\rmqttraft-warp-memstore.exe --id 4 --raft-laddr "127.0.0.1:5004" --peer-addr "127.0.0.1:5001" --peer-addr "127.0.0.1:5002" --web-server "0.0.0.0:8084" 142 | //target\release\rmqttraft-warp-memstore.exe --id 5 --raft-laddr "127.0.0.1:5005" --peer-addr "127.0.0.1:5001" --peer-addr "127.0.0.1:5002" --web-server "0.0.0.0:8085" 143 | 144 | //./target/release/rmqttraft-warp-memstore --id 1 --raft-laddr "127.0.0.1:5001" --peer-addr "127.0.0.1:5002" --peer-addr "127.0.0.1:5003" --web-server "0.0.0.0:8081" 145 | //./target/release/rmqttraft-warp-memstore --id 2 --raft-laddr "127.0.0.1:5002" --peer-addr "127.0.0.1:5001" --peer-addr "127.0.0.1:5003" --web-server "0.0.0.0:8082" 146 | //./target/release/rmqttraft-warp-memstore --id 3 --raft-laddr "127.0.0.1:5003" --peer-addr "127.0.0.1:5001" --peer-addr "127.0.0.1:5002" --web-server "0.0.0.0:8083" 147 | 148 | //target\debug\rmqttraft-warp-memstore.exe --id 1 --raft-laddr "127.0.0.1:5001" --peer-addr "127.0.0.1:5002" --peer-addr "127.0.0.1:5003" --web-server "0.0.0.0:8081" 149 | //target\debug\rmqttraft-warp-memstore.exe --id 2 --raft-laddr "127.0.0.1:5002" --peer-addr "127.0.0.1:5001" --peer-addr "127.0.0.1:5003" --web-server "0.0.0.0:8082" 150 | //target\debug\rmqttraft-warp-memstore.exe --id 3 --raft-laddr "127.0.0.1:5003" --peer-addr "127.0.0.1:5001" --peer-addr "127.0.0.1:5002" --web-server "0.0.0.0:8083" 151 | 152 | //./target/debug/rmqttraft-warp-memstore --id 1 --raft-laddr "127.0.0.1:5001" --peer-addr "127.0.0.1:5002" --peer-addr "127.0.0.1:5003" --web-server "0.0.0.0:8081" > out_1.log 2>&1 & 153 | //./target/debug/rmqttraft-warp-memstore --id 2 --raft-laddr "127.0.0.1:5002" --peer-addr "127.0.0.1:5001" --peer-addr "127.0.0.1:5003" --web-server "0.0.0.0:8082" > out_2.log 2>&1 & 154 | //./target/debug/rmqttraft-warp-memstore --id 3 --raft-laddr "127.0.0.1:5003" --peer-addr "127.0.0.1:5001" --peer-addr "127.0.0.1:5002" --web-server "0.0.0.0:8083" > out_3.log 2>&1 & 155 | 
156 | // wrk -c 100 -t4 -d60s -H "Connection: keep-alive" "http://127.0.0.1:8081/put/key1/val-1" 157 | // wrk -c 100 -t4 -d60s -H "Connection: keep-alive" "http://127.0.0.1:8082/put/key1/val-2" 158 | // wrk -c 100 -t6 -d60s -H "Connection: keep-alive" "http://127.0.0.1:8083/get/key1" 159 | 160 | // ab -n 5000 -c 20 "http://127.0.0.1:8081/put/key1/val-1" 161 | // ab -n 5000 -c 50 "http://127.0.0.1:8082/put/key1/val-2" 162 | // ab -n 5000 -c 20 "http://127.0.0.1:8083/get/key1" 163 | 164 | // ab -n 50000 -c 1000 "http://127.0.0.1:8081/put/key1/val-1" 165 | // ab -n 50000 -c 1000 "http://127.0.0.1:8082/put/key2/val-1" 166 | // ab -n 50000 -c 1000 "http://127.0.0.1:8083/put/key3/val-1" 167 | 168 | // http://127.0.0.1:8081/status 169 | 170 | #[tokio::main] 171 | async fn main() -> std::result::Result<(), Box> { 172 | let decorator = slog_term::TermDecorator::new().build(); 173 | let drain = slog_term::FullFormat::new(decorator).build().fuse(); 174 | let drain = slog_async::Async::new(drain).build().fuse(); 175 | let logger = slog::Logger::root(drain, slog_o!("version" => env!("CARGO_PKG_VERSION"))); 176 | 177 | // converts log to slog 178 | #[allow(clippy::let_unit_value)] 179 | let _log_guard = slog_stdlog::init().unwrap(); 180 | 181 | let options = Options::from_args(); 182 | let store = HashStore::new(); 183 | info!(logger, "peer_addrs: {:?}", options.peer_addrs); 184 | let cfg = Config { 185 | reuseaddr: true, 186 | reuseport: true, 187 | // grpc_message_size: 50 * 1024 * 1024, 188 | ..Default::default() 189 | }; 190 | let raft = Raft::new( 191 | options.raft_laddr.clone(), 192 | store.clone(), 193 | logger.clone(), 194 | cfg, 195 | )?; 196 | let leader_info = raft.find_leader_info(options.peer_addrs).await?; 197 | info!(logger, "leader_info: {:?}", leader_info); 198 | 199 | let mailbox = Arc::new(raft.mailbox()); 200 | let (raft_handle, mailbox) = match leader_info { 201 | Some((leader_id, leader_addr)) => { 202 | info!(logger, "running in follower mode"); 203 | let 
handle = tokio::spawn(raft.join( 204 | options.id, 205 | options.raft_laddr, 206 | Some(leader_id), 207 | leader_addr, 208 | )); 209 | (handle, mailbox) 210 | } 211 | None => { 212 | info!(logger, "running in leader mode"); 213 | let handle = tokio::spawn(raft.lead(options.id)); 214 | (handle, mailbox) 215 | } 216 | }; 217 | 218 | let put_kv = warp::get() 219 | .and(warp::path!("put" / String / String)) 220 | .and(with_mailbox(mailbox.clone())) 221 | .and_then(|key, value, mailbox: Arc| put(mailbox, key, value)); 222 | 223 | let get_kv = warp::get() 224 | .and(warp::path!("get" / String)) 225 | .and(with_store(store.clone())) 226 | .and_then(|key, store: HashStore| get(store, key)); 227 | 228 | let leave_kv = warp::get() 229 | .and(warp::path!("leave")) 230 | .and(with_mailbox(mailbox.clone())) 231 | .and_then(leave); 232 | 233 | let status = warp::get() 234 | .and(warp::path!("status")) 235 | .and(with_mailbox(mailbox.clone())) 236 | .and_then(status); 237 | 238 | let routes = put_kv.or(get_kv).or(leave_kv).or(status); 239 | 240 | if let Some(addr) = options.web_server { 241 | let _server = tokio::spawn(async move { 242 | warp::serve(routes) 243 | .run(SocketAddr::from_str(&addr).unwrap()) 244 | .await; 245 | }); 246 | } 247 | 248 | tokio::try_join!(raft_handle)?.0?; 249 | Ok(()) 250 | } 251 | -------------------------------------------------------------------------------- /proto/raft_service.proto: -------------------------------------------------------------------------------- 1 | syntax = "proto3"; 2 | package raftservice; 3 | 4 | //import "eraftpb.proto"; 5 | 6 | service RaftService { 7 | rpc RequestId(Empty) returns (IdRequestReponse) {} 8 | rpc ChangeConfig(ConfChange) returns (RaftResponse) {} 9 | rpc SendMessage(Message) returns (RaftResponse) {} 10 | rpc SendProposal(Proposal) returns (RaftResponse) {} 11 | rpc SendQuery(Query) returns (RaftResponse) {} 12 | } 13 | 14 | message ConfChange { 15 | bytes inner = 1; 16 | } 17 | 18 | message Message { 19 | 
bytes inner = 1; 20 | } 21 | 22 | enum ResultCode { 23 | Ok = 0; 24 | Error = 1; 25 | WrongLeader = 2; 26 | } 27 | 28 | message Proposal { 29 | bytes inner = 1; 30 | } 31 | 32 | message Query { 33 | bytes inner = 1; 34 | } 35 | 36 | message IdRequestReponse{ 37 | ResultCode code = 1; 38 | bytes data = 2; 39 | } 40 | 41 | message Empty {} 42 | 43 | message Entry { 44 | uint64 key = 1; 45 | string value = 2; 46 | } 47 | 48 | message RaftResponse { 49 | bytes inner = 2; 50 | } 51 | -------------------------------------------------------------------------------- /src/error.rs: -------------------------------------------------------------------------------- 1 | use thiserror::Error as ThisError; 2 | 3 | pub type Result = std::result::Result; 4 | 5 | #[derive(Debug, ThisError)] 6 | pub enum Error { 7 | #[error("raft error: `{0}`")] 8 | RaftError(#[from] tikv_raft::Error), 9 | #[error("Error joining the cluster")] 10 | JoinError, 11 | #[error("gprc error: `{0}`")] 12 | Grpc(#[from] tonic::transport::Error), 13 | #[error("error calling remote procedure: `{0}`")] 14 | RemoteCall(#[from] tonic::Status), 15 | #[error("io error: {0}")] 16 | Io(String), 17 | #[error("unexpected error, {0}")] 18 | Other(#[source] Box), 19 | #[error("unexpected error")] 20 | Unknown, 21 | #[error("leader does not exist")] 22 | LeaderNotExist, 23 | #[error("Not a Leader")] 24 | NotLeader, 25 | #[error("timeout")] 26 | Elapsed, 27 | #[error("{0}")] 28 | Msg(String), 29 | #[error("send error, {0}")] 30 | SendError(String), 31 | #[error("recv error, {0}")] 32 | RecvError(String), 33 | #[error("{0}")] 34 | Anyhow(anyhow::Error), 35 | } 36 | 37 | impl Error { 38 | pub fn boxed(self) -> Box { 39 | Box::new(self) 40 | } 41 | } 42 | 43 | impl From for Error { 44 | fn from(e: prost::DecodeError) -> Self { 45 | Self::Other(Box::new(e)) 46 | } 47 | } 48 | 49 | impl From for Error { 50 | fn from(e: prost::EncodeError) -> Self { 51 | Self::Other(Box::new(e)) 52 | } 53 | } 54 | 55 | impl From for Error { 56 | 
fn from(e: tokio::io::Error) -> Self { 57 | Self::Io(e.to_string()) 58 | } 59 | } 60 | 61 | impl From for Error { 62 | fn from(e: bincode::Error) -> Self { 63 | Self::Other(e) 64 | } 65 | } 66 | 67 | impl From for Error { 68 | fn from(e: std::string::FromUtf8Error) -> Self { 69 | Self::Other(Box::new(e)) 70 | } 71 | } 72 | 73 | impl From for Error { 74 | fn from(e: String) -> Self { 75 | Self::Msg(e) 76 | } 77 | } 78 | 79 | impl From<&str> for Error { 80 | fn from(e: &str) -> Self { 81 | Self::Msg(e.to_owned()) 82 | } 83 | } 84 | 85 | impl From for Error { 86 | #[inline] 87 | fn from(e: anyhow::Error) -> Self { 88 | Error::Anyhow(e) 89 | } 90 | } 91 | impl From for Error { 92 | #[inline] 93 | fn from(_: tokio::time::error::Elapsed) -> Self { 94 | Error::Elapsed 95 | } 96 | } 97 | -------------------------------------------------------------------------------- /src/lib.rs: -------------------------------------------------------------------------------- 1 | use std::time::Duration; 2 | 3 | // Re-exporting necessary types and modules for external use. 4 | pub use crate::error::{Error, Result}; 5 | pub use crate::message::Status; 6 | pub use crate::raft::{Mailbox, Raft, Store}; 7 | pub use tikv_raft::{ReadOnlyOption, StateRole}; 8 | 9 | // Importing modules for internal use. 10 | mod error; 11 | mod message; 12 | mod raft; 13 | mod raft_node; 14 | mod raft_server; 15 | mod raft_service; 16 | mod storage; 17 | 18 | /// Configuration options for the Raft-based system. 19 | #[derive(Clone)] 20 | pub struct Config { 21 | #[cfg(feature = "reuseaddr")] 22 | /// Whether to reuse local addresses. This option is enabled only if the `reuseaddr` feature is active. 23 | pub reuseaddr: bool, 24 | 25 | #[cfg(all( 26 | feature = "reuseport", 27 | not(any(target_os = "solaris", target_os = "illumos")) 28 | ))] 29 | /// Whether to reuse local ports. This option is enabled only if the `reuseport` feature is active 30 | /// and the target OS is not Solaris or Illumos. 
31 | pub reuseport: bool, 32 | 33 | /// The timeout duration for gRPC calls. 34 | pub grpc_timeout: Duration, 35 | 36 | /// The maximum number of concurrent gRPC calls. 37 | pub grpc_concurrency_limit: usize, 38 | 39 | /// The maximum size of gRPC messages in bytes. 40 | pub grpc_message_size: usize, 41 | 42 | /// The threshold for the gRPC circuit breaker. If the number of failed requests exceeds this threshold, 43 | /// the circuit breaker will trip. 44 | pub grpc_breaker_threshold: u64, 45 | 46 | /// The interval at which the gRPC circuit breaker will retry after tripping. 47 | pub grpc_breaker_retry_interval: Duration, 48 | 49 | /// The maximum number of proposals to batch together before processing. 50 | pub proposal_batch_size: usize, 51 | 52 | /// The timeout duration for collecting proposals into a batch. If this timeout is reached, 53 | /// the collected proposals will be processed regardless of the batch size. 54 | pub proposal_batch_timeout: Duration, 55 | 56 | /// The interval at which snapshots are generated. 57 | pub snapshot_interval: Duration, 58 | 59 | /// The interval at which heartbeat messages are sent to maintain leader election and cluster health. 60 | pub heartbeat: Duration, 61 | 62 | /// Configuration options for the Raft protocol. 63 | pub raft_cfg: tikv_raft::Config, 64 | } 65 | 66 | impl Default for Config { 67 | /// Provides default values for the `Config` struct. 
68 | fn default() -> Self { 69 | Self { 70 | #[cfg(feature = "reuseaddr")] 71 | reuseaddr: false, 72 | 73 | #[cfg(all( 74 | feature = "reuseport", 75 | not(any(target_os = "solaris", target_os = "illumos")) 76 | ))] 77 | reuseport: false, 78 | 79 | grpc_timeout: Duration::from_secs(6), 80 | grpc_concurrency_limit: 200, 81 | grpc_message_size: 50 * 1024 * 1024, // 50 MB 82 | grpc_breaker_threshold: 4, 83 | grpc_breaker_retry_interval: Duration::from_millis(2500), 84 | proposal_batch_size: 50, 85 | proposal_batch_timeout: Duration::from_millis(200), 86 | snapshot_interval: Duration::from_secs(600), 87 | heartbeat: Duration::from_millis(100), 88 | raft_cfg: tikv_raft::Config { 89 | election_tick: 10, 90 | heartbeat_tick: 5, 91 | check_quorum: true, 92 | pre_vote: true, 93 | ..Default::default() 94 | }, 95 | } 96 | } 97 | } 98 | -------------------------------------------------------------------------------- /src/message.rs: -------------------------------------------------------------------------------- 1 | use bytestring::ByteString; 2 | use std::collections::HashMap; 3 | use std::time::{Duration, Instant}; 4 | 5 | use serde::de::{self, Deserializer}; 6 | use serde::ser::Serializer; 7 | use serde::{Deserialize, Serialize}; 8 | 9 | use futures::channel::oneshot::Sender; 10 | use tikv_raft::eraftpb::{ConfChange, Message as RaftMessage}; 11 | use tikv_raft::StateRole; 12 | 13 | /// Enumeration representing various types of responses that can be sent back to clients. 14 | #[derive(Serialize, Deserialize, Debug)] 15 | pub enum RaftResponse { 16 | /// Indicates that the request was sent to the wrong leader. 17 | WrongLeader { 18 | leader_id: u64, 19 | leader_addr: Option, 20 | }, 21 | /// Indicates that a join request was successful. 22 | JoinSuccess { 23 | assigned_id: u64, 24 | peer_addrs: HashMap, 25 | }, 26 | /// Contains the leader ID in response to a request for ID. 27 | RequestId { leader_id: u64 }, 28 | /// Represents an error with a message. 
29 | Error(String), 30 | /// Contains arbitrary response data. 31 | Response { data: Vec }, 32 | /// Represents the status of the system. 33 | Status(Status), 34 | /// Represents a successful operation. 35 | Ok, 36 | } 37 | 38 | /// Enumeration representing different types of messages that can be sent within the system. 39 | #[allow(dead_code)] 40 | pub enum Message { 41 | /// A proposal message to be processed. 42 | Propose { 43 | proposal: Vec, 44 | chan: Sender, 45 | }, 46 | /// A query message to be processed. 47 | Query { 48 | query: Vec, 49 | chan: Sender, 50 | }, 51 | /// A configuration change message to be processed. 52 | ConfigChange { 53 | change: ConfChange, 54 | chan: Sender, 55 | }, 56 | /// A request for the leader's ID. 57 | RequestId { chan: Sender }, 58 | /// Report that a node is unreachable. 59 | ReportUnreachable { node_id: u64 }, 60 | /// A Raft message to be processed. 61 | Raft(Box), 62 | /// A request for the status of the system. 63 | Status { chan: Sender }, 64 | } 65 | 66 | #[derive(Serialize, Deserialize, Debug, Clone)] 67 | pub struct PeerState { 68 | pub addr: ByteString, 69 | pub available: bool, 70 | } 71 | 72 | /// Struct representing the status of the system. 73 | #[derive(Serialize, Deserialize, Debug, Clone)] 74 | pub struct Status { 75 | pub id: u64, 76 | pub leader_id: u64, 77 | pub uncommitteds: usize, 78 | pub merger_proposals: usize, 79 | pub sending_raft_messages: isize, 80 | pub peers: HashMap>, 81 | #[serde( 82 | serialize_with = "Status::serialize_role", 83 | deserialize_with = "Status::deserialize_role" 84 | )] 85 | pub role: StateRole, 86 | } 87 | 88 | impl Status { 89 | #[inline] 90 | pub fn available(&self) -> bool { 91 | if matches!(self.role, StateRole::Leader) { 92 | //Check if the number of available nodes is greater than or equal to half of the total nodes. 
93 | let (all_count, available_count) = self.get_count(); 94 | let available = available_count >= ((all_count / 2) + (all_count % 2)); 95 | log::debug!( 96 | "is Leader, all_count: {}, available_count: {} {}", 97 | all_count, 98 | available_count, 99 | available 100 | ); 101 | available 102 | } else if self.leader_id > 0 { 103 | //As long as a leader exists and is available, the system considers itself in a normal state. 104 | let available = self 105 | .peers 106 | .get(&self.leader_id) 107 | .and_then(|p| p.as_ref().map(|p| p.available)) 108 | .unwrap_or_default(); 109 | log::debug!("has Leader, available: {}", available); 110 | available 111 | } else { 112 | //If there is no Leader, it's still necessary to check whether the number of all other 113 | // available nodes is greater than or equal to half. 114 | let (all_count, available_count) = self.get_count(); 115 | let available = available_count >= ((all_count / 2) + (all_count % 2)); 116 | log::debug!( 117 | "no Leader, all_count: {}, available_count: {} {}", 118 | all_count, 119 | available_count, 120 | available 121 | ); 122 | available 123 | } 124 | } 125 | 126 | #[inline] 127 | fn get_count(&self) -> (usize, usize) { 128 | let available_count = self 129 | .peers 130 | .iter() 131 | .filter(|(_, p)| if let Some(p) = p { p.available } else { false }) 132 | .count(); 133 | if self.peers.contains_key(&self.id) { 134 | (self.peers.len() - 1, available_count - 1) 135 | } else { 136 | (self.peers.len(), available_count) 137 | } 138 | } 139 | 140 | /// Checks if the node has started. 141 | #[inline] 142 | pub fn is_started(&self) -> bool { 143 | self.leader_id > 0 144 | } 145 | 146 | /// Checks if this node is the leader. 
#[inline]
pub fn is_leader(&self) -> bool {
    self.leader_id == self.id && matches!(self.role, StateRole::Leader)
}

/// Decodes a `StateRole` from the `u8` discriminant written by `serialize_role`.
///
/// # Errors
/// Returns a deserialization error when the discriminant is outside `1..=4`.
#[inline]
pub fn deserialize_role<'de, D>(deserializer: D) -> std::result::Result<StateRole, D::Error>
where
    D: Deserializer<'de>,
{
    let role = match u8::deserialize(deserializer)? {
        1 => StateRole::Follower,
        2 => StateRole::Candidate,
        3 => StateRole::Leader,
        4 => StateRole::PreCandidate,
        // Fix: an out-of-range discriminant was previously reported as
        // `missing_field("role")`, which misdescribes the failure — the field
        // is present but holds an invalid value. Report it as such.
        other => {
            return Err(de::Error::invalid_value(
                de::Unexpected::Unsigned(u64::from(other)),
                &"a StateRole discriminant in 1..=4",
            ))
        }
    };
    Ok(role)
}

/// Encodes a `StateRole` as a stable `u8` discriminant (1..=4).
#[inline]
pub fn serialize_role<S>(role: &StateRole, s: S) -> std::result::Result<S::Ok, S::Error>
where
    S: Serializer,
{
    match role {
        StateRole::Follower => 1u8,
        StateRole::Candidate => 2u8,
        StateRole::Leader => 3u8,
        StateRole::PreCandidate => 4u8,
    }
    .serialize(s)
}
}

/// Enumeration for reply channels which could be single or multiple.
pub(crate) enum ReplyChan {
    /// Single reply channel with its enqueue timestamp.
    One((Sender<RaftResponse>, Instant)),
    /// Multiple reply channels with their enqueue timestamps.
    More(Vec<(Sender<RaftResponse>, Instant)>),
}

/// Enumeration for proposals which could be a single proposal or multiple proposals.
#[derive(Serialize, Deserialize)]
pub(crate) enum Proposals {
    /// A single proposal payload.
    One(Vec<u8>),
    /// A batch of proposal payloads.
    More(Vec<Vec<u8>>),
}

/// A struct to manage proposal batching and sending.
pub(crate) struct Merger {
    // Pending proposal payloads, in arrival order; kept index-aligned with `chans`.
    proposals: Vec<Vec<u8>>,
    // Reply channel plus arrival time for each pending proposal.
    chans: Vec<(Sender<RaftResponse>, Instant)>,
    // Millisecond timestamp at which the current batch began collecting.
    start_collection_time: i64,
    proposal_batch_size: usize,
    proposal_batch_timeout: i64,
}

impl Merger {
    /// Creates a new `Merger` instance with the specified batch size and timeout.
    ///
    /// # Parameters
    /// - `proposal_batch_size`: The maximum number of proposals to include in a batch.
    /// - `proposal_batch_timeout`: The timeout duration for collecting proposals.
    ///
    /// # Returns
    /// A new `Merger` instance.
    pub fn new(proposal_batch_size: usize, proposal_batch_timeout: Duration) -> Self {
        Self {
            proposals: Vec::new(),
            chans: Vec::new(),
            // Starts at 0 (epoch), so the first `take` call can fire on the
            // timeout path immediately; each drained batch resets this clock.
            start_collection_time: 0,
            proposal_batch_size,
            proposal_batch_timeout: proposal_batch_timeout.as_millis() as i64,
        }
    }

    /// Adds a new proposal and its corresponding reply channel to the merger.
    ///
    /// # Parameters
    /// - `proposal`: The proposal data to be added.
    /// - `chan`: The reply channel for the proposal.
    #[inline]
    pub fn add(&mut self, proposal: Vec<u8>, chan: Sender<RaftResponse>) {
        self.proposals.push(proposal);
        self.chans.push((chan, Instant::now()));
    }

    /// Returns the number of proposals currently held by the merger.
    ///
    /// # Returns
    /// The number of proposals.
    #[inline]
    pub fn len(&self) -> usize {
        self.proposals.len()
    }

    /// Retrieves a batch of proposals and their corresponding reply channels if the batch size or timeout criteria are met.
    ///
    /// # Returns
    /// An `Option` containing the proposals and reply channels, or `None` if no batch is ready.
251 | #[inline] 252 | pub fn take(&mut self) -> Option<(Proposals, ReplyChan)> { 253 | let max = self.proposal_batch_size; 254 | let len = self.len(); 255 | let len = if len > max { max } else { len }; 256 | if len > 0 && (len == max || self.timeout()) { 257 | let data = if len == 1 { 258 | match (self.proposals.pop(), self.chans.pop()) { 259 | (Some(proposal), Some(chan)) => { 260 | Some((Proposals::One(proposal), ReplyChan::One(chan))) 261 | } 262 | _ => unreachable!(), 263 | } 264 | } else { 265 | let mut proposals = self.proposals.drain(0..len).collect::>(); 266 | let mut chans = self.chans.drain(0..len).collect::>(); 267 | proposals.reverse(); 268 | chans.reverse(); 269 | Some((Proposals::More(proposals), ReplyChan::More(chans))) 270 | }; 271 | self.start_collection_time = chrono::Local::now().timestamp_millis(); 272 | data 273 | } else { 274 | None 275 | } 276 | } 277 | 278 | #[inline] 279 | fn timeout(&self) -> bool { 280 | chrono::Local::now().timestamp_millis() 281 | > (self.start_collection_time + self.proposal_batch_timeout) 282 | } 283 | } 284 | 285 | #[tokio::test] 286 | async fn test_merger() -> std::result::Result<(), Box> { 287 | let mut merger = Merger::new(50, Duration::from_millis(200)); 288 | use futures::channel::oneshot::channel; 289 | use std::time::Duration; 290 | 291 | let add = |merger: &mut Merger| { 292 | let (tx, rx) = channel(); 293 | merger.add(vec![1, 2, 3], tx); 294 | rx 295 | }; 296 | 297 | use std::sync::atomic::{AtomicI64, Ordering}; 298 | use std::sync::Arc; 299 | const MAX: i64 = 111; 300 | let count = Arc::new(AtomicI64::new(0)); 301 | let mut futs = Vec::new(); 302 | for _ in 0..MAX { 303 | let rx = add(&mut merger); 304 | let count1 = count.clone(); 305 | let fut = async move { 306 | let r = tokio::time::timeout(Duration::from_secs(3), rx).await; 307 | match r { 308 | Ok(_) => {} 309 | Err(_) => { 310 | println!("timeout ..."); 311 | } 312 | } 313 | count1.fetch_add(1, Ordering::SeqCst); 314 | }; 315 | 316 | futs.push(fut); 
317 | } 318 | 319 | let sends = async { 320 | loop { 321 | if let Some((_data, chan)) = merger.take() { 322 | match chan { 323 | ReplyChan::One((tx, _)) => { 324 | let _ = tx.send(RaftResponse::Ok); 325 | } 326 | ReplyChan::More(txs) => { 327 | for (tx, _) in txs { 328 | let _ = tx.send(RaftResponse::Ok); 329 | } 330 | } 331 | } 332 | } 333 | tokio::time::sleep(Duration::from_millis(100)).await; 334 | if merger.len() == 0 { 335 | break; 336 | } 337 | } 338 | }; 339 | 340 | let count_p = count.clone(); 341 | let count_print = async move { 342 | loop { 343 | tokio::time::sleep(Duration::from_secs(2)).await; 344 | println!("count_p: {}", count_p.load(Ordering::SeqCst)); 345 | if count_p.load(Ordering::SeqCst) >= MAX { 346 | break; 347 | } 348 | } 349 | }; 350 | println!("futs: {}", futs.len()); 351 | futures::future::join3(futures::future::join_all(futs), sends, count_print).await; 352 | 353 | Ok(()) 354 | } 355 | -------------------------------------------------------------------------------- /src/raft.rs: -------------------------------------------------------------------------------- 1 | use std::net::{SocketAddr, ToSocketAddrs}; 2 | use std::sync::Arc; 3 | use std::time::Duration; 4 | 5 | use async_trait::async_trait; 6 | use bincode::{deserialize, serialize}; 7 | use futures::channel::{mpsc, oneshot}; 8 | use futures::future::FutureExt; 9 | use futures::SinkExt; 10 | use log::{debug, info, warn}; 11 | use prost::Message as _; 12 | use tikv_raft::eraftpb::{ConfChange, ConfChangeType}; 13 | use tokio::time::timeout; 14 | use tonic::Request; 15 | 16 | use crate::error::{Error, Result}; 17 | use crate::message::{Message, RaftResponse, Status}; 18 | use crate::raft_node::{Peer, RaftNode}; 19 | use crate::raft_server::RaftServer; 20 | use crate::raft_service::connect; 21 | use crate::raft_service::{ConfChange as RiteraftConfChange, Empty, ResultCode}; 22 | use crate::Config; 23 | 24 | type DashMap = dashmap::DashMap; 25 | 26 | #[async_trait] 27 | pub trait Store { 28 | 
async fn apply(&mut self, message: &[u8]) -> Result>; 29 | async fn query(&self, query: &[u8]) -> Result>; 30 | async fn snapshot(&self) -> Result>; 31 | async fn restore(&mut self, snapshot: &[u8]) -> Result<()>; 32 | } 33 | 34 | struct ProposalSender { 35 | proposal: Vec, 36 | client: Peer, 37 | } 38 | 39 | impl ProposalSender { 40 | async fn send(self) -> Result { 41 | match self.client.send_proposal(self.proposal).await { 42 | Ok(reply) => { 43 | let raft_response: RaftResponse = deserialize(&reply)?; 44 | Ok(raft_response) 45 | } 46 | Err(e) => { 47 | warn!("error sending proposal {:?}", e); 48 | Err(e) 49 | } 50 | } 51 | } 52 | } 53 | 54 | /// A mailbox to send messages to a running raft node. 55 | #[derive(Clone)] 56 | pub struct Mailbox { 57 | peers: Arc>, 58 | sender: mpsc::Sender, 59 | grpc_timeout: Duration, 60 | grpc_concurrency_limit: usize, 61 | grpc_message_size: usize, 62 | grpc_breaker_threshold: u64, 63 | grpc_breaker_retry_interval: i64, 64 | } 65 | 66 | impl Mailbox { 67 | /// Retrieves a list of peers with their IDs. 68 | /// This method returns a vector containing tuples of peer IDs and their respective `Peer` objects. 69 | /// It iterates over the internal `peers` map and collects the IDs and cloned `Peer` instances. 
70 | #[inline] 71 | pub fn pears(&self) -> Vec<(u64, Peer)> { 72 | self.peers 73 | .iter() 74 | .map(|p| { 75 | let (id, _) = p.key(); 76 | (*id, p.value().clone()) 77 | }) 78 | .collect::>() 79 | } 80 | 81 | #[inline] 82 | async fn peer(&self, leader_id: u64, leader_addr: String) -> Peer { 83 | self.peers 84 | .entry((leader_id, leader_addr.clone())) 85 | .or_insert_with(|| { 86 | Peer::new( 87 | leader_addr, 88 | self.grpc_timeout, 89 | self.grpc_concurrency_limit, 90 | self.grpc_message_size, 91 | self.grpc_breaker_threshold, 92 | self.grpc_breaker_retry_interval, 93 | ) 94 | }) 95 | .clone() 96 | } 97 | 98 | #[inline] 99 | async fn send_to_leader( 100 | &self, 101 | proposal: Vec, 102 | leader_id: u64, 103 | leader_addr: String, 104 | ) -> Result { 105 | let peer = self.peer(leader_id, leader_addr).await; 106 | let proposal_sender = ProposalSender { 107 | proposal, 108 | client: peer, 109 | }; 110 | proposal_sender.send().await 111 | } 112 | 113 | /// Sends a proposal to the leader node. 114 | /// This method first attempts to send the proposal to the local node if it is the leader. 115 | /// If the node is not the leader, it retrieves the leader's address and sends the proposal to the leader node. 116 | /// If the proposal is successfully handled, the method returns a `RaftResponse::Response` with the resulting data. 117 | #[inline] 118 | pub async fn send_proposal(&self, message: Vec) -> Result> { 119 | match self.get_leader_info().await? { 120 | (true, _, _) => { 121 | debug!("this node is leader"); 122 | let (tx, rx) = oneshot::channel(); 123 | let proposal = Message::Propose { 124 | proposal: message.clone(), 125 | chan: tx, 126 | }; 127 | let mut sender = self.sender.clone(); 128 | sender 129 | .send(proposal) 130 | .await //.try_send(proposal) 131 | .map_err(|e| Error::SendError(e.to_string()))?; 132 | let reply = timeout(self.grpc_timeout, rx).await; 133 | let reply = reply 134 | .map_err(|e| Error::RecvError(e.to_string()))? 
135 | .map_err(|e| Error::RecvError(e.to_string()))?; 136 | match reply { 137 | RaftResponse::Response { data } => return Ok(data), 138 | _ => { 139 | warn!("Recv other raft response: {:?}", reply); 140 | return Err(Error::Unknown); 141 | } 142 | } 143 | } 144 | (false, target_leader_id, target_leader_addr) => { 145 | debug!( 146 | "This node not is Leader, leader_id: {:?}, leader_addr: {:?}", 147 | target_leader_id, target_leader_addr 148 | ); 149 | if let Some(target_leader_addr) = target_leader_addr { 150 | if target_leader_id != 0 { 151 | return match self 152 | .send_to_leader(message, target_leader_id, target_leader_addr.clone()) 153 | .await? 154 | { 155 | RaftResponse::Response { data } => return Ok(data), 156 | RaftResponse::WrongLeader { 157 | leader_id, 158 | leader_addr, 159 | } => { 160 | warn!("The target node is not the Leader, target_leader_id: {}, target_leader_addr: {:?}, actual_leader_id: {}, actual_leader_addr: {:?}", 161 | target_leader_id, target_leader_addr, leader_id, leader_addr); 162 | return Err(Error::NotLeader); 163 | } 164 | RaftResponse::Error(e) => Err(Error::from(e)), 165 | _ => { 166 | warn!("Recv other raft response, target_leader_id: {}, target_leader_addr: {:?}", target_leader_id, target_leader_addr); 167 | return Err(Error::Unknown); 168 | } 169 | }; 170 | } 171 | } 172 | } 173 | } 174 | Err(Error::LeaderNotExist) 175 | } 176 | 177 | /// Deprecated method to send a message, internally calls `send_proposal`. 178 | #[inline] 179 | #[deprecated] 180 | pub async fn send(&self, message: Vec) -> Result> { 181 | self.send_proposal(message).await 182 | } 183 | 184 | /// Sends a query to the Raft node and returns the response data. 185 | /// It sends a `Message::Query` containing the query bytes and waits for a response. 186 | /// On success, it returns the data wrapped in `RaftResponse::Response`. 
187 | #[inline] 188 | pub async fn query(&self, query: Vec) -> Result> { 189 | let (tx, rx) = oneshot::channel(); 190 | let mut sender = self.sender.clone(); 191 | match sender.try_send(Message::Query { query, chan: tx }) { 192 | Ok(()) => match timeout(self.grpc_timeout, rx).await { 193 | Ok(Ok(RaftResponse::Response { data })) => Ok(data), 194 | Ok(Ok(RaftResponse::Error(e))) => Err(Error::from(e)), 195 | _ => Err(Error::Unknown), 196 | }, 197 | Err(e) => Err(Error::SendError(e.to_string())), 198 | } 199 | } 200 | 201 | /// Sends a request to leave the Raft cluster. 202 | /// It initiates a `ConfigChange` to remove the node from the cluster and waits for a response. 203 | #[inline] 204 | pub async fn leave(&self) -> Result<()> { 205 | let mut change = ConfChange::default(); 206 | // set node id to 0, the node will set it to self when it receives it. 207 | change.set_node_id(0); 208 | change.set_change_type(ConfChangeType::RemoveNode); 209 | let mut sender = self.sender.clone(); 210 | let (chan, rx) = oneshot::channel(); 211 | match sender.send(Message::ConfigChange { change, chan }).await { 212 | Ok(()) => match rx.await { 213 | Ok(RaftResponse::Ok) => Ok(()), 214 | Ok(RaftResponse::Error(e)) => Err(Error::from(e)), 215 | _ => Err(Error::Unknown), 216 | }, 217 | Err(e) => Err(Error::SendError(e.to_string())), 218 | } 219 | } 220 | 221 | /// Retrieves the current status of the Raft node. 222 | /// Sends a `Message::Status` request and waits for a `RaftResponse::Status` reply, which contains the node's status. 
#[inline]
pub async fn status(&self) -> Result<Status> {
    let (tx, rx) = oneshot::channel();
    let mut sender = self.sender.clone();
    // Guard clause: surface mailbox failures before waiting on the reply.
    if let Err(e) = sender.send(Message::Status { chan: tx }).await {
        return Err(Error::SendError(e.to_string()));
    }
    match timeout(self.grpc_timeout, rx).await {
        Ok(Ok(RaftResponse::Status(status))) => Ok(status),
        Ok(Ok(RaftResponse::Error(e))) => Err(Error::from(e)),
        _ => Err(Error::Unknown),
    }
}

/// Retrieves leader information, including whether the current node is the leader, the leader ID, and its address.
/// This method sends a `Message::RequestId` and waits for a response with the leader's ID and address.
#[inline]
async fn get_leader_info(&self) -> Result<(bool, u64, Option<String>)> {
    let (tx, rx) = oneshot::channel();
    let mut sender = self.sender.clone();
    if let Err(e) = sender.send(Message::RequestId { chan: tx }).await {
        return Err(Error::SendError(e.to_string()));
    }
    match timeout(self.grpc_timeout, rx).await {
        // `RequestId` answered locally means this node is the leader itself.
        Ok(Ok(RaftResponse::RequestId { leader_id })) => Ok((true, leader_id, None)),
        Ok(Ok(RaftResponse::WrongLeader {
            leader_id,
            leader_addr,
        })) => Ok((false, leader_id, leader_addr)),
        Ok(Ok(RaftResponse::Error(e))) => Err(Error::from(e)),
        _ => Err(Error::Unknown),
    }
}
}

pub struct Raft<S: Store + 'static> {
    store: S,
    tx: mpsc::Sender<Message>,
    rx: mpsc::Receiver<Message>,
    laddr: SocketAddr,
    logger: slog::Logger,
    cfg: Arc<Config>,
}

impl<S: Store + Send + Sync + 'static> Raft<S> {
    /// Creates a new Raft node with the provided address, store, logger, and configuration.
    /// The node communicates with other peers using a mailbox.
    pub fn new<A: ToSocketAddrs>(
        laddr: A,
        store: S,
        logger: slog::Logger,
        cfg: Config,
    ) -> Result<Self> {
        let laddr = laddr
            .to_socket_addrs()?
            .next()
            .ok_or_else(|| Error::from("None"))?;
        let (tx, rx) = mpsc::channel(100_000);
        Ok(Self {
            store,
            tx,
            rx,
            laddr,
            logger,
            cfg: Arc::new(cfg),
        })
    }

    /// Returns a `Mailbox` for the Raft node, which facilitates communication with peers.
    pub fn mailbox(&self) -> Mailbox {
        Mailbox {
            peers: Arc::new(DashMap::default()),
            sender: self.tx.clone(),
            grpc_timeout: self.cfg.grpc_timeout,
            grpc_concurrency_limit: self.cfg.grpc_concurrency_limit,
            grpc_message_size: self.cfg.grpc_message_size,
            grpc_breaker_threshold: self.cfg.grpc_breaker_threshold,
            grpc_breaker_retry_interval: self.cfg.grpc_breaker_retry_interval.as_millis() as i64,
        }
    }

    /// Finds leader information by querying a list of peer addresses.
    /// Returns the leader ID and its address if found.
    pub async fn find_leader_info(&self, peer_addrs: Vec<String>) -> Result<Option<(u64, String)>> {
        // Race all peers; the first successful answer wins.
        let futs = peer_addrs
            .into_iter()
            .map(|addr| self.request_leader(addr).boxed())
            .collect::<Vec<_>>();

        let (leader_id, leader_addr) = match futures::future::select_ok(futs).await {
            Ok((Some((leader_id, leader_addr)), _)) => (leader_id, leader_addr),
            Ok((None, _)) => return Err(Error::LeaderNotExist),
            Err(_e) => return Ok(None),
        };

        if leader_id == 0 {
            Ok(None)
        } else {
            Ok(Some((leader_id, leader_addr)))
        }
    }

    /// Requests the leader information from a specific peer.
    /// Sends a `Message::RequestId` to the peer and waits for the response.
335 | async fn request_leader(&self, peer_addr: String) -> Result> { 336 | let (leader_id, leader_addr): (u64, String) = { 337 | let mut client = connect( 338 | &peer_addr, 339 | 1, 340 | self.cfg.grpc_message_size, 341 | self.cfg.grpc_timeout, 342 | ) 343 | .await?; 344 | let response = client 345 | .request_id(Request::new(Empty::default())) 346 | .await? 347 | .into_inner(); 348 | match response.code() { 349 | ResultCode::WrongLeader => { 350 | let (leader_id, addr): (u64, Option) = deserialize(&response.data)?; 351 | if let Some(addr) = addr { 352 | (leader_id, addr) 353 | } else { 354 | return Ok(None); 355 | } 356 | } 357 | ResultCode::Ok => (deserialize(&response.data)?, peer_addr), 358 | ResultCode::Error => return Ok(None), 359 | } 360 | }; 361 | Ok(Some((leader_id, leader_addr))) 362 | } 363 | 364 | /// The `lead` function transitions the current node to the leader role in a Raft cluster. 365 | /// It initializes the leader node and runs both the Raft server and the node concurrently. 366 | /// The function will return once the server or node experiences an error, or when the leader 367 | /// role is relinquished. 368 | /// 369 | /// # Arguments 370 | /// 371 | /// * `node_id` - The unique identifier for the node. 372 | /// 373 | /// # Returns 374 | /// 375 | /// A `Result<()>` indicating success or failure during the process. 
376 | pub async fn lead(self, node_id: u64) -> Result<()> { 377 | let node = RaftNode::new_leader( 378 | self.rx, 379 | self.tx.clone(), 380 | node_id, 381 | self.store, 382 | &self.logger, 383 | self.cfg.clone(), 384 | )?; 385 | 386 | let server = RaftServer::new(self.tx, self.laddr, self.cfg.clone()); 387 | let server_handle = async { 388 | if let Err(e) = server.run().await { 389 | warn!("raft server run error: {:?}", e); 390 | Err(e) 391 | } else { 392 | Ok(()) 393 | } 394 | }; 395 | let node_handle = async { 396 | if let Err(e) = node.run().await { 397 | warn!("node run error: {:?}", e); 398 | Err(e) 399 | } else { 400 | Ok(()) 401 | } 402 | }; 403 | 404 | tokio::try_join!(server_handle, node_handle)?; 405 | info!("leaving leader node"); 406 | 407 | Ok(()) 408 | } 409 | 410 | /// The `join` function is used to make the current node join an existing Raft cluster. 411 | /// It tries to discover the current leader, communicates with the leader to join the cluster, 412 | /// and configures the node as a follower. 413 | /// 414 | /// # Arguments 415 | /// 416 | /// * `node_id` - The unique identifier for the current node. 417 | /// * `node_addr` - The address of the current node. 418 | /// * `leader_id` - The optional leader node's identifier (if already known). 419 | /// * `leader_addr` - The address of the leader node. 420 | /// 421 | /// # Returns 422 | /// 423 | /// A `Result<()>` indicating success or failure during the joining process. 424 | pub async fn join( 425 | self, 426 | node_id: u64, 427 | node_addr: String, 428 | leader_id: Option, 429 | leader_addr: String, 430 | ) -> Result<()> { 431 | // 1. try to discover the leader and obtain an id from it, if leader_id is None. 432 | info!("attempting to join peer cluster at {}", leader_addr); 433 | let (leader_id, leader_addr): (u64, String) = if let Some(leader_id) = leader_id { 434 | (leader_id, leader_addr) 435 | } else { 436 | self.request_leader(leader_addr) 437 | .await? 438 | .ok_or(Error::JoinError)? 
439 | }; 440 | 441 | // 2. run server and node to prepare for joining 442 | let mut node = RaftNode::new_follower( 443 | self.rx, 444 | self.tx.clone(), 445 | node_id, 446 | self.store, 447 | &self.logger, 448 | self.cfg.clone(), 449 | )?; 450 | let peer = node.add_peer(&leader_addr, leader_id); 451 | let mut client = peer.client().await?; 452 | let server = RaftServer::new(self.tx, self.laddr, self.cfg.clone()); 453 | let server_handle = async { 454 | if let Err(e) = server.run().await { 455 | warn!("raft server run error: {:?}", e); 456 | Err(e) 457 | } else { 458 | Ok(()) 459 | } 460 | }; 461 | 462 | //try remove from the cluster 463 | let mut change_remove = ConfChange::default(); 464 | change_remove.set_node_id(node_id); 465 | change_remove.set_change_type(ConfChangeType::RemoveNode); 466 | let change_remove = RiteraftConfChange { 467 | inner: ConfChange::encode_to_vec(&change_remove), 468 | }; 469 | 470 | let raft_response = client 471 | .change_config(Request::new(change_remove)) 472 | .await? 473 | .into_inner(); 474 | 475 | info!( 476 | "change_remove raft_response: {:?}", 477 | deserialize::(&raft_response.inner)? 478 | ); 479 | 480 | // 3. Join the cluster 481 | // TODO: handle wrong leader 482 | let mut change = ConfChange::default(); 483 | change.set_node_id(node_id); 484 | change.set_change_type(ConfChangeType::AddNode); 485 | change.set_context(serialize(&node_addr)?); 486 | // change.set_context(serialize(&node_addr)?); 487 | 488 | let change = RiteraftConfChange { 489 | inner: ConfChange::encode_to_vec(&change), 490 | }; 491 | let raft_response = client 492 | .change_config(Request::new(change)) 493 | .await? 494 | .into_inner(); 495 | if let RaftResponse::JoinSuccess { 496 | assigned_id, 497 | peer_addrs, 498 | } = deserialize(&raft_response.inner)? 
499 | { 500 | info!( 501 | "change_config response.assigned_id: {:?}, peer_addrs: {:?}", 502 | assigned_id, peer_addrs 503 | ); 504 | for (id, addr) in peer_addrs { 505 | if id != assigned_id { 506 | node.add_peer(&addr, id); 507 | } 508 | } 509 | } else { 510 | return Err(Error::JoinError); 511 | } 512 | 513 | let node_handle = async { 514 | if let Err(e) = node.run().await { 515 | warn!("node run error: {:?}", e); 516 | Err(e) 517 | } else { 518 | Ok(()) 519 | } 520 | }; 521 | let _ = tokio::try_join!(server_handle, node_handle)?; 522 | info!("leaving follower node"); 523 | Ok(()) 524 | } 525 | } 526 | -------------------------------------------------------------------------------- /src/raft_node.rs: -------------------------------------------------------------------------------- 1 | use std::collections::HashMap; 2 | use std::ops::{Deref, DerefMut}; 3 | use std::sync::atomic::{AtomicI64, AtomicIsize, AtomicU64, Ordering}; 4 | use std::sync::Arc; 5 | use std::time::{Duration, Instant}; 6 | 7 | use bincode::{deserialize, serialize}; 8 | use bytestring::ByteString; 9 | use futures::channel::{mpsc, oneshot}; 10 | use futures::SinkExt; 11 | use futures::StreamExt; 12 | use log::*; 13 | use prost::Message as _; 14 | use tikv_raft::eraftpb::{ConfChange, ConfChangeType, Entry, EntryType, Message as RaftMessage}; 15 | use tikv_raft::{prelude::*, raw_node::RawNode, Config as RaftConfig}; 16 | use tokio::sync::RwLock; 17 | use tokio::time::timeout; 18 | use tonic::Request; 19 | 20 | use crate::error::{Error, Result}; 21 | use crate::message::{Merger, Message, PeerState, Proposals, RaftResponse, ReplyChan, Status}; 22 | use crate::raft::Store; 23 | use crate::raft_service::raft_service_client::RaftServiceClient; 24 | use crate::raft_service::{connect, Message as RraftMessage, Proposal as RraftProposal, Query}; 25 | use crate::storage::{LogStore, MemStorage}; 26 | use crate::Config; 27 | 28 | pub type RaftGrpcClient = RaftServiceClient; 29 | 30 | struct MessageSender { 31 | 
message: RaftMessage, 32 | client: Peer, 33 | client_id: u64, 34 | chan: mpsc::Sender, 35 | max_retries: usize, 36 | timeout: Duration, 37 | sending_raft_messages: Arc, 38 | } 39 | 40 | impl MessageSender { 41 | /// attempt to send a message MessageSender::max_retries times at MessageSender::timeout 42 | /// inteval. 43 | async fn send(mut self) { 44 | let mut current_retry = 0usize; 45 | loop { 46 | match self.client.send_message(&self.message).await { 47 | Ok(_) => { 48 | self.sending_raft_messages.fetch_sub(1, Ordering::SeqCst); 49 | return; 50 | } 51 | Err(e) => { 52 | if current_retry < self.max_retries { 53 | current_retry += 1; 54 | tokio::time::sleep(self.timeout).await; 55 | } else { 56 | warn!( 57 | "error sending message after {}/{} retries: {:?}, target addr: {:?}", 58 | current_retry, self.max_retries, e, self.client.addr 59 | ); 60 | if let Err(e) = self 61 | .chan 62 | .send(Message::ReportUnreachable { 63 | node_id: self.client_id, 64 | }) 65 | .await 66 | { 67 | warn!( 68 | "error ReportUnreachable after {}/{} retries: {:?}, target addr: {:?}", 69 | current_retry, self.max_retries, e, self.client.addr 70 | ); 71 | } 72 | self.sending_raft_messages.fetch_sub(1, Ordering::SeqCst); 73 | return; 74 | } 75 | } 76 | } 77 | } 78 | } 79 | } 80 | 81 | struct QuerySender { 82 | query: Vec, 83 | client: Peer, 84 | chan: oneshot::Sender, 85 | max_retries: usize, 86 | timeout: Duration, 87 | } 88 | 89 | impl QuerySender { 90 | async fn send(self) { 91 | let mut current_retry = 0usize; 92 | 93 | let mut client = match self.client.client().await { 94 | Ok(c) => c, 95 | Err(e) => { 96 | warn!( 97 | "error sending query after, {:?}, target addr: {:?}", 98 | e, self.client.addr 99 | ); 100 | if let Err(e) = self.chan.send(RaftResponse::Error(e.to_string())) { 101 | warn!( 102 | "send_query, Message::Query, RaftResponse send error: {:?}, target addr: {:?}", 103 | e, self.client.addr 104 | ); 105 | } 106 | return; 107 | } 108 | }; 109 | 110 | loop { 111 | let 
message_request = Request::new(Query { 112 | inner: self.query.clone(), 113 | }); 114 | match client.send_query(message_request).await { 115 | Ok(grpc_response) => { 116 | let raft_response = match deserialize(&grpc_response.into_inner().inner) { 117 | Ok(resp) => resp, 118 | Err(e) => { 119 | warn!( 120 | "send_query, Message::Query, RaftResponse deserialize error: {:?}, target addr: {:?}", 121 | e, self.client.addr 122 | ); 123 | return; 124 | } 125 | }; 126 | if let Err(e) = self.chan.send(raft_response) { 127 | warn!( 128 | "send_query, Message::Query, RaftResponse send error: {:?}, target addr: {:?}", 129 | e, self.client.addr 130 | ); 131 | } 132 | return; 133 | } 134 | Err(e) => { 135 | if current_retry < self.max_retries { 136 | current_retry += 1; 137 | tokio::time::sleep(self.timeout).await; 138 | } else { 139 | warn!( 140 | "error sending query after {} retries: {}, target addr: {:?}", 141 | self.max_retries, e, self.client.addr 142 | ); 143 | if let Err(e) = self.chan.send(RaftResponse::Error(e.to_string())) { 144 | warn!( 145 | "send_query, Message::Query, RaftResponse send error: {:?}, target addr: {:?}", 146 | e, self.client.addr 147 | ); 148 | } 149 | return; 150 | } 151 | } 152 | } 153 | } 154 | } 155 | } 156 | 157 | #[derive(Clone)] 158 | pub struct Peer { 159 | addr: ByteString, 160 | client: Arc>>, 161 | grpc_fails: Arc, 162 | grpc_fail_time: Arc, 163 | crw_timeout: Duration, 164 | concurrency_limit: usize, 165 | grpc_message_size: usize, 166 | grpc_breaker_threshold: u64, 167 | grpc_breaker_retry_interval: i64, 168 | active_tasks: Arc, 169 | } 170 | 171 | impl Peer { 172 | /// Creates a new `Peer` instance with the specified parameters. 173 | /// 174 | /// # Parameters 175 | /// - `addr`: The address of the peer to connect to. 176 | /// - `crw_timeout`: The timeout duration for connection and read/write operations. 177 | /// - `concurrency_limit`: The maximum number of concurrent gRPC requests allowed. 
178 | /// - `grpc_message_size`: The maximum size of a gRPC message. 179 | /// - `grpc_breaker_threshold`: The threshold for the number of gRPC failures before breaking the circuit. 180 | /// - `grpc_breaker_retry_interval`: The time interval for retrying after the circuit breaker is tripped. 181 | /// 182 | /// # Returns 183 | /// - A new `Peer` instance with the provided configuration. 184 | /// 185 | /// # Behavior 186 | /// - Initializes internal state, including counters and timeouts. 187 | /// - Logs the connection attempt to the specified address. 188 | pub fn new( 189 | addr: String, 190 | crw_timeout: Duration, 191 | concurrency_limit: usize, 192 | grpc_message_size: usize, 193 | grpc_breaker_threshold: u64, 194 | grpc_breaker_retry_interval: i64, 195 | ) -> Peer { 196 | debug!("connecting to node at {}...", addr); 197 | Peer { 198 | addr: addr.into(), 199 | client: Arc::new(RwLock::new(None)), 200 | grpc_fails: Arc::new(AtomicU64::new(0)), 201 | grpc_fail_time: Arc::new(AtomicI64::new(0)), 202 | crw_timeout, 203 | concurrency_limit, 204 | grpc_message_size, 205 | grpc_breaker_threshold, 206 | grpc_breaker_retry_interval, 207 | active_tasks: Arc::new(AtomicI64::new(0)), 208 | } 209 | } 210 | 211 | /// Returns the number of currently active tasks associated with this peer. 212 | /// 213 | /// # Returns 214 | /// - The number of active tasks as an `i64`. 215 | /// 216 | /// # Behavior 217 | /// - Reads the value of the `active_tasks` counter. 218 | #[inline] 219 | pub fn active_tasks(&self) -> i64 { 220 | self.active_tasks.load(Ordering::SeqCst) 221 | } 222 | 223 | /// Returns the number of gRPC failures encountered by this peer. 224 | /// 225 | /// # Returns 226 | /// - The number of gRPC failures as a `u64`. 227 | /// 228 | /// # Behavior 229 | /// - Reads the value of the `grpc_fails` counter. 
230 | #[inline] 231 | pub fn grpc_fails(&self) -> u64 { 232 | self.grpc_fails.load(Ordering::SeqCst) 233 | } 234 | 235 | /// Connects to the peer if not already connected, and returns the gRPC client. 236 | /// 237 | /// # Returns 238 | /// - `Ok(RaftGrpcClient)`: On successful connection, returns the gRPC client. 239 | /// - `Err(Error)`: On failure, returns an error. 240 | /// 241 | /// # Behavior 242 | /// - Checks if the gRPC client is already connected and available. 243 | /// - If not, attempts to establish a new connection and store the client. 244 | #[inline] 245 | async fn connect(&self) -> Result { 246 | if let Some(c) = self.client.read().await.as_ref() { 247 | return Ok(c.clone()); 248 | } 249 | 250 | let mut client = self.client.write().await; 251 | if let Some(c) = client.as_ref() { 252 | return Ok(c.clone()); 253 | } 254 | 255 | let c = connect( 256 | &self.addr, 257 | self.concurrency_limit, 258 | self.grpc_message_size, 259 | self.crw_timeout, 260 | ) 261 | .await?; 262 | client.replace(c.clone()); 263 | Ok(c) 264 | } 265 | 266 | /// Retrieves the gRPC client by establishing a connection if needed. 267 | /// 268 | /// # Returns 269 | /// - `Ok(RaftGrpcClient)`: On successful connection, returns the gRPC client. 270 | /// - `Err(Error)`: On failure, returns an error. 271 | /// 272 | /// # Behavior 273 | /// - Calls `connect` to ensure the client is connected and available. 274 | #[inline] 275 | pub async fn client(&self) -> Result { 276 | self.connect().await 277 | } 278 | 279 | /// Sends a Raft message to the peer and waits for a response. 280 | /// 281 | /// # Parameters 282 | /// - `msg`: The Raft message to be sent. 283 | /// 284 | /// # Returns 285 | /// - `Ok(Vec)`: On successful message send, returns the response data as a byte vector. 286 | /// - `Err(Error)`: On failure, returns an error. 287 | /// 288 | /// # Behavior 289 | /// - Checks if the peer is available for sending messages. 
290 | /// - Encodes the message and sends it using the `_send_message` method. 291 | /// - Updates the active task count and records success or failure. 292 | #[inline] 293 | pub async fn send_message(&self, msg: &RaftMessage) -> Result> { 294 | if !self.available() { 295 | return Err(Error::Msg("The gRPC remote service is unavailable".into())); 296 | } 297 | 298 | let msg = RraftMessage { 299 | inner: RaftMessage::encode_to_vec(msg), 300 | }; 301 | self.active_tasks.fetch_add(1, Ordering::SeqCst); 302 | let reply = self._send_message(msg).await; 303 | self.active_tasks.fetch_sub(1, Ordering::SeqCst); 304 | match reply { 305 | Ok(reply) => { 306 | self.record_success(); 307 | Ok(reply) 308 | } 309 | Err(e) => { 310 | self.record_failure(); 311 | Err(e) 312 | } 313 | } 314 | } 315 | 316 | #[inline] 317 | async fn _send_message(&self, msg: RraftMessage) -> Result> { 318 | let c = self.connect().await?; 319 | async fn task(mut c: RaftGrpcClient, msg: RraftMessage) -> Result> { 320 | let message_request = Request::new(msg); 321 | let response = c.send_message(message_request).await?; 322 | let message_reply = response.into_inner(); 323 | Ok(message_reply.inner) 324 | } 325 | 326 | let result = tokio::time::timeout(self.crw_timeout, task(c, msg)).await; 327 | let result = result.map_err(|_| Error::Elapsed)??; 328 | Ok(result) 329 | } 330 | 331 | /// Sends a Raft proposal to the peer and waits for a response. 332 | /// 333 | /// # Parameters 334 | /// - `msg`: The Raft proposal to be sent as a byte vector. 335 | /// 336 | /// # Returns 337 | /// - `Ok(Vec)`: On successful proposal send, returns the response data as a byte vector. 338 | /// - `Err(Error)`: On failure, returns an error. 339 | /// 340 | /// # Behavior 341 | /// - Checks if the peer is available for sending proposals. 342 | /// - Wraps the proposal in a `RraftProposal` and sends it using the `_send_proposal` method. 343 | /// - Updates the active task count and records success or failure. 
344 | #[inline] 345 | pub async fn send_proposal(&self, msg: Vec) -> Result> { 346 | if !self.available() { 347 | return Err(Error::Msg("The gRPC remote service is unavailable".into())); 348 | } 349 | 350 | let msg = RraftProposal { inner: msg }; 351 | let _active_tasks = self.active_tasks.fetch_add(1, Ordering::SeqCst); 352 | let reply = self._send_proposal(msg).await; 353 | self.active_tasks.fetch_sub(1, Ordering::SeqCst); 354 | match reply { 355 | Ok(reply) => { 356 | self.record_success(); 357 | Ok(reply) 358 | } 359 | Err(e) => { 360 | self.record_failure(); 361 | Err(e) 362 | } 363 | } 364 | } 365 | 366 | #[inline] 367 | async fn _send_proposal(&self, msg: RraftProposal) -> Result> { 368 | let c = self.connect().await?; 369 | 370 | async fn task(mut c: RaftGrpcClient, msg: RraftProposal) -> Result> { 371 | let message_request = Request::new(msg); 372 | let response = c.send_proposal(message_request).await?; 373 | let message_reply = response.into_inner(); 374 | Ok(message_reply.inner) 375 | } 376 | 377 | let result = tokio::time::timeout(self.crw_timeout, task(c, msg)).await; 378 | let result = result.map_err(|_| Error::Elapsed)??; 379 | Ok(result) 380 | } 381 | 382 | #[inline] 383 | fn record_failure(&self) { 384 | self.grpc_fails.fetch_add(1, Ordering::SeqCst); 385 | self.grpc_fail_time 386 | .store(chrono::Local::now().timestamp_millis(), Ordering::SeqCst); 387 | } 388 | 389 | #[inline] 390 | fn record_success(&self) { 391 | self.grpc_fails.store(0, Ordering::SeqCst); 392 | } 393 | 394 | #[inline] 395 | pub(crate) fn is_unavailable(&self) -> bool { 396 | self.grpc_fails.load(Ordering::SeqCst) >= self.grpc_breaker_threshold 397 | } 398 | 399 | #[inline] 400 | fn available(&self) -> bool { 401 | self.grpc_fails.load(Ordering::SeqCst) < self.grpc_breaker_threshold 402 | || (chrono::Local::now().timestamp_millis() 403 | - self.grpc_fail_time.load(Ordering::SeqCst)) 404 | > self.grpc_breaker_retry_interval 405 | } 406 | } 407 | 408 | pub struct RaftNode { 409 | 
inner: RawNode, 410 | pub peers: HashMap>, 411 | pub rcv: mpsc::Receiver, 412 | pub snd: mpsc::Sender, 413 | store: S, 414 | uncommitteds: HashMap, 415 | should_quit: bool, 416 | seq: AtomicU64, 417 | sending_raft_messages: Arc, 418 | last_snap_time: Instant, 419 | cfg: Arc, 420 | } 421 | 422 | impl RaftNode { 423 | /// Creates a new leader node for the Raft cluster. 424 | /// 425 | /// This function initializes a new `RaftNode` instance as a leader. It sets up the Raft configuration, 426 | /// applies a default snapshot to initialize the state, and sets the node to be a leader. 427 | /// 428 | /// # Parameters 429 | /// - `rcv`: A receiver for Raft messages. This will be used to receive incoming messages. 430 | /// - `snd`: A sender for Raft messages. This will be used to send outgoing messages. 431 | /// - `id`: The unique identifier for this Raft node. 432 | /// - `store`: The store implementation used for persisting Raft state. 433 | /// - `logger`: A logger instance for logging messages related to the Raft node. 434 | /// - `cfg`: Configuration for the Raft node, including various timeouts and limits. 435 | /// 436 | /// # Returns 437 | /// Returns a `Result` containing either the newly created `RaftNode` or an error if the creation failed. 438 | pub fn new_leader( 439 | rcv: mpsc::Receiver, 440 | snd: mpsc::Sender, 441 | id: u64, 442 | store: S, 443 | logger: &slog::Logger, 444 | cfg: Arc, 445 | ) -> Result { 446 | let config = Self::new_config(id, &cfg.raft_cfg); 447 | config.validate()?; 448 | 449 | let mut s = Snapshot::default(); 450 | // Because we don't use the same configuration to initialize every node, so we use 451 | // a non-zero index to force new followers catch up logs by snapshot first, which will 452 | // bring all nodes to the same initial state. 
453 | s.mut_metadata().index = 1; 454 | s.mut_metadata().term = 1; 455 | s.mut_metadata().mut_conf_state().voters = vec![id]; 456 | 457 | let mut storage: MemStorage = MemStorage::create(); 458 | storage.apply_snapshot(s)?; 459 | let mut inner = RawNode::new(&config, storage, logger)?; 460 | let peers = HashMap::new(); 461 | let seq = AtomicU64::new(0); 462 | let sending_raft_messages = Arc::new(AtomicIsize::new(0)); 463 | let last_snap_time = Instant::now(); // + cfg.snapshot_interval; 464 | 465 | inner.raft.become_candidate(); 466 | inner.raft.become_leader(); 467 | 468 | // let msg_tx = Self::start_message_sender(); 469 | let uncommitteds = HashMap::new(); 470 | let node = RaftNode { 471 | inner, 472 | rcv, 473 | peers, 474 | store, 475 | // msg_tx, 476 | uncommitteds, 477 | seq, 478 | snd, 479 | should_quit: false, 480 | sending_raft_messages, 481 | last_snap_time, 482 | cfg, 483 | }; 484 | Ok(node) 485 | } 486 | 487 | /// Creates a new follower node for the Raft cluster. 488 | /// 489 | /// This function initializes a new `RaftNode` instance as a follower. It sets up the Raft configuration 490 | /// and creates a new `RawNode` instance in follower mode. 491 | /// 492 | /// # Parameters 493 | /// - `rcv`: A receiver for Raft messages. This will be used to receive incoming messages. 494 | /// - `snd`: A sender for Raft messages. This will be used to send outgoing messages. 495 | /// - `id`: The unique identifier for this Raft node. 496 | /// - `store`: The store implementation used for persisting Raft state. 497 | /// - `logger`: A logger instance for logging messages related to the Raft node. 498 | /// - `cfg`: Configuration for the Raft node, including various timeouts and limits. 499 | /// 500 | /// # Returns 501 | /// Returns a `Result` containing either the newly created `RaftNode` or an error if the creation failed. 
502 | pub fn new_follower( 503 | rcv: mpsc::Receiver, 504 | snd: mpsc::Sender, 505 | id: u64, 506 | store: S, 507 | logger: &slog::Logger, 508 | cfg: Arc, 509 | ) -> Result { 510 | let config = Self::new_config(id, &cfg.raft_cfg); 511 | config.validate()?; 512 | 513 | let storage = MemStorage::create(); 514 | let inner = RawNode::new(&config, storage, logger)?; 515 | let peers = HashMap::new(); 516 | let seq = AtomicU64::new(0); 517 | let sending_raft_messages = Arc::new(AtomicIsize::new(0)); 518 | let last_snap_time = Instant::now(); // + cfg.snapshot_interval; 519 | // let msg_tx = Self::start_message_sender(); 520 | let uncommitteds = HashMap::new(); 521 | Ok(RaftNode { 522 | inner, 523 | rcv, 524 | peers, 525 | store, 526 | // msg_tx, 527 | uncommitteds, 528 | seq, 529 | snd, 530 | should_quit: false, 531 | sending_raft_messages, 532 | last_snap_time, 533 | cfg, 534 | }) 535 | } 536 | 537 | /// Creates a new Raft configuration with the specified node ID. 538 | /// 539 | /// This function clones the provided configuration and sets the node ID. 540 | /// 541 | /// # Parameters 542 | /// - `id`: The unique identifier for the Raft node. 543 | /// - `cfg`: The base Raft configuration to clone and modify. 544 | /// 545 | /// # Returns 546 | /// Returns a `RaftConfig` with the updated node ID. 547 | #[inline] 548 | fn new_config(id: u64, cfg: &RaftConfig) -> RaftConfig { 549 | let mut cfg = cfg.clone(); 550 | cfg.id = id; 551 | cfg 552 | } 553 | 554 | /// Retrieves a peer by its ID. 555 | /// 556 | /// This function looks up a peer in the `peers` map by its ID. 557 | /// 558 | /// # Parameters 559 | /// - `id`: The ID of the peer to retrieve. 560 | /// 561 | /// # Returns 562 | /// Returns an `Option`. If the peer is found, it is returned; otherwise, `None` is returned. 
563 | #[inline] 564 | pub fn peer(&self, id: u64) -> Option { 565 | match self.peers.get(&id) { 566 | Some(Some(p)) => Some(p.clone()), 567 | _ => None, 568 | } 569 | } 570 | 571 | /// Checks if the current node is the leader. 572 | /// 573 | /// This function compares the leader ID of the Raft instance with the current node's ID. 574 | /// 575 | /// # Returns 576 | /// Returns `true` if the current node is the leader, otherwise `false`. 577 | #[inline] 578 | pub fn is_leader(&self) -> bool { 579 | self.inner.raft.leader_id == self.inner.raft.id 580 | } 581 | 582 | /// Retrieves the ID of the current node. 583 | /// 584 | /// This function returns the unique identifier of the current Raft node. 585 | /// 586 | /// # Returns 587 | /// Returns the node's ID as a `u64`. 588 | #[inline] 589 | pub fn id(&self) -> u64 { 590 | self.raft.id 591 | } 592 | 593 | /// Adds a new peer to the `peers` map. 594 | /// 595 | /// This function creates a new `Peer` instance with the specified address and configuration, 596 | /// and adds it to the `peers` map. 597 | /// 598 | /// # Parameters 599 | /// - `addr`: The address of the new peer. 600 | /// - `id`: The unique identifier for the new peer. 601 | /// 602 | /// # Returns 603 | /// Returns the newly created `Peer` instance. 
    #[inline]
    pub fn add_peer(&mut self, addr: &str, id: u64) -> Peer {
        let peer = Peer::new(
            addr.to_string(),
            self.cfg.grpc_timeout,
            self.cfg.grpc_concurrency_limit,
            self.cfg.grpc_message_size,
            self.cfg.grpc_breaker_threshold,
            self.cfg.grpc_breaker_retry_interval.as_millis() as i64,
        );
        self.peers.insert(id, Some(peer.clone()));
        peer
    }

    /// Raft id of the current leader (0 when unknown).
    #[inline]
    fn leader(&self) -> u64 {
        self.raft.leader_id
    }

    /// Whether any leader is currently known to this node.
    #[inline]
    fn has_leader(&self) -> bool {
        self.raft.leader_id > 0
    }

    /// Addresses of all populated peers, keyed by node id.
    #[inline]
    fn peer_addrs(&self) -> HashMap<u64, String> {
        self.peers
            .iter()
            .filter_map(|(&id, peer)| {
                peer.as_ref()
                    .map(|Peer { addr, .. }| (id, addr.to_string()))
            })
            .collect()
    }

    /// Per-peer availability snapshot (address + circuit-breaker state).
    #[inline]
    fn peer_states(&self) -> HashMap<u64, Option<PeerState>> {
        self.peers
            .iter()
            .map(|(id, peer)| {
                if let Some(p) = peer {
                    (
                        *id,
                        Some(PeerState {
                            addr: p.addr.clone(),
                            available: !p.is_unavailable(),
                        }),
                    )
                } else {
                    (*id, None)
                }
            })
            .collect()
    }

    /// Builds a `Status` report for `Message::Status` requests.
    /// `merger_proposals` is the number of proposals still batched in the merger.
    #[inline]
    fn status(&self, merger_proposals: usize) -> Status {
        let role = self.raft.state;
        let leader_id = self.raft.leader_id;
        let sending_raft_messages = self.sending_raft_messages.load(Ordering::SeqCst);
        Status {
            id: self.inner.raft.id,
            leader_id,
            uncommitteds: self.uncommitteds.len(),
            merger_proposals,
            sending_raft_messages,
            peers: self.peer_states(),
            role,
        }
    }

    // forward query request to leader
    #[inline]
    async fn forward_query(&self, query: Vec<u8>, chan: oneshot::Sender<RaftResponse>) {
        let id = self.leader();
        let peer = match self.peer(id) {
            Some(peer) => peer,
            None => {
                // No usable connection to the leader: tell the caller who the
                // leader is (address unknown here) and let it retry.
                if let Err(e) = chan.send(RaftResponse::WrongLeader {
                    leader_id: id,
                    leader_addr: None,
                }) {
                    warn!(
                        "forward_query, Message::Query, RaftResponse send error: {:?}",
                        e
                    );
                }
                return;
            }
        };

        // The actual forwarding runs in its own task so the driver loop is not blocked.
        let query_sender = QuerySender {
            query,
            client: peer,
            chan,
            timeout: Duration::from_millis(1000),
            max_retries: 0,
        };
        tokio::spawn(query_sender.send());
    }

    /// Answers a query locally via the user store (leader path).
    #[inline]
    async fn send_query(&self, query: &[u8], chan: oneshot::Sender<RaftResponse>) {
        let data = self.store.query(query).await.unwrap_or_default();
        if let Err(e) = chan.send(RaftResponse::Response { data }) {
            warn!("Message::Query, RaftResponse send error: {:?}", e);
        }
    }

    /// Replies with the current leader's id/address so the caller can redirect.
    #[inline]
    fn send_wrong_leader(&self, from: &str, chan: oneshot::Sender<RaftResponse>) {
        let leader_id = self.leader();
        // leader can't be an empty node
        let leader_addr = self
            .peers
            .get(&leader_id)
            .and_then(|peer| peer.as_ref().map(|p| p.addr.to_string()));
        let raft_response = RaftResponse::WrongLeader {
            leader_id,
            leader_addr,
        };
        if let Err(e) = chan.send(raft_response) {
            warn!(
                "send_wrong_leader, from: {}, RaftResponse send error: {:?}",
                from, e
            );
        }
    }

    /// Replies with an error message (currently unused helper).
    #[inline]
    fn _send_error(&self, chan: oneshot::Sender<RaftResponse>, e: String) {
        let raft_response = RaftResponse::Error(e);
        if let Err(e) = chan.send(raft_response) {
            warn!("send_error, RaftResponse send error: {:?}", e);
        }
    }

    /// Replies to a `Message::RequestId` with this (leader) node's id.
    #[inline]
    fn send_leader_id(&self, chan: oneshot::Sender<RaftResponse>) {
        if let Err(e) = chan.send(RaftResponse::RequestId {
            leader_id: self.leader(),
        }) {
            warn!("Message::RequestId, RaftResponse send error: {:?}", e);
        }
    }

    /// Replies to a `Message::Status` with a full status snapshot.
    #[inline]
    fn send_status(&self, merger: &Merger, chan: oneshot::Sender<RaftResponse>) {
        if let Err(e) = chan.send(RaftResponse::Status(self.status(merger.len()))) {
            warn!("Message::Status, RaftResponse send error: {:?}", e);
        }
    }

    /// Drains the merger's current batch (if any) and proposes it to raft,
    /// remembering the reply channels under a fresh sequence number.
    #[inline]
    fn take_and_propose(&mut self, merger: &mut Merger) {
        if let Some((data, reply_chans)) = merger.take() {
            let seq = self.seq.fetch_add(1, Ordering::Relaxed);
            self.uncommitteds.insert(seq, reply_chans);
            let seq = serialize(&seq).unwrap();
            let data = serialize(&data).unwrap();
            if let Err(e) = self.propose(seq, data) {
                error!("propose to raft error, {:?}", e);
            }
        }
    }

    /// The node's main driver loop: multiplexes inbound messages, raft ticks,
    /// and ready-state processing until the node quits or the channel closes.
    pub(crate) async fn run(mut self) -> Result<()> {
        let mut heartbeat = self.cfg.heartbeat;
        let mut now = Instant::now();
        // Leaders have their state by construction; followers wait for the
        // first snapshot before accepting heartbeats (see MsgHeartbeat below).
        let mut snapshot_received = self.is_leader();
        let mut merger = Merger::new(
            self.cfg.proposal_batch_size,
            self.cfg.proposal_batch_timeout,
        );
        info!("snapshot_received: {:?}", snapshot_received);
        info!("has_leader: {:?}", self.has_leader());

        loop {
            if self.should_quit {
                warn!("Quitting raft");
                return Ok(());
            }

            match timeout(heartbeat, self.rcv.next()).await {
                Ok(Some(Message::ConfigChange { chan, mut change })) => {
                    info!("change Received, {:?}", change);
                    // whenever a change id is 0, it's a message to self.
                    if change.get_node_id() == 0 {
                        change.set_node_id(self.id());
                    }

                    if !self.is_leader() {
                        // wrong leader send client cluster data
                        // TODO: retry strategy in case of failure
                        self.send_wrong_leader("ConfigChange", chan);
                    } else {
                        // leader assign new id to peer
                        info!("received request from: {}", change.get_node_id());
                        let seq = self.seq.fetch_add(1, Ordering::Relaxed);
                        self.uncommitteds
                            .insert(seq, ReplyChan::One((chan, Instant::now())));
                        if let Err(e) = self.propose_conf_change(serialize(&seq).unwrap(), change) {
                            warn!("propose_conf_change, error: {:?}", e);
                        }
                    }
                }
                Ok(Some(Message::Raft(m))) => {
                    debug!(
                        "raft message: to={} from={} msg_type={:?}, commit={}, {:?}",
                        self.raft.id,
                        m.from,
                        m.msg_type,
                        m.get_commit(),
                        m
                    );
                    let msg_type = m.get_msg_type();
                    if MessageType::MsgTransferLeader == msg_type {
                        info!(
                            "raft message MsgTransferLeader, snapshot_received: {}, raft.leader_id: {}, {:?}",
                            snapshot_received, self.raft.leader_id, m
                        );
                    }

                    // Ignore heartbeats until the first snapshot has arrived,
                    // so the follower catches up by snapshot first.
                    if !snapshot_received && msg_type == MessageType::MsgHeartbeat {
                        info!(
                            "raft message, snapshot_received: {}, has_leader: {}, {:?}",
                            snapshot_received,
                            self.has_leader(),
                            m
                        );
                    } else {
                        if let Err(e) = self.step(*m) {
                            warn!(
                                "step error, {:?}, msg_type: {:?}, snapshot_received: {}",
                                e, msg_type, snapshot_received
                            );
                        }
                        if msg_type == MessageType::MsgSnapshot {
                            snapshot_received = true;
                        }
                    }
                }
                Ok(Some(Message::Propose { proposal, chan })) => {
                    let now = Instant::now();
                    if !self.is_leader() {
                        debug!("Message::Propose, send_wrong_leader {:?}", proposal);
                        self.send_wrong_leader("Propose", chan);
                    } else {
                        merger.add(proposal, chan);
                        self.take_and_propose(&mut merger);
                    }
                    if now.elapsed() > self.cfg.heartbeat {
                        info!("Message::Propose elapsed: {:?}", now.elapsed());
                    }
                }

                Ok(Some(Message::Query { query, chan })) => {
                    let now = Instant::now();
                    if !self.is_leader() {
                        debug!("[forward_query] query.len: {:?}", query.len());
                        self.forward_query(query, chan).await;
                    } else {
                        debug!("Message::Query, {:?}", query);
                        self.send_query(&query, chan).await;
                    }
                    if now.elapsed() > self.cfg.heartbeat {
                        info!("Message::Query elapsed: {:?}", now.elapsed());
                    }
                }

                Ok(Some(Message::RequestId { chan })) => {
                    debug!("requested Id, is_leader: {}", self.is_leader());
                    if !self.is_leader() {
                        self.send_wrong_leader("RequestId", chan);
                    } else {
                        self.send_leader_id(chan);
                    }
                }
                Ok(Some(Message::Status { chan })) => {
                    self.send_status(&merger, chan);
                }
                Ok(Some(Message::ReportUnreachable { node_id })) => {
                    debug!(
                        "Message::ReportUnreachable, node_id: {}, sending_raft_messages: {}",
                        node_id,
                        self.sending_raft_messages.load(Ordering::SeqCst)
                    );
                    self.report_unreachable(node_id);
                }
                Ok(None) => {
                    error!("Recv None");
                    return Err(Error::RecvError("Recv None".into()));
                }
                Err(_) => {
                    // Heartbeat timeout fired with no message: flush any
                    // batched proposals so they are not delayed indefinitely.
                    self.take_and_propose(&mut merger);
                }
            }

            let elapsed = now.elapsed();
            now = Instant::now();

            // Tick the raft state machine once per heartbeat interval;
            // otherwise shorten the next receive timeout by the time spent.
            if elapsed >= heartbeat {
                if elapsed > Duration::from_millis(500) {
                    warn!(
                        "[run] raft tick elapsed: {:?}, heartbeat: {:?}, uncommitteds: {}, sending_raft_messages: {}",
                        elapsed,
                        heartbeat,
                        self.uncommitteds.len(),
                        self.sending_raft_messages.load(Ordering::SeqCst),
                    );
                }
                heartbeat = self.cfg.heartbeat;
                self.tick();
            } else {
                heartbeat -= elapsed;
            }

            let on_ready_now = Instant::now();
            if let Err(e) = self.on_ready().await {
                error!(
                    "raft on_ready(..) error: {:?}, elapsed: {:?}",
                    e,
                    on_ready_now.elapsed()
                );
                return Err(e);
            }
            if on_ready_now.elapsed() > Duration::from_millis(500) {
                warn!(
                    "[run] raft on_ready(..) uncommitteds: {}, sending_raft_messages: {}, elapsed: {:?}",
                    self.uncommitteds.len(),
                    self.sending_raft_messages.load(Ordering::SeqCst),
                    on_ready_now.elapsed()
                );
            }
        }
    }

    /// Processes the raft `Ready` state: sends messages, applies snapshots,
    /// persists entries/hard-state, and applies committed entries — in the
    /// order required by the tikv-raft ready-processing contract.
    async fn on_ready(&mut self) -> Result<()> {
        if !self.has_ready() {
            return Ok(());
        }

        let mut ready = self.ready();

        if !ready.messages().is_empty() {
            // Send out the messages.
            self.send_messages(ready.take_messages());
        }

        if *ready.snapshot() != Snapshot::default() {
            // Restore the user store from the snapshot data, then hand the
            // (data-less) metadata to the raft storage.
            let snapshot = ready.snapshot();
            self.store.restore(snapshot.get_data()).await?;
            let store = self.mut_store();
            store.apply_snapshot(Snapshot {
                metadata: Some(snapshot.get_metadata().clone()),
                ..Default::default()
            })?;
        }

        self.handle_committed_entries(ready.take_committed_entries())
            .await?;

        if !ready.entries().is_empty() {
            let entries = ready.entries();
            let store = self.mut_store();
            store.append(entries)?;
        }

        if let Some(hs) = ready.hs() {
            // Raft HardState changed, and we need to persist it.
            let store = self.mut_store();
            store.set_hard_state(hs)?;
        }

        if !ready.persisted_messages().is_empty() {
            // Send out the persisted messages come from the node.
            self.send_messages(ready.take_persisted_messages());
        }
        let mut light_rd = self.advance(ready);

        if let Some(commit) = light_rd.commit_index() {
            let store = self.mut_store();
            store.set_hard_state_comit(commit)?;
        }
        // Send out the messages.
        self.send_messages(light_rd.take_messages());
        // Apply all committed entries.
        self.handle_committed_entries(light_rd.take_committed_entries())
            .await?;
        self.advance_apply();

        Ok(())
    }

    /// Dispatches each outbound raft message to its peer on a spawned task,
    /// tracking the in-flight count in `sending_raft_messages`.
    fn send_messages(&mut self, msgs: Vec<RaftMessage>) {
        for message in msgs {
            let client_id = message.get_to();
            // Silently drop messages for unknown peers; raft will retry.
            let client = match self.peer(client_id) {
                Some(peer) => peer,
                None => continue,
            };

            let message_sender = MessageSender {
                message,
                client,
                client_id,
                chan: self.snd.clone(),
                max_retries: 1,
                timeout: Duration::from_millis(500),
                sending_raft_messages: self.sending_raft_messages.clone(),
            };
            self.sending_raft_messages.fetch_add(1, Ordering::SeqCst);
            tokio::spawn(message_sender.send());
        }
    }

    /// Applies a batch of committed entries, routing config changes and
    /// normal proposals to their respective handlers.
    async fn handle_committed_entries(&mut self, committed_entries: Vec<Entry>) -> Result<()> {
        // Filter out empty entries produced by new elected leaders.
        let committed_entries_count = committed_entries.len();
        let now = std::time::Instant::now();
        for entry in committed_entries {
            if entry.data.is_empty() {
                // From new elected leaders.
                continue;
            }
            if let EntryType::EntryConfChange = entry.get_entry_type() {
                self.handle_config_change(&entry).await?;
            } else {
                self.handle_normal(&entry).await?;
            }
        }

        if now.elapsed().as_millis() > 500 {
            log::info!(
                "[handle_committed_entries] uncommitteds.len(): {}, sending_raft_messages: {}, \
                committed_entries_count: {}, raft.inflight_buffers_size: {}, \
                raft.msgs: {}, raft.group_commit: {}, raft.pending_read_count: {}, raft.ready_read_count: {}, \
                raft.soft_state: {:?}, raft.hard_state: {:?}, raft.state: {:?}, raft.heartbeat_elapsed: {}, \
                self.raft.read_states: {}, raft.heartbeat_timeout: {}, raft.heartbeat_elapsed: {}, \
                cost time: {:?}",
                self.uncommitteds.len(), self.sending_raft_messages.load(Ordering::SeqCst),
                committed_entries_count, self.raft.inflight_buffers_size(), self.raft.msgs.len(),
                self.raft.group_commit(), self.raft.pending_read_count(), self.raft.ready_read_count(),
                self.raft.soft_state(), self.raft.hard_state(), self.raft.state, self.raft.heartbeat_elapsed(),
                self.raft.read_states.len(), self.raft.heartbeat_timeout(), self.raft.heartbeat_elapsed(),
                now.elapsed()
            );
        }
        Ok(())
    }

    /// Applies a committed configuration-change entry: updates the peer map,
    /// applies the change to raft, compacts/snapshots, and answers the caller.
    #[inline]
    async fn handle_config_change(&mut self, entry: &Entry) -> Result<()> {
        info!("handle_config_change, entry: {:?}", entry);
        let seq: u64 = deserialize(entry.get_context())?;
        let change = ConfChange::decode(entry.get_data())
            .map_err(|e| tonic::Status::invalid_argument(e.to_string()))?;
        let id = change.get_node_id();

        let change_type = change.get_change_type();

        match change_type {
            ConfChangeType::AddNode => {
                let addr: String = deserialize(change.get_context())?;
                info!("adding {} ({}) to peers", addr, id);
                self.add_peer(&addr, id);
            }
            ConfChangeType::RemoveNode => {
                if change.get_node_id() == self.id() {
                    self.should_quit = true;
                    warn!("quiting the cluster");
                } else {
                    self.peers.remove(&change.get_node_id());
                }
            }
            _ => {
                warn!("unimplemented! change_type: {:?}", change_type);
            }
        }

        if let Ok(cs) = self.apply_conf_change(&change) {
            let last_applied = self.raft.raft_log.applied;
            if matches!(change_type, ConfChangeType::AddNode) {
                // A joining node must be able to catch up from a snapshot:
                // create one right away.
                self.last_snap_time = Instant::now();
                let snapshot = prost::bytes::Bytes::from(self.store.snapshot().await?);
                info!(
                    "create snapshot cost time: {:?}",
                    self.last_snap_time.elapsed(),
                );
                let store = self.mut_store();
                store.set_conf_state(&cs)?;
                store.compact(last_applied)?;
                store.create_snapshot(snapshot)?;
            } else {
                let store = self.mut_store();
                store.set_conf_state(&cs)?;
                store.compact(last_applied)?;
            }
        }

        // Answer the client that originally proposed this change, if its
        // reply channel is still registered.
        if let Some(sender) = self.uncommitteds.remove(&seq) {
            let response = match change_type {
                ConfChangeType::AddNode => RaftResponse::JoinSuccess {
                    assigned_id: id,
                    peer_addrs: self.peer_addrs(),
                },
                ConfChangeType::RemoveNode => RaftResponse::Ok,
                _ => {
                    warn!("unimplemented! change_type: {:?}", change_type);
                    RaftResponse::Error("unimplemented".into())
                }
            };
            if let ReplyChan::One((sender, _)) = sender {
                if sender.send(response).is_err() {
                    warn!("error sending response")
                }
            }
        }
        Ok(())
    }

    /// Applies a committed normal entry: runs the user store's `apply` for
    /// each proposal (single or batched), answers waiting clients, and
    /// performs periodic snapshot/compaction.
    #[inline]
    async fn handle_normal(&mut self, entry: &Entry) -> Result<()> {
        let seq: u64 = deserialize(entry.get_context())?;
        debug!(
            "[handle_normal] seq:{}, senders.len(): {}",
            seq,
            self.uncommitteds.len()
        );

        match (
            deserialize::<Proposals>(entry.get_data())?,
            self.uncommitteds.remove(&seq),
        ) {
            (Proposals::One(data), chan) => {
                let apply_start = std::time::Instant::now();
                // Bound each apply at 5s so a stuck store cannot wedge the loop.
                let reply =
                    tokio::time::timeout(Duration::from_secs(5), self.store.apply(&data)).await;
                if apply_start.elapsed().as_secs() > 3 {
                    log::warn!("apply, cost time: {:?}", apply_start.elapsed());
                }
                let reply = reply.unwrap_or_else(|e| Err(Error::from(e)));
                if let Some(ReplyChan::One((chan, inst))) = chan {
                    let res = match reply {
                        Ok(data) => RaftResponse::Response { data },
                        Err(e) => RaftResponse::Error(e.to_string()),
                    };
                    if let Err(_resp) = chan.send(res) {
                        warn!(
                            "[handle_normal] send RaftResponse error, seq:{}, cost time: {:?}, uncommitteds: {}, sending_raft_messages: {}",
                            seq,
                            inst.elapsed(),
                            self.uncommitteds.len(),
                            self.sending_raft_messages.load(Ordering::SeqCst)
                        );
                    }
                }
            }
            (Proposals::More(mut datas), chans) => {
                // Batched proposals: datas and chans are popped in lock-step,
                // so each reply reaches the matching proposer.
                let mut chans = if let Some(ReplyChan::More(chans)) = chans {
                    Some(chans)
                } else {
                    None
                };
                while let Some(data) = datas.pop() {
                    let apply_start = std::time::Instant::now();
                    let reply =
                        tokio::time::timeout(Duration::from_secs(5), self.store.apply(&data)).await;
                    if apply_start.elapsed().as_secs() > 3 {
                        log::warn!("apply, cost time: {:?}", apply_start.elapsed());
                    }
                    let reply = reply.unwrap_or_else(|e| Err(Error::from(e)));
                    if let Some((chan, inst)) = chans.as_mut().and_then(|cs| cs.pop()) {
                        if inst.elapsed().as_secs() > 3 {
                            warn!(
                                "[handle_normal] cost time, {:?}, chan is canceled: {}, uncommitteds: {}, sending_raft_messages: {}",
                                inst.elapsed(),
                                chan.is_canceled(),
                                self.uncommitteds.len(),
                                self.sending_raft_messages.load(Ordering::SeqCst)
                            );
                        }
                        let res = match reply {
                            Ok(data) => RaftResponse::Response { data },
                            Err(e) => RaftResponse::Error(e.to_string()),
                        };
                        if let Err(_resp) = chan.send(res) {
                            warn!(
                                "[handle_normal] send RaftResponse error, seq:{}, cost time: {:?}",
                                seq,
                                inst.elapsed()
                            );
                        }
                    }
                }
            }
        }

        // Periodic snapshot + log compaction.
        if Instant::now() > self.last_snap_time + self.cfg.snapshot_interval {
            self.last_snap_time = Instant::now();
            let last_applied = self.raft.raft_log.applied;
            let snapshot = prost::bytes::Bytes::from(self.store.snapshot().await?);
            let store = self.mut_store();
            store.compact(last_applied)?;
            let first_index = store.first_index().unwrap_or(0);
            let last_index = store.last_index().unwrap_or(0);
            let result = store.create_snapshot(snapshot);
            info!(
                "create snapshot cost time: {:?}, first_index: {:?}, last_index: {:?}, {}, create snapshot result: {:?}",
                self.last_snap_time.elapsed(),
                first_index,
                last_index,
                (last_index as i64 - first_index as i64),
                result
            );
        }
        Ok(())
    }
}

impl<S: Store> Deref for RaftNode<S> {
    type Target = RawNode<MemStorage>;

    fn deref(&self) -> &Self::Target {
        &self.inner
    }
}

impl<S: Store> DerefMut for RaftNode<S> {
    fn deref_mut(&mut self) -> &mut Self::Target {
        &mut self.inner
    }
}
--------------------------------------------------------------------------------
/src/raft_server.rs:
--------------------------------------------------------------------------------
use std::net::SocketAddr;
use std::sync::Arc;
use std::time::Duration;

use bincode::serialize;
use futures::channel::{mpsc, oneshot};
use futures::SinkExt;
use log::{info, warn};
use prost::Message as _;
use tikv_raft::eraftpb::{ConfChange, Message as RaftMessage};
use tokio::time::timeout;
use tonic::transport::Server;
use tonic::{Request, Response, Status};

use crate::message::{Message, RaftResponse};
use crate::raft_service::raft_service_server::{RaftService, RaftServiceServer};
use crate::raft_service::{
    self, ConfChange as RiteraftConfChange, Empty, Message as RiteraftMessage,
};
use crate::{error, Config};

/// A gRPC server that handles Raft-related requests.
pub struct RaftServer {
    // Channel into the local RaftNode driver loop.
    snd: mpsc::Sender<Message>,
    // Local address the server listens on.
    laddr: SocketAddr,
    // Per-request reply timeout (taken from `cfg.grpc_timeout`).
    timeout: Duration,
    cfg: Arc<Config>,
}

impl RaftServer {
    /// Creates a new instance of `RaftServer`.
    ///
    /// This function initializes a new `RaftServer` with the specified parameters.
    ///
    /// # Parameters
    /// - `snd`: A sender for Raft messages.
    /// - `laddr`: The local address where the server will listen for incoming requests.
    /// - `cfg`: Configuration for the server, including gRPC timeouts and other settings.
    ///
    /// # Returns
    /// Returns a new `RaftServer` instance.
    pub fn new(snd: mpsc::Sender<Message>, laddr: SocketAddr, cfg: Arc<Config>) -> Self {
        RaftServer {
            snd,
            laddr,
            timeout: cfg.grpc_timeout,
            cfg,
        }
    }

    /// Starts the gRPC server to handle Raft requests.
    ///
    /// This function sets up the gRPC server and listens for incoming requests.
It uses 54 | /// the `RaftServiceServer` to handle requests and manage configuration options. 55 | /// 56 | /// # Returns 57 | /// Returns a `Result` indicating whether the server started successfully or if an error occurred. 58 | pub async fn run(self) -> error::Result<()> { 59 | let laddr = self.laddr; 60 | let _cfg = self.cfg.clone(); 61 | info!("listening gRPC requests on: {}", laddr); 62 | let svc = RaftServiceServer::new(self) 63 | .max_decoding_message_size(_cfg.grpc_message_size) 64 | .max_encoding_message_size(_cfg.grpc_message_size); 65 | let server = Server::builder().add_service(svc); 66 | 67 | #[cfg(any(feature = "reuseport", feature = "reuseaddr"))] 68 | #[cfg(all(feature = "socket2", feature = "tokio-stream"))] 69 | { 70 | log::info!( 71 | "reuseaddr: {}, reuseport: {}", 72 | _cfg.reuseaddr, 73 | _cfg.reuseport 74 | ); 75 | let listener = raft_service::bind(laddr, 1024, _cfg.reuseaddr, _cfg.reuseport)?; 76 | server.serve_with_incoming(listener).await?; 77 | } 78 | #[cfg(not(any(feature = "reuseport", feature = "reuseaddr")))] 79 | server.serve(laddr).await?; 80 | 81 | info!("server has quit"); 82 | Ok(()) 83 | } 84 | } 85 | 86 | #[tonic::async_trait] 87 | impl RaftService for RaftServer { 88 | /// Handles requests for a new Raft node ID. 89 | /// 90 | /// This method sends a `RequestId` message to the Raft node and waits for a response. 91 | /// It returns the node ID if successful or an error status if not. 92 | /// 93 | /// # Parameters 94 | /// - `req`: The incoming request containing no additional data. 95 | /// 96 | /// # Returns 97 | /// Returns a `Response` containing the node ID or an error status. 
98 | async fn request_id( 99 | &self, 100 | _: Request, 101 | ) -> Result, Status> { 102 | let mut sender = self.snd.clone(); 103 | let (tx, rx) = oneshot::channel(); 104 | let _ = sender.send(Message::RequestId { chan: tx }).await; 105 | //let response = rx.await; 106 | let reply = timeout(self.timeout, rx) 107 | .await 108 | .map_err(|_e| Status::unavailable("recv timeout for reply"))? 109 | .map_err(|_e| Status::unavailable("recv canceled for reply"))?; 110 | match reply { 111 | RaftResponse::WrongLeader { 112 | leader_id, 113 | leader_addr, 114 | } => { 115 | warn!("sending wrong leader"); 116 | Ok(Response::new(raft_service::IdRequestReponse { 117 | code: raft_service::ResultCode::WrongLeader as i32, 118 | data: serialize(&(leader_id, leader_addr)).unwrap(), 119 | })) 120 | } 121 | RaftResponse::RequestId { leader_id } => { 122 | Ok(Response::new(raft_service::IdRequestReponse { 123 | code: raft_service::ResultCode::Ok as i32, 124 | data: serialize(&leader_id).unwrap(), 125 | })) 126 | } 127 | _ => unreachable!(), 128 | } 129 | } 130 | 131 | /// Handles configuration change requests. 132 | /// 133 | /// This method processes a configuration change request by sending it to the Raft node 134 | /// and waits for a response. It returns the result of the configuration change operation. 135 | /// 136 | /// # Parameters 137 | /// - `req`: The incoming request containing the configuration change data. 138 | /// 139 | /// # Returns 140 | /// Returns a `Response` containing the result of the configuration change or an error status. 
141 | async fn change_config( 142 | &self, 143 | req: Request, 144 | ) -> Result, Status> { 145 | let change = ConfChange::decode(req.into_inner().inner.as_ref()) 146 | .map_err(|e| Status::invalid_argument(e.to_string()))?; 147 | 148 | let mut sender = self.snd.clone(); 149 | 150 | let (tx, rx) = oneshot::channel(); 151 | 152 | let message = Message::ConfigChange { change, chan: tx }; 153 | 154 | match sender.send(message).await { 155 | Ok(_) => (), 156 | Err(_) => warn!("send error"), 157 | } 158 | 159 | let mut reply = raft_service::RaftResponse::default(); 160 | 161 | match timeout(self.timeout, rx).await { 162 | Ok(Ok(raft_response)) => { 163 | reply.inner = 164 | serialize(&raft_response).map_err(|e| Status::unavailable(e.to_string()))?; 165 | } 166 | Ok(_) => (), 167 | Err(e) => { 168 | reply.inner = serialize(&RaftResponse::Error("timeout".into())) 169 | .map_err(|e| Status::unavailable(e.to_string()))?; 170 | warn!("timeout waiting for reply, {:?}", e); 171 | } 172 | } 173 | 174 | Ok(Response::new(reply)) 175 | } 176 | 177 | /// Handles sending Raft messages. 178 | /// 179 | /// This method processes a Raft message by sending it to the Raft node and returns 180 | /// the result of the send operation. 181 | /// 182 | /// # Parameters 183 | /// - `request`: The incoming request containing the Raft message data. 184 | /// 185 | /// # Returns 186 | /// Returns a `Response` indicating success or an error status. 
187 | async fn send_message( 188 | &self, 189 | request: Request, 190 | ) -> Result, Status> { 191 | let message = RaftMessage::decode(request.into_inner().inner.as_ref()) 192 | .map_err(|e| Status::invalid_argument(e.to_string()))?; 193 | match self.snd.clone().try_send(Message::Raft(Box::new(message))) { 194 | Ok(()) => { 195 | let response = RaftResponse::Ok; 196 | Ok(Response::new(raft_service::RaftResponse { 197 | inner: serialize(&response).unwrap(), 198 | })) 199 | } 200 | Err(_) => Err(Status::unavailable("error for try send message")), 201 | } 202 | } 203 | 204 | /// Handles sending proposals. 205 | /// 206 | /// This method sends a proposal to the Raft node and waits for a response. It returns 207 | /// the result of the proposal send operation. 208 | /// 209 | /// # Parameters 210 | /// - `req`: The incoming request containing the proposal data. 211 | /// 212 | /// # Returns 213 | /// Returns a `Response` containing the result of the proposal send operation or an error status. 
214 | async fn send_proposal( 215 | &self, 216 | req: Request, 217 | ) -> Result, Status> { 218 | let proposal = req.into_inner().inner; 219 | let mut sender = self.snd.clone(); 220 | let (tx, rx) = oneshot::channel(); 221 | let message = Message::Propose { proposal, chan: tx }; 222 | 223 | match sender.try_send(message) { 224 | Ok(()) => match timeout(self.timeout, rx).await { 225 | Ok(Ok(raft_response)) => match serialize(&raft_response) { 226 | Ok(resp) => Ok(Response::new(raft_service::RaftResponse { inner: resp })), 227 | Err(e) => { 228 | warn!("serialize error, {}", e); 229 | Err(Status::unavailable("serialize error")) 230 | } 231 | }, 232 | Ok(Err(e)) => { 233 | warn!("recv error for reply, {}", e); 234 | Err(Status::unavailable("recv error for reply")) 235 | } 236 | Err(e) => { 237 | warn!("timeout waiting for reply, {}", e); 238 | Err(Status::unavailable("timeout waiting for reply")) 239 | } 240 | }, 241 | Err(e) => { 242 | warn!("error for try send message, {}", e); 243 | Err(Status::unavailable("error for try send message")) 244 | } 245 | } 246 | } 247 | 248 | /// Handles sending queries. 249 | /// 250 | /// This method sends a query to the Raft node and waits for a response. It returns 251 | /// the result of the query send operation. 252 | /// 253 | /// # Parameters 254 | /// - `req`: The incoming request containing the query data. 255 | /// 256 | /// # Returns 257 | /// Returns a `Response` containing the result of the query send operation or an error status. 
258 | async fn send_query( 259 | &self, 260 | req: Request, 261 | ) -> Result, Status> { 262 | let query = req.into_inner().inner; 263 | let mut sender = self.snd.clone(); 264 | let (tx, rx) = oneshot::channel(); 265 | let message = Message::Query { query, chan: tx }; 266 | let mut reply = raft_service::RaftResponse::default(); 267 | match sender.try_send(message) { 268 | Ok(()) => { 269 | // if we don't receive a response after 2secs, we timeout 270 | match timeout(self.timeout, rx).await { 271 | Ok(Ok(raft_response)) => { 272 | reply.inner = serialize(&raft_response) 273 | .map_err(|e| Status::unavailable(e.to_string()))?; 274 | } 275 | Ok(Err(e)) => { 276 | reply.inner = serialize(&RaftResponse::Error(e.to_string())) 277 | .map_err(|e| Status::unavailable(e.to_string()))?; 278 | warn!("send query error, {}", e); 279 | } 280 | Err(_e) => { 281 | reply.inner = serialize(&RaftResponse::Error("timeout".into())) 282 | .map_err(|e| Status::unavailable(e.to_string()))?; 283 | warn!("timeout waiting for send query reply"); 284 | } 285 | } 286 | } 287 | Err(e) => { 288 | reply.inner = serialize(&RaftResponse::Error(e.to_string())) 289 | .map_err(|e| Status::unavailable(e.to_string()))?; 290 | warn!("send query error, {}", e) 291 | } 292 | } 293 | 294 | Ok(Response::new(reply)) 295 | } 296 | } 297 | -------------------------------------------------------------------------------- /src/raft_service.rs: -------------------------------------------------------------------------------- 1 | use std::time::Duration; 2 | 3 | use tonic::transport::{Channel, Endpoint}; 4 | 5 | use raft_service_client::RaftServiceClient; 6 | 7 | use crate::error::Result; 8 | 9 | tonic::include_proto!("raftservice"); 10 | 11 | pub(crate) type RaftServiceClientType = RaftServiceClient; 12 | 13 | /// Creates a gRPC `Endpoint` for connecting to a Raft service. 
14 | /// 15 | /// This function constructs a gRPC `Endpoint` configured with the specified address, concurrency 16 | /// limit, and timeout settings. The `Endpoint` is used to establish a connection to the Raft 17 | /// service. 18 | /// 19 | /// # Parameters 20 | /// - `saddr`: The server address in the form of a string (e.g., "127.0.0.1:50051"). 21 | /// - `concurrency_limit`: The maximum number of concurrent requests allowed. 22 | /// - `timeout`: The connection timeout duration. 23 | /// 24 | /// # Returns 25 | /// Returns a `Result` containing the configured `Endpoint` on success, or an error if the endpoint 26 | /// creation fails. 27 | #[inline] 28 | pub(crate) fn endpoint( 29 | saddr: &str, 30 | concurrency_limit: usize, 31 | timeout: Duration, 32 | ) -> Result { 33 | let endpoint = Channel::from_shared(format!("http://{}", saddr)) 34 | .map(|endpoint| { 35 | endpoint 36 | .concurrency_limit(concurrency_limit) 37 | .connect_timeout(timeout) 38 | .timeout(timeout) 39 | }) 40 | .map_err(anyhow::Error::new)?; 41 | Ok(endpoint) 42 | } 43 | 44 | /// Establishes a connection to the Raft service and returns a client. 45 | /// 46 | /// This asynchronous function creates a new `RaftServiceClient` instance, using the provided 47 | /// address, concurrency limit, message size, and timeout settings. The client is configured with 48 | /// the specified message size for both encoding and decoding. 49 | /// 50 | /// # Parameters 51 | /// - `saddr`: The server address in the form of a string (e.g., "127.0.0.1:50051"). 52 | /// - `concurrency_limit`: The maximum number of concurrent requests allowed. 53 | /// - `message_size`: The maximum size of messages for encoding and decoding. 54 | /// - `timeout`: The connection timeout duration. 55 | /// 56 | /// # Returns 57 | /// Returns a `Result` containing the `RaftServiceClient` instance on success, or an error if the 58 | /// connection fails. 
59 | #[inline] 60 | pub(crate) async fn connect( 61 | saddr: &str, 62 | concurrency_limit: usize, 63 | message_size: usize, 64 | timeout: Duration, 65 | ) -> Result { 66 | Ok(RaftServiceClientType::new( 67 | endpoint(saddr, concurrency_limit, timeout)? 68 | .connect() 69 | .await?, 70 | ) 71 | .max_decoding_message_size(message_size) 72 | .max_encoding_message_size(message_size)) 73 | } 74 | 75 | /// Binds a TCP listener to the specified address and returns a `TcpListenerStream`. 76 | /// 77 | /// This function sets up a TCP listener with options for socket reuse and a backlog queue. It 78 | /// returns a `TcpListenerStream` that can be used to accept incoming connections. This is 79 | /// particularly useful for scenarios requiring high-performance and customizable socket options. 80 | /// 81 | /// # Parameters 82 | /// - `laddr`: The local address to bind in the form of `std::net::SocketAddr`. 83 | /// - `backlog`: The maximum number of pending connections in the backlog queue. 84 | /// - `_reuseaddr`: Whether to enable the `SO_REUSEADDR` option on Unix-like systems. 85 | /// - `_reuseport`: Whether to enable the `SO_REUSEPORT` option on Unix-like systems. 86 | /// 87 | /// # Returns 88 | /// Returns a `Result` containing the `TcpListenerStream` on success, or an error if the binding fails. 
89 | #[inline] 90 | #[cfg(all(feature = "socket2", feature = "tokio-stream"))] 91 | pub fn bind( 92 | laddr: std::net::SocketAddr, 93 | backlog: i32, 94 | _reuseaddr: bool, 95 | _reuseport: bool, 96 | ) -> anyhow::Result { 97 | use socket2::{Domain, SockAddr, Socket, Type}; 98 | let builder = Socket::new(Domain::for_address(laddr), Type::STREAM, None)?; 99 | builder.set_nonblocking(true)?; 100 | #[cfg(unix)] 101 | #[cfg(feature = "reuseaddr")] 102 | builder.set_reuse_address(_reuseaddr)?; 103 | #[cfg(unix)] 104 | #[cfg(feature = "reuseport")] 105 | builder.set_reuse_port(_reuseport)?; 106 | builder.bind(&SockAddr::from(laddr))?; 107 | builder.listen(backlog)?; 108 | let listener = tokio_stream::wrappers::TcpListenerStream::new( 109 | tokio::net::TcpListener::from_std(std::net::TcpListener::from(builder))?, 110 | ); 111 | Ok(listener) 112 | } 113 | -------------------------------------------------------------------------------- /src/storage.rs: -------------------------------------------------------------------------------- 1 | use tikv_raft::prelude::*; 2 | use tikv_raft::storage::MemStorage as CoreMemStorage; 3 | use tikv_raft::GetEntriesContext; 4 | 5 | use crate::error::Result; 6 | 7 | /// A trait defining operations for a log store in a Raft implementation. 8 | /// 9 | /// The `LogStore` trait extends the `Storage` trait with additional methods to manage Raft log entries, 10 | /// hard state, configuration state, and snapshots. Implementations of this trait should support appending 11 | /// log entries, updating the hard state and configuration state, creating and applying snapshots, and 12 | /// compacting the log. 13 | /// 14 | /// # Methods 15 | /// - `append`: Append a list of log entries to the log store. 16 | /// - `set_hard_state`: Set the hard state for the Raft state machine. 17 | /// - `set_hard_state_comit`: Set the commit index in the hard state. 18 | /// - `set_conf_state`: Set the configuration state for the Raft state machine. 
19 | /// - `create_snapshot`: Create a snapshot with the given data. 20 | /// - `apply_snapshot`: Apply a snapshot to the log store. 21 | /// - `compact`: Compact the log store up to the given index. 22 | pub trait LogStore: Storage { 23 | /// Appends a list of log entries to the log store. 24 | fn append(&mut self, entries: &[Entry]) -> Result<()>; 25 | /// Sets the hard state for the Raft state machine. 26 | fn set_hard_state(&mut self, hard_state: &HardState) -> Result<()>; 27 | 28 | /// Sets the commit index in the hard state. 29 | fn set_hard_state_comit(&mut self, comit: u64) -> Result<()>; 30 | 31 | /// Sets the configuration state for the Raft state machine. 32 | fn set_conf_state(&mut self, conf_state: &ConfState) -> Result<()>; 33 | 34 | /// Creates a snapshot with the given data. 35 | fn create_snapshot(&mut self, data: prost::bytes::Bytes) -> Result<()>; 36 | 37 | /// Applies a snapshot to the log store. 38 | fn apply_snapshot(&mut self, snapshot: Snapshot) -> Result<()>; 39 | 40 | /// Compacts the log store up to the given index. 41 | fn compact(&mut self, index: u64) -> Result<()>; 42 | } 43 | 44 | /// An in-memory implementation of the `LogStore` trait using Tikv's `MemStorage`. 45 | /// 46 | /// The `MemStorage` struct provides an in-memory storage backend for Raft logs and state. It uses Tikv's 47 | /// `CoreMemStorage` as the underlying storage engine and includes additional methods for managing snapshots. 48 | /// 49 | /// # Fields 50 | /// - `core`: The underlying `CoreMemStorage` used for log and state management. 51 | /// - `snapshot`: The currently held snapshot. 52 | pub struct MemStorage { 53 | core: CoreMemStorage, 54 | snapshot: Snapshot, 55 | } 56 | 57 | impl MemStorage { 58 | /// Creates a new `MemStorage` instance with default settings. 59 | /// 60 | /// This function initializes `CoreMemStorage` and sets the `snapshot` to its default value. 61 | /// 62 | /// # Returns 63 | /// Returns a new `MemStorage` instance. 
64 | #[inline] 65 | pub fn create() -> Self { 66 | let core = CoreMemStorage::default(); 67 | let snapshot = Default::default(); 68 | Self { core, snapshot } 69 | } 70 | } 71 | 72 | impl LogStore for MemStorage { 73 | /// Appends a list of log entries to the in-memory log store. 74 | /// 75 | /// This method acquires a write lock on the underlying `CoreMemStorage` and appends the provided 76 | /// entries. 77 | /// 78 | /// # Parameters 79 | /// - `entries`: The entries to be appended. 80 | /// 81 | /// # Returns 82 | /// Returns a `Result` indicating success or failure. 83 | #[inline] 84 | fn append(&mut self, entries: &[Entry]) -> Result<()> { 85 | let mut store = self.core.wl(); 86 | store.append(entries)?; 87 | Ok(()) 88 | } 89 | 90 | /// Sets the hard state for the Raft state machine. 91 | /// 92 | /// This method acquires a write lock on the underlying `CoreMemStorage` and updates the hard state. 93 | /// 94 | /// # Parameters 95 | /// - `hard_state`: The new hard state to set. 96 | /// 97 | /// # Returns 98 | /// Returns a `Result` indicating success or failure. 99 | #[inline] 100 | fn set_hard_state(&mut self, hard_state: &HardState) -> Result<()> { 101 | let mut store = self.core.wl(); 102 | store.set_hardstate(hard_state.clone()); 103 | Ok(()) 104 | } 105 | 106 | /// Sets the commit index in the hard state. 107 | /// 108 | /// This method updates the commit index in the hard state by first acquiring a write lock on the 109 | /// underlying `CoreMemStorage`, modifying the commit index, and then setting the updated hard state. 110 | /// 111 | /// # Parameters 112 | /// - `comit`: The commit index to set. 113 | /// 114 | /// # Returns 115 | /// Returns a `Result` indicating success or failure. 
116 | #[inline] 117 | fn set_hard_state_comit(&mut self, comit: u64) -> Result<()> { 118 | let mut store = self.core.wl(); 119 | let mut hard_state = store.hard_state().clone(); 120 | hard_state.set_commit(comit); 121 | store.set_hardstate(hard_state); 122 | Ok(()) 123 | } 124 | 125 | /// Sets the configuration state for the Raft state machine. 126 | /// 127 | /// This method acquires a write lock on the underlying `CoreMemStorage` and updates the configuration state. 128 | /// 129 | /// # Parameters 130 | /// - `conf_state`: The new configuration state to set. 131 | /// 132 | /// # Returns 133 | /// Returns a `Result` indicating success or failure. 134 | #[inline] 135 | fn set_conf_state(&mut self, conf_state: &ConfState) -> Result<()> { 136 | let mut store = self.core.wl(); 137 | store.set_conf_state(conf_state.clone()); 138 | Ok(()) 139 | } 140 | 141 | /// Creates a snapshot with the given data. 142 | /// 143 | /// This method initializes a new snapshot with the provided data and stores it in the `snapshot` field. 144 | /// 145 | /// # Parameters 146 | /// - `data`: The data to be included in the snapshot. 147 | /// 148 | /// # Returns 149 | /// Returns a `Result` indicating success or failure. 150 | #[inline] 151 | fn create_snapshot(&mut self, data: prost::bytes::Bytes) -> Result<()> { 152 | let mut snapshot = self.core.snapshot(0, 0)?; 153 | snapshot.set_data(data.to_vec()); 154 | self.snapshot = snapshot; 155 | Ok(()) 156 | } 157 | 158 | /// Applies a snapshot to the in-memory log store. 159 | /// 160 | /// This method acquires a write lock on the underlying `CoreMemStorage` and applies the provided snapshot. 161 | /// 162 | /// # Parameters 163 | /// - `snapshot`: The snapshot to apply. 164 | /// 165 | /// # Returns 166 | /// Returns a `Result` indicating success or failure. 
167 | #[inline] 168 | fn apply_snapshot(&mut self, snapshot: Snapshot) -> Result<()> { 169 | let mut store = self.core.wl(); 170 | store.apply_snapshot(snapshot)?; 171 | Ok(()) 172 | } 173 | 174 | /// Compacts the log store up to the given index. 175 | /// 176 | /// This method acquires a write lock on the underlying `CoreMemStorage` and compacts the log up to the specified index. 177 | /// 178 | /// # Parameters 179 | /// - `index`: The index up to which to compact the log. 180 | /// 181 | /// # Returns 182 | /// Returns a `Result` indicating success or failure. 183 | #[inline] 184 | fn compact(&mut self, index: u64) -> Result<()> { 185 | let mut store = self.core.wl(); 186 | store.compact(index)?; 187 | Ok(()) 188 | } 189 | } 190 | 191 | impl Storage for MemStorage { 192 | /// Retrieves the initial state of the Raft state machine. 193 | /// 194 | /// This method returns the initial state from the underlying `CoreMemStorage`. 195 | /// 196 | /// # Returns 197 | /// Returns a `Result` containing the `RaftState` on success. 198 | #[inline] 199 | fn initial_state(&self) -> tikv_raft::Result { 200 | let raft_state = self.core.initial_state()?; 201 | Ok(raft_state) 202 | } 203 | 204 | /// Retrieves a range of log entries. 205 | /// 206 | /// This method acquires a read lock on the underlying `CoreMemStorage` and returns log entries 207 | /// in the specified range. 208 | /// 209 | /// # Parameters 210 | /// - `low`: The start index of the range (inclusive). 211 | /// - `high`: The end index of the range (exclusive). 212 | /// - `max_size`: The maximum size of the entries to return (optional). 213 | /// - `context`: Additional context for retrieving the entries. 214 | /// 215 | /// # Returns 216 | /// Returns a `Result` containing a vector of `Entry` objects on success. 
217 | #[inline] 218 | fn entries( 219 | &self, 220 | low: u64, 221 | high: u64, 222 | max_size: impl Into>, 223 | context: GetEntriesContext, 224 | ) -> tikv_raft::Result> { 225 | let entries = self.core.entries(low, high, max_size, context)?; 226 | Ok(entries) 227 | } 228 | 229 | /// Retrieves the term of the log entry at the specified index. 230 | /// 231 | /// This method returns the term of the log entry from the underlying `CoreMemStorage`. 232 | /// 233 | /// # Parameters 234 | /// - `idx`: The index of the log entry. 235 | /// 236 | /// # Returns 237 | /// Returns a `Result` containing the term of the entry on success. 238 | #[inline] 239 | fn term(&self, idx: u64) -> tikv_raft::Result { 240 | self.core.term(idx) 241 | } 242 | 243 | /// Retrieves the first index of the log. 244 | /// 245 | /// This method returns the first index from the underlying `CoreMemStorage`. 246 | /// 247 | /// # Returns 248 | /// Returns a `Result` containing the first index on success. 249 | #[inline] 250 | fn first_index(&self) -> tikv_raft::Result { 251 | self.core.first_index() 252 | } 253 | 254 | /// Retrieves the last index of the log. 255 | /// 256 | /// This method returns the last index from the underlying `CoreMemStorage`. 257 | /// 258 | /// # Returns 259 | /// Returns a `Result` containing the last index on success. 260 | #[inline] 261 | fn last_index(&self) -> tikv_raft::Result { 262 | self.core.last_index() 263 | } 264 | 265 | /// Retrieves the current snapshot. 266 | /// 267 | /// This method returns a clone of the current snapshot held in the `snapshot` field. 268 | /// 269 | /// # Parameters 270 | /// - `_request_index`: The index for the snapshot request (not used in this implementation). 271 | /// - `_to`: The index up to which the snapshot is requested (not used in this implementation). 272 | /// 273 | /// # Returns 274 | /// Returns a `Result` containing the current `Snapshot` on success. 
275 | #[inline] 276 | fn snapshot(&self, _request_index: u64, _to: u64) -> tikv_raft::Result { 277 | Ok(self.snapshot.clone()) 278 | } 279 | } 280 | --------------------------------------------------------------------------------