├── .gitignore ├── Cargo.toml ├── LICENSE.md ├── README.md └── src └── lib.rs /.gitignore: -------------------------------------------------------------------------------- 1 | *~ 2 | test* 3 | 4 | # Added by cargo 5 | 6 | /target 7 | 8 | 9 | # Added by cargo 10 | # 11 | # already existing elements were commented out 12 | 13 | #/target 14 | /Cargo.lock 15 | 16 | *.profraw 17 | log -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "raft-rs" 3 | version = "0.1.0" 4 | edition = "2021" 5 | 6 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html 7 | 8 | [dependencies] 9 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | Copyright 2023 Phil Eaton 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the “Software”), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 4 | 5 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 6 | 7 | THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 8 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # raft-rs 2 | 3 | Not my first time implementing Raft. I wrote about [another 4 | implementation](https://notes.eatonphil.com/2023-05-25-raft.html) I 5 | did in Go. But you don't learn a concept well until you've implemented 6 | it a few times. And I wanted some practice with Rust. 7 | 8 | Achieved: 9 | - No dependencies beyond the standard library. 10 | - Leader election. 11 | - Log replication. 12 | 13 | Non-goals (for now): 14 | - Production use. 15 | - Snapshots and log truncation. 16 | - Cluster membership changes. 
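## Usage sketch

The public surface is small: implement the `StateMachine` trait (defined in `src/lib.rs` below) and hand batches of commands to a `Server`. Here is a minimal, hypothetical sketch of a state machine; it assumes the crate is pulled in under the name `raft_rs`, and it elides server construction and startup wiring, which live further down in `src/lib.rs`:

```rust
// A trivial state machine that echoes committed commands back as
// results. raft-rs hands `apply` a batch of committed commands
// (opaque bytes) and expects one result per command.
struct Echo;

impl raft_rs::StateMachine for Echo {
    fn apply(&self, messages: Vec<Vec<u8>>) -> Vec<Vec<u8>> {
        messages
    }
}
```

To run the test suite: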
17 | 18 | ```console 19 | $ cargo test 20 | ``` 21 | 22 | ## References 23 | 24 | - The Raft Paper: [In Search of an Understandable Consensus Algorithm](https://raft.github.io/raft.pdf) 25 | - Diego Ongaro's Thesis: [Consensus: Bridging Theory and Practice](https://web.stanford.edu/~ouster/cgi-bin/papers/OngaroPhD.pdf) 26 | - Diego Ongaro's TLA+ spec: [TLA+ specification for the Raft consensus algorithm](https://github.com/ongardie/raft.tla) 27 | -------------------------------------------------------------------------------- /src/lib.rs: -------------------------------------------------------------------------------- 1 | // References: 2 | // [0] In Search of an Understandable Consensus Algorithm (Extended Version) -- https://raft.github.io/raft.pdf 3 | 4 | use std::convert::{TryFrom, TryInto}; 5 | use std::io::{BufReader, BufWriter, Read, Write}; 6 | use std::net::SocketAddr; 7 | use std::os::unix::prelude::FileExt; 8 | use std::sync::{mpsc, Arc, Mutex}; 9 | use std::time::{Duration, Instant}; 10 | 11 | const PAGESIZE: u64 = 512; 12 | 13 | struct PageCache { 14 | // Backing file. 15 | file: std::fs::File, 16 | 17 | // Page cache. Maps file offset to page. 18 | page_cache: std::collections::HashMap<u64, [u8; PAGESIZE as usize]>, 19 | page_cache_size: usize, 20 | 21 | // For buffering actual writes to disk. 22 | buffer: Vec<u8>, 23 | buffer_write_at: Option<u64>, 24 | buffer_write_at_offset: u64, 25 | } 26 | 27 | impl PageCache { 28 | fn new(file: std::fs::File, page_cache_size: usize) -> PageCache { 29 | let mut page_cache = std::collections::HashMap::new(); 30 | // Allocate the space up front! The page cache should never 31 | // allocate after this. This is a big deal. 32 | page_cache.reserve(page_cache_size + 1); 33 | 34 | PageCache { 35 | file, 36 | page_cache_size, 37 | page_cache, 38 | 39 | buffer: vec![], 40 | buffer_write_at: None, 41 | buffer_write_at_offset: 0, 42 | } 43 | } 44 | 45 | fn insert_or_replace_in_cache(&mut self, offset: u64, page: [u8; PAGESIZE as usize]) { 46 | if self.page_cache_size == 0 { 47 | return; 48 | } 49 | 50 | // If it's already in the cache, just overwrite it. 51 | if let Some(existing) = self.page_cache.get(&offset) { 52 | if page != *existing { 53 | self.page_cache.insert(offset, page); 54 | } 55 | return; 56 | } 57 | 58 | // TODO: Come up with a better cache policy. 59 | if self.page_cache.len() == self.page_cache_size { 60 | self.page_cache.clear(); 61 | } 62 | 63 | // Otherwise insert; we evicted above if we were out of space. 64 | self.page_cache.insert(offset, page); 65 | } 66 | 67 | #[allow(dead_code)] 68 | fn len(&self) -> usize { 69 | self.page_cache.len() 70 | } 71 | 72 | fn read(&mut self, offset: u64, buf_into: &mut [u8; PAGESIZE as usize]) { 73 | // For now, we must not read() while a `write()` is ongoing. See 74 | // the comment in `self.write()`. 75 | assert_eq!(self.buffer_write_at, None); 76 | 77 | assert_eq!(buf_into.len(), PAGESIZE as usize); 78 | if let Some(page) = self.page_cache.get(&offset) { 79 | buf_into.copy_from_slice(page); 80 | return; 81 | } 82 | 83 | self.file.read_exact_at(&mut buf_into[0..], offset).unwrap(); 84 | self.insert_or_replace_in_cache(offset, *buf_into); 85 | } 86 | 87 | fn write(&mut self, offset: u64, page: [u8; PAGESIZE as usize]) { 88 | if self.buffer_write_at.is_none() { 89 | self.buffer_write_at = Some(offset); 90 | self.buffer_write_at_offset = offset; 91 | } else { 92 | // Make sure we're always doing sequential writes in 93 | // between self.flush() calls. 
94 | assert_eq!(self.buffer_write_at_offset, offset - PAGESIZE); 95 | self.buffer_write_at_offset = offset; 96 | } 97 | 98 | assert_ne!(self.buffer_write_at, None); 99 | 100 | // TODO: It is potentially unsafe if we are doing reads 101 | // in between writes. That isn't possible in the current 102 | // code. The case to worry about would be `self.write()` 103 | // before `self.sync()` where the pagecache gets filled up and 104 | // this particular page isn't in the pagecache and hasn't yet 105 | // been written to disk. The only correct thing to do would be 106 | // for `self.read()` to also check `self.buffer` before 107 | // reading from disk. 108 | self.buffer.extend(page); 109 | 110 | self.insert_or_replace_in_cache(offset, page); 111 | } 112 | 113 | fn sync(&mut self) { 114 | self.file 115 | .write_all_at(&self.buffer, self.buffer_write_at.unwrap()) 116 | .unwrap(); 117 | self.buffer.clear(); 118 | self.buffer_write_at = None; 119 | self.buffer_write_at_offset = 0; 120 | self.file.sync_all().unwrap(); 121 | } 122 | } 123 | 124 | #[cfg(test)] 125 | mod pagecache_tests { 126 | use super::*; 127 | 128 | #[test] 129 | fn test_pagecache() { 130 | let tests = [0, 1, 100]; 131 | for cache_size in tests { 132 | let tmp = server_tests::TmpDir::new(); 133 | let mut filename = tmp.dir.to_path_buf(); 134 | filename.push("test.dat"); 135 | let file = std::fs::File::options() 136 | .create(true) 137 | .read(true) 138 | .write(true) 139 | .open(filename.clone()) 140 | .expect("Could not open data file."); 141 | 142 | let first_page = [b'a'; PAGESIZE as usize]; 143 | let third_page = [b'c'; PAGESIZE as usize]; 144 | let mut p = PageCache::new(file, cache_size); 145 | p.write(0, first_page); 146 | assert!(p.len() <= cache_size); 147 | if cache_size > 0 { 148 | assert!(p.len() > 0); 149 | } 150 | p.sync(); 151 | 152 | p.write(PAGESIZE * 2, third_page); 153 | assert!(p.len() <= cache_size); 154 | if cache_size > 0 { 155 | assert!(p.len() > 0); 156 | } 157 | p.sync(); 158 | 159 | drop(p); 160 | 161 | let mut file = std::fs::File::options() 162 | .read(true) 163 | .open(filename) 164 | .expect("Could not open data file."); 165 | let mut all_pages = [0; 3 * PAGESIZE as usize]; 166 | file.read_exact(&mut all_pages).unwrap(); 167 | 168 | let second_page = [0; PAGESIZE as usize]; 169 | assert_eq!(all_pages[0..PAGESIZE as usize], first_page); 170 | assert_eq!( 171 | all_pages[PAGESIZE as usize..2 * PAGESIZE as usize], 172 | second_page 173 | ); 174 | assert_eq!( 175 | all_pages[2 * PAGESIZE as usize..3 * PAGESIZE as usize], 176 | third_page 177 | ); 178 | 179 | let mut p = PageCache::new(file, cache_size); 180 | let mut page = [0; PAGESIZE as usize]; 181 | p.read(0, &mut page); 182 | assert!(p.len() <= cache_size); 183 | if cache_size > 0 { 184 | assert!(p.len() > 0); 185 | } 186 | assert_eq!(page, first_page); 187 | p.read(PAGESIZE, &mut page); 188 | assert!(p.len() <= cache_size); 189 | if cache_size > 0 { 190 | assert!(p.len() > 0); 191 | } 192 | assert_eq!(page, second_page); 193 | p.read(PAGESIZE * 2, &mut page); 194 | assert!(p.len() <= cache_size); 195 | if cache_size > 0 { 196 | assert!(p.len() > 0); 197 | } 198 | assert_eq!(page, third_page); 199 | } 200 | } 201 | } 202 | 203 | struct PageCacheIO<'this> { 204 | offset: u64, 205 | pagecache: &'this mut PageCache, 206 | } 207 | 208 | impl<'this> Read for &mut PageCacheIO<'this> { 209 | fn read(&mut self, buf: &mut [u8]) -> std::io::Result<usize> { 210 | assert_eq!(buf.len(), PAGESIZE as usize); 211 | let fixed_buf = <&mut [u8; PAGESIZE as 
usize]>::try_from(buf).unwrap(); 212 | self.pagecache.read(self.offset, fixed_buf); 213 | self.offset += PAGESIZE; 214 | Ok(PAGESIZE as usize) 215 | } 216 | } 217 | 218 | impl<'this> Write for PageCacheIO<'this> { 219 | fn write(&mut self, buf: &[u8]) -> std::io::Result<usize> { 220 | assert_eq!(buf.len(), PAGESIZE as usize); 221 | let fixed_buf = <&[u8; PAGESIZE as usize]>::try_from(buf).unwrap(); 222 | self.pagecache.write(self.offset, *fixed_buf); 223 | self.offset += PAGESIZE; 224 | Ok(PAGESIZE as usize) 225 | } 226 | 227 | fn flush(&mut self) -> std::io::Result<()> { 228 | self.pagecache.sync(); 229 | Ok(()) 230 | } 231 | } 232 | 233 | // ON DISK FORMAT 234 | // 235 | // | Byte Range | Value | 236 | // |----------------|----------------| 237 | // | 0 - 4 | Magic Number | 238 | // | 4 - 8 | Format Version | 239 | // | 8 - 16 | Term | 240 | // | 16 - 32 | Voted For | 241 | // | 32 - 40 | Log Length | 242 | // | 40 - 44 | Checksum | 243 | // | PAGESIZE - EOF | Log Entries | 244 | // 245 | // ON DISK LOG ENTRY FORMAT 246 | // 247 | // | Byte Range | Value | 248 | // |------------------------------|--------------------| 249 | // | 0 | Entry Start Marker | 250 | // | 1 - 5 | Checksum | 251 | // | 5 - 13 | Term | 252 | // | 13 - 21 | Log Index | 253 | // | 21 - 37 | Client Serial Id | 254 | // | 37 - 53 | Client Id | 255 | // | 53 - 61 | Command Length | 256 | // | 61 - (61 + $Command Length$) | Command | 257 | // 258 | // $Entry Start Marker$ is `1` when the page is the start of an entry, not an 259 | // overflow page. 260 | 261 | #[derive(Debug, Clone)] 262 | struct LogEntry { 263 | // Actual data. 264 | command: Vec<u8>, 265 | index: u64, 266 | term: u64, 267 | client_serial_id: u128, 268 | client_id: u128, 269 | } 270 | 271 | impl PartialEq for LogEntry { 272 | fn eq(&self, other: &Self) -> bool { 273 | self.command == other.command && self.term == other.term 274 | } 275 | } 276 | 277 | impl LogEntry { 278 | fn command_first_page(command_length: usize) -> usize { 279 | let page_minus_metadata = (PAGESIZE - 61) as usize; 280 | if command_length <= page_minus_metadata { 281 | command_length 282 | } else { 283 | page_minus_metadata 284 | } 285 | } 286 | 287 | fn store_metadata(&self, buffer: &mut [u8; PAGESIZE as usize]) -> usize { 288 | *buffer = [0; PAGESIZE as usize]; 289 | let command_length = self.command.len(); 290 | 291 | buffer[0] = 1; // Entry start marker. 292 | buffer[5..13].copy_from_slice(&self.term.to_le_bytes()); 293 | buffer[13..21].copy_from_slice(&self.index.to_le_bytes()); 294 | buffer[21..37].copy_from_slice(&self.client_serial_id.to_le_bytes()); 295 | buffer[37..53].copy_from_slice(&self.client_id.to_le_bytes()); 296 | buffer[53..61].copy_from_slice(&command_length.to_le_bytes()); 297 | 298 | let mut checksum = CRC32C::new(); 299 | checksum.update(&buffer[5..61]); 300 | checksum.update(&self.command); 301 | buffer[1..5].copy_from_slice(&checksum.sum().to_le_bytes()); 302 | 303 | let command_first_page = LogEntry::command_first_page(command_length); 304 | buffer[61..61 + command_first_page].copy_from_slice(&self.command[0..command_first_page]); 305 | command_length - command_first_page 306 | } 307 | 308 | fn store_overflow(&self, buffer: &mut [u8; PAGESIZE as usize], offset: usize) -> usize { 309 | let to_write = self.command.len() - offset; 310 | let filled = if to_write > PAGESIZE as usize - 1 { 311 | // -1 for the overflow marker. 312 | PAGESIZE as usize - 1 313 | } else { 314 | to_write 315 | }; 316 | buffer[0] = 0; // Overflow marker. 
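// The remaining PAGESIZE - 1 bytes of the page carry the next slice of the command.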
317 | buffer[1..1 + filled].copy_from_slice(&self.command[offset..offset + filled]); 318 | filled 319 | } 320 | 321 | fn encode(&self, buffer: &mut [u8; PAGESIZE as usize], mut writer: impl std::io::Write) -> u64 { 322 | let to_write = self.store_metadata(buffer); 323 | writer.write_all(buffer).unwrap(); 324 | let mut pages = 1; 325 | 326 | let mut written = self.command.len() - to_write; 327 | 328 | while written < self.command.len() { 329 | let filled = self.store_overflow(buffer, written); 330 | writer.write_all(buffer).unwrap(); 331 | written += filled; 332 | pages += 1; 333 | } 334 | 335 | pages 336 | } 337 | 338 | fn recover_metadata(page: &[u8; PAGESIZE as usize]) -> (LogEntry, u32, usize) { 339 | assert_eq!(page[0], 1); // Start of entry marker. 340 | let term = u64::from_le_bytes(page[5..13].try_into().unwrap()); 341 | let index = u64::from_le_bytes(page[13..21].try_into().unwrap()); 342 | let client_serial_id = u128::from_le_bytes(page[21..37].try_into().unwrap()); 343 | let client_id = u128::from_le_bytes(page[37..53].try_into().unwrap()); 344 | let command_length = u64::from_le_bytes(page[53..61].try_into().unwrap()) as usize; 345 | let stored_checksum = u32::from_le_bytes(page[1..5].try_into().unwrap()); 346 | 347 | // recover_metadata() will only decode the first page's worth of 348 | // the command. Call recover_overflow() to decode any 349 | // additional pages. 350 | let command_first_page = LogEntry::command_first_page(command_length); 351 | let mut command = vec![0; command_length]; 352 | command[0..command_first_page].copy_from_slice(&page[61..61 + command_first_page]); 353 | 354 | ( 355 | LogEntry { 356 | index, 357 | term, 358 | command, 359 | client_serial_id, 360 | client_id, 361 | }, 362 | stored_checksum, 363 | command_first_page, 364 | ) 365 | } 366 | 367 | fn recover_overflow( 368 | page: &[u8; PAGESIZE as usize], 369 | command: &mut [u8], 370 | command_read: usize, 371 | ) -> usize { 372 | let to_read = command.len() - command_read; 373 | 374 | // Entry start marker is false for overflow page. 375 | assert_eq!(page[0], 0); 376 | 377 | let fill = if to_read > PAGESIZE as usize - 1 { 378 | // -1 for the overflow marker. 379 | PAGESIZE as usize - 1 380 | } else { 381 | to_read 382 | }; 383 | command[command_read..command_read + fill].copy_from_slice(&page[1..1 + fill]); 384 | fill 385 | } 386 | 387 | fn decode(mut reader: impl std::io::Read) -> LogEntry { 388 | let mut page = [0; PAGESIZE as usize]; 389 | // Since entries are always encoded into complete PAGESIZE 390 | // bytes, for network or for disk, it should always be 391 | // reasonable to block on an entire PAGESIZE worth of 392 | // bytes. 
393 | reader.read_exact(&mut page).unwrap(); 394 | 395 | let (mut entry, stored_checksum, command_read) = LogEntry::recover_metadata(&page); 396 | let mut actual_checksum = CRC32C::new(); 397 | actual_checksum.update(&page[5..61]); 398 | 399 | let mut read = command_read; 400 | while read < entry.command.len() { 401 | reader.read_exact(&mut page).unwrap(); 402 | let filled = LogEntry::recover_overflow(&page, &mut entry.command, read); 403 | read += filled; 404 | } 405 | 406 | actual_checksum.update(&entry.command); 407 | assert_eq!(stored_checksum, actual_checksum.sum()); 408 | entry 409 | } 410 | 411 | fn decode_from_pagecache(pagecache: &mut PageCache, offset: u64) -> (LogEntry, u64) { 412 | let mut reader = PageCacheIO { offset, pagecache }; 413 | let entry = LogEntry::decode(&mut reader); 414 | let offset = reader.offset; 415 | 416 | (entry, offset) 417 | } 418 | } 419 | 420 | struct DurableState { 421 | // In-memory data. 422 | last_log_term: u64, 423 | next_log_index: u64, 424 | next_log_offset: u64, 425 | pagecache: PageCache, 426 | 427 | // On-disk data. 428 | current_term: u64, 429 | voted_for: u128, // Zero is the None value. Zero must therefore not be a valid server id. 430 | } 431 | 432 | impl DurableState { 433 | fn new(data_directory: &std::path::Path, id: u128, page_cache_size: usize) -> DurableState { 434 | let mut filename = data_directory.to_path_buf(); 435 | filename.push(format!("server_{}.data", id)); 436 | let file = std::fs::File::options() 437 | .create(true) 438 | .read(true) 439 | .write(true) 440 | .open(filename) 441 | .expect("Could not open data file."); 442 | DurableState { 443 | last_log_term: 0, 444 | next_log_index: 0, 445 | next_log_offset: PAGESIZE, 446 | pagecache: PageCache::new(file, page_cache_size), 447 | 448 | current_term: 0, 449 | voted_for: 0, 450 | } 451 | } 452 | 453 | fn restore(&mut self) { 454 | // If there's nothing to restore, calling append with the 455 | // required 0th empty log entry will be sufficient to get 456 | // state into the right place. 457 | if let Ok(m) = self.pagecache.file.metadata() { 458 | if m.len() == 0 { 459 | self.append(&mut [LogEntry { 460 | index: 0, 461 | term: 0, 462 | command: vec![], 463 | client_serial_id: 0, 464 | client_id: 0, 465 | }]); 466 | return; 467 | } 468 | } 469 | 470 | let mut metadata: [u8; PAGESIZE as usize] = [0; PAGESIZE as usize]; 471 | self.pagecache.read(0, &mut metadata); 472 | 473 | // Magic number check. 474 | assert_eq!(metadata[0..4], 0xFABEF15E_u32.to_le_bytes()); 475 | 476 | // Version number check. 
477 | assert_eq!(metadata[4..8], 1_u32.to_le_bytes()); 478 | 479 | self.current_term = u64::from_le_bytes(metadata[8..16].try_into().unwrap()); 480 | self.voted_for = u128::from_le_bytes(metadata[16..32].try_into().unwrap()); 481 | 482 | let checksum = u32::from_le_bytes(metadata[40..44].try_into().unwrap()); 483 | if checksum != crc32c(&metadata[0..40]) { 484 | panic!("Bad checksum for data file."); 485 | } 486 | 487 | let log_length = u64::from_le_bytes(metadata[32..40].try_into().unwrap()) as usize; 488 | 489 | let mut scanned = 0; 490 | while scanned < log_length { 491 | self.next_log_index += 1; 492 | 493 | let (e, new_offset) = 494 | LogEntry::decode_from_pagecache(&mut self.pagecache, self.next_log_offset); 495 | self.last_log_term = e.term; 496 | self.next_log_offset = new_offset; 497 | scanned += 1; 498 | } 499 | } 500 | 501 | #[allow(dead_code)] 502 | fn debug_client_entry_count(&mut self) -> u64 { 503 | let mut count = 0; 504 | for i in 0..self.next_log_index { 505 | let e = self.log_at_index(i); 506 | if !e.command.is_empty() { 507 | count += 1; 508 | } 509 | } 510 | 511 | count 512 | } 513 | 514 | fn append(&mut self, entries: &mut [LogEntry]) { 515 | self.append_from_index(entries, self.next_log_index); 516 | } 517 | 518 | // Durably add logs to disk. 519 | fn append_from_index(&mut self, entries: &mut [LogEntry], from_index: u64) { 520 | let mut buffer: [u8; PAGESIZE as usize] = [0; PAGESIZE as usize]; 521 | 522 | self.next_log_offset = self.offset_from_index(from_index); 523 | // This is extremely important. Sometimes the log must be 524 | // truncated. This is what does the truncation. Existing 525 | // messages are not necessarily overwritten. But the metadata 526 | // recording the current last log index is always kept correct. 527 | self.next_log_index = from_index; 528 | 529 | let mut writer = PageCacheIO { 530 | offset: self.next_log_offset, 531 | pagecache: &mut self.pagecache, 532 | }; 533 | if !entries.is_empty() { 534 | // Write out all new logs. 535 | for entry in entries.iter_mut() { 536 | entry.index = self.next_log_index; 537 | self.next_log_index += 1; 538 | 539 | assert!(self.next_log_offset >= PAGESIZE); 540 | 541 | let pages = entry.encode(&mut buffer, &mut writer); 542 | println!("Wrote {:?} at {}.", entry.command, entry.index); 543 | self.next_log_offset += pages * PAGESIZE; 544 | 545 | self.last_log_term = entry.term; 546 | } 547 | 548 | writer.flush().unwrap(); 549 | } 550 | 551 | // Write log length metadata. 552 | self.update(self.current_term, self.voted_for); 553 | } 554 | 555 | // Durably save non-log data. 556 | fn update(&mut self, term: u64, voted_for: u128) { 557 | self.current_term = term; 558 | self.voted_for = voted_for; 559 | 560 | let mut metadata: [u8; PAGESIZE as usize] = [0; PAGESIZE as usize]; 561 | // Magic number. 562 | metadata[0..4].copy_from_slice(&0xFABEF15E_u32.to_le_bytes()); 563 | // Version. 
564 | metadata[4..8].copy_from_slice(&1_u32.to_le_bytes()); 565 | 566 | metadata[8..16].copy_from_slice(&term.to_le_bytes()); 567 | 568 | metadata[16..32].copy_from_slice(&voted_for.to_le_bytes()); 569 | 570 | let log_length = self.next_log_index; 571 | metadata[32..40].copy_from_slice(&log_length.to_le_bytes()); 572 | 573 | let checksum = crc32c(&metadata[0..40]); 574 | metadata[40..44].copy_from_slice(&checksum.to_le_bytes()); 575 | 576 | self.pagecache.write(0, metadata); 577 | self.pagecache.sync(); 578 | } 579 | 580 | fn offset_from_index(&mut self, index: u64) -> u64 { 581 | if index == self.next_log_index { 582 | return self.next_log_offset; 583 | } 584 | 585 | assert!(index < self.next_log_index); 586 | let mut page: [u8; PAGESIZE as usize] = [0; PAGESIZE as usize]; 587 | 588 | // Rather than linear search backwards, we store the index in 589 | // the page itself and then do a binary search on disk. 590 | let mut l = PAGESIZE; 591 | let mut r = self.next_log_offset - PAGESIZE; 592 | while l <= r { 593 | let mut m = l + (r - l) / 2; 594 | // Round up to the nearest page. 595 | m += m % PAGESIZE; 596 | assert_eq!(m % PAGESIZE, 0); 597 | 598 | // Look for a start of entry page. 599 | self.pagecache.read(m, &mut page); 600 | while page[0] != 1 { 601 | m -= PAGESIZE; 602 | self.pagecache.read(m, &mut page); 603 | } 604 | 605 | // TODO: Bad idea to hardcode the offset. 606 | let current_index = u64::from_le_bytes(page[13..21].try_into().unwrap()); 607 | if current_index == index { 608 | return m; 609 | } 610 | 611 | if current_index < index { 612 | // Read until the next entry, set m to the next entry. 613 | page[0] = 0; 614 | m += PAGESIZE; 615 | self.pagecache.read(m, &mut page); 616 | while page[0] != 1 { 617 | m += PAGESIZE; 618 | self.pagecache.read(m, &mut page); 619 | } 620 | 621 | l = m; 622 | } else { 623 | r = m - PAGESIZE; 624 | } 625 | } 626 | 627 | unreachable!( 628 | "Could not find index {index} with log length: {}.", 629 | self.next_log_index 630 | ); 631 | } 632 | 633 | fn log_at_index(&mut self, i: u64) -> LogEntry { 634 | let offset = self.offset_from_index(i); 635 | let (entry, _) = LogEntry::decode_from_pagecache(&mut self.pagecache, offset); 636 | entry 637 | } 638 | } 639 | 640 | #[derive(Copy, Clone, PartialEq, Debug)] 641 | enum Condition { 642 | Leader, 643 | Follower, 644 | Candidate, 645 | } 646 | 647 | impl std::fmt::Display for Condition { 648 | fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { 649 | write!(f, "{:?}", self) 650 | } 651 | } 652 | 653 | struct VolatileState { 654 | condition: Condition, 655 | 656 | commit_index: u64, 657 | last_applied: u64, 658 | 659 | // Timeouts 660 | election_frequency: Duration, // Read-only. 661 | election_timeout: Instant, // Randomly set based on election_frequency. 662 | rand: Random, 663 | 664 | // Leader-only state. 665 | next_index: Vec<u64>, 666 | match_index: Vec<u64>, 667 | 668 | // Candidate-only state. 
669 | votes: usize, 670 | } 671 | 672 | impl VolatileState { 673 | fn new(cluster_size: usize, election_frequency: Duration, rand: Random) -> VolatileState { 674 | let jitter = election_frequency.as_secs_f64() / 3.0; 675 | VolatileState { 676 | condition: Condition::Follower, 677 | commit_index: 0, 678 | last_applied: 0, 679 | next_index: vec![0; cluster_size], 680 | match_index: vec![0; cluster_size], 681 | votes: 0, 682 | 683 | election_frequency, 684 | election_timeout: Instant::now() + Duration::from_secs_f64(jitter), 685 | rand, 686 | } 687 | } 688 | 689 | fn reset(&mut self) { 690 | let count = self.next_index.len(); 691 | for i in 0..count { 692 | self.next_index[i] = 0; 693 | self.match_index[i] = 0; 694 | } 695 | self.votes = 0; 696 | } 697 | } 698 | 699 | struct State { 700 | logger: Logger, 701 | durable: DurableState, 702 | volatile: VolatileState, 703 | 704 | // Non-Raft state. 705 | stopped: bool, 706 | } 707 | 708 | impl State { 709 | fn log<S: AsRef<str> + std::fmt::Display>(&self, msg: S) { 710 | self.logger.log( 711 | self.durable.current_term, 712 | self.durable.next_log_index, 713 | self.volatile.condition, 714 | msg, 715 | ); 716 | } 717 | 718 | fn next_request_id(&mut self) -> u64 { 719 | self.volatile.rand.generate_u64() 720 | } 721 | 722 | fn reset_election_timeout(&mut self) { 723 | let random_percent = self.volatile.rand.generate_percent(); 724 | let positive = self.volatile.rand.generate_bool(); 725 | let jitter = random_percent as f64 * (self.volatile.election_frequency.as_secs_f64() / 2.0); 726 | 727 | let mut new_timeout = self.volatile.election_frequency; 728 | // Duration apparently isn't allowed to be negative. 729 | if positive { 730 | new_timeout += Duration::from_secs_f64(jitter); 731 | } else { 732 | new_timeout -= Duration::from_secs_f64(jitter); 733 | } 734 | 735 | self.volatile.election_timeout = Instant::now() + new_timeout; 736 | 737 | self.log(format!( 738 | "Resetting election timeout: {}ms.", 739 | new_timeout.as_millis() 740 | )); 741 | } 742 | 743 | fn transition(&mut self, condition: Condition, term_increase: u64, voted_for: u128) { 744 | assert_ne!(self.volatile.condition, condition); 745 | self.log(format!("Became {}.", condition)); 746 | self.volatile.condition = condition; 747 | // Reset vote. 748 | self.durable 749 | .update(self.durable.current_term + term_increase, voted_for); 750 | } 751 | } 752 | 753 | // REQUEST WIRE PROTOCOL 754 | // 755 | // | Byte Range | Value | 756 | // |------------|-----------------------------------------------| 757 | // | 0 - 8 | Request Id | 758 | // | 8 - 24 | Sender Id | 759 | // | 24 | Request Type | 760 | // | 25 - 33 | Term | 761 | // | 33 - 49 | Leader Id / Candidate Id | 762 | // | 49 - 57 | Prev Log Index / Last Log Index | 763 | // | 57 - 65 | Prev Log Term / Last Log Term | 764 | // | 65 - 69 | (Request Vote) Checksum | // | 65 - 73 | (Append Entries) Leader Commit Index | 765 | // | 73 | Entries Length | 766 | // | 74 - 78 | (Append Entries) Checksum | 767 | // | 78 - EOM | Entries | 768 | // 769 | // ENTRIES WIRE PROTOCOL 770 | // 771 | // See: ON DISK LOG ENTRY FORMAT. 
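// For example, with this layout a RequestVoteRequest occupies 69 bytes on the wire: 33 bytes of shared metadata (request id, sender id, request type, term), then the candidate id, last log index, and last log term, followed by a 4-byte CRC32C checksum over the preceding bytes.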
772 | // 773 | // RESPONSE WIRE PROTOCOL 774 | // 775 | // | Byte Range | Value | 776 | // |------------|---------------------------------------| 777 | // | 0 - 8 | Request Id | 778 | // | 8 - 24 | Sender Id | 779 | // | 24 | Response Type | 780 | // | 25 - 33 | Term | 781 | // | 33 | Success / Vote Granted | 782 | // | 34 - 38 | (Request Vote) Checksum | // | 34 - 42 | (Append Entries) Match Index | 783 | // | 42 - 46 | (Append Entries) Checksum | 784 | 785 | struct RPCMessageEncoder<T: std::io::Write> { 786 | request_id: u64, // Not part of Raft. Used only for debugging. 787 | sender_id: u128, 788 | writer: BufWriter<T>, 789 | written: Vec<u8>, 790 | } 791 | 792 | impl<T: std::io::Write> RPCMessageEncoder<T> { 793 | fn new(request_id: u64, sender_id: u128, writer: BufWriter<T>) -> RPCMessageEncoder<T> { 794 | RPCMessageEncoder { 795 | request_id, 796 | writer, 797 | written: vec![], 798 | sender_id, 799 | } 800 | } 801 | 802 | fn metadata(&mut self, kind: u8, term: u64) { 803 | assert_eq!(self.written.len(), 0); 804 | 805 | self.written 806 | .extend_from_slice(&self.request_id.to_le_bytes()); 807 | 808 | self.written 809 | .extend_from_slice(&self.sender_id.to_le_bytes()); 810 | 811 | self.written.push(kind); 812 | 813 | self.written.extend_from_slice(&term.to_le_bytes()); 814 | assert_eq!(self.written.len(), 33); 815 | 816 | self.writer.write_all(&self.written).unwrap(); 817 | } 818 | 819 | fn data(&mut self, data: &[u8]) { 820 | let offset = self.written.len(); 821 | self.written.extend_from_slice(data); 822 | self.writer.write_all(&self.written[offset..]).unwrap(); 823 | } 824 | 825 | fn done(&mut self) { 826 | let checksum = crc32c(&self.written); 827 | 828 | self.writer.write_all(&checksum.to_le_bytes()).unwrap(); 829 | self.writer.flush().unwrap(); 830 | } 831 | } 832 | 833 | #[derive(Debug, PartialEq)] 834 | struct RequestVoteRequest { 835 | request_id: u64, // Not part of Raft. Used only for debugging. 836 | term: u64, 837 | candidate_id: u128, 838 | last_log_index: u64, 839 | last_log_term: u64, 840 | } 841 | 842 | impl RequestVoteRequest { 843 | fn decode<T: std::io::Read>( 844 | mut reader: BufReader<T>, 845 | metadata: [u8; 33], 846 | request_id: u64, 847 | term: u64, 848 | ) -> Result<RPCBody, String> { 849 | let mut buffer: [u8; 69] = [0; 69]; 850 | buffer[0..metadata.len()].copy_from_slice(&metadata); 851 | reader.read_exact(&mut buffer[metadata.len()..]).unwrap(); 852 | 853 | let checksum = u32::from_le_bytes(buffer[65..69].try_into().unwrap()); 854 | if checksum != crc32c(&buffer[0..65]) { 855 | return Err("Bad checksum.".into()); 856 | } 857 | 858 | let candidate_id = u128::from_le_bytes(buffer[33..49].try_into().unwrap()); 859 | let last_log_index = u64::from_le_bytes(buffer[49..57].try_into().unwrap()); 860 | let last_log_term = u64::from_le_bytes(buffer[57..65].try_into().unwrap()); 861 | 862 | Ok(RPCBody::RequestVoteRequest(RequestVoteRequest { 863 | request_id, 864 | term, 865 | candidate_id, 866 | last_log_index, 867 | last_log_term, 868 | })) 869 | } 870 | 871 | fn encode<T: std::io::Write>(&self, encoder: &mut RPCMessageEncoder<T>) { 872 | encoder.metadata(RPCBodyKind::RequestVoteRequest as u8, self.term); 873 | encoder.data(&self.candidate_id.to_le_bytes()); 874 | encoder.data(&self.last_log_index.to_le_bytes()); 875 | encoder.data(&self.last_log_term.to_le_bytes()); 876 | encoder.done(); 877 | } 878 | } 879 | 880 | #[derive(Debug, PartialEq)] 881 | struct RequestVoteResponse { 882 | request_id: u64, // Not part of Raft. Used only for debugging. 
883 | term: u64, 884 | vote_granted: bool, 885 | } 886 | 887 | impl RequestVoteResponse { 888 | fn decode<T: std::io::Read>( 889 | mut reader: BufReader<T>, 890 | metadata: [u8; 33], 891 | request_id: u64, 892 | term: u64, 893 | ) -> Result<RPCBody, String> { 894 | let mut buffer: [u8; 38] = [0; 38]; 895 | buffer[0..metadata.len()].copy_from_slice(&metadata); 896 | reader.read_exact(&mut buffer[metadata.len()..]).unwrap(); 897 | 898 | let checksum = u32::from_le_bytes(buffer[34..38].try_into().unwrap()); 899 | if checksum != crc32c(&buffer[0..34]) { 900 | return Err("Bad checksum.".into()); 901 | } 902 | 903 | Ok(RPCBody::RequestVoteResponse(RequestVoteResponse { 904 | request_id, 905 | term, 906 | vote_granted: buffer[33] == 1, 907 | })) 908 | } 909 | 910 | fn encode<T: std::io::Write>(&self, encoder: &mut RPCMessageEncoder<T>) { 911 | encoder.metadata(RPCBodyKind::RequestVoteResponse as u8, self.term); 912 | encoder.data(&[self.vote_granted as u8]); 913 | encoder.done(); 914 | } 915 | } 916 | 917 | #[derive(Debug, PartialEq)] 918 | struct AppendEntriesRequest { 919 | request_id: u64, // Not part of Raft. Used only for debugging. 920 | term: u64, 921 | leader_id: u128, 922 | prev_log_index: u64, 923 | prev_log_term: u64, 924 | entries: Vec<LogEntry>, 925 | leader_commit: u64, 926 | } 927 | 928 | impl AppendEntriesRequest { 929 | fn decode<T: std::io::Read>( 930 | mut reader: BufReader<T>, 931 | metadata: [u8; 33], 932 | request_id: u64, 933 | term: u64, 934 | ) -> Result<RPCBody, String> { 935 | let mut buffer: [u8; 78] = [0; 78]; 936 | buffer[0..metadata.len()].copy_from_slice(&metadata); 937 | reader.read_exact(&mut buffer[metadata.len()..]).unwrap(); 938 | 939 | let checksum = u32::from_le_bytes(buffer[74..78].try_into().unwrap()); 940 | if checksum != crc32c(&buffer[0..74]) { 941 | return Err("Bad checksum.".into()); 942 | } 943 | 944 | let leader_id = u128::from_le_bytes(buffer[33..49].try_into().unwrap()); 945 | let prev_log_index = u64::from_le_bytes(buffer[49..57].try_into().unwrap()); 946 | let prev_log_term = u64::from_le_bytes(buffer[57..65].try_into().unwrap()); 947 | let leader_commit = u64::from_le_bytes(buffer[65..73].try_into().unwrap()); 948 | let entries_length = buffer[73] as usize; 949 | let mut entries = Vec::<LogEntry>::with_capacity(entries_length); 950 | 951 | while entries.len() < entries_length { 952 | let e = LogEntry::decode(&mut reader); 953 | entries.push(e); 954 | } 955 | 956 | Ok(RPCBody::AppendEntriesRequest(AppendEntriesRequest { 957 | request_id, 958 | term, 959 | leader_id, 960 | prev_log_index, 961 | prev_log_term, 962 | leader_commit, 963 | entries, 964 | })) 965 | } 966 | 967 | fn encode<T: std::io::Write>(&self, encoder: &mut RPCMessageEncoder<T>) { 968 | encoder.metadata(RPCBodyKind::AppendEntriesRequest as u8, self.term); 969 | encoder.data(&self.leader_id.to_le_bytes()); 970 | encoder.data(&(self.prev_log_index).to_le_bytes()); 971 | encoder.data(&self.prev_log_term.to_le_bytes()); 972 | encoder.data(&(self.leader_commit).to_le_bytes()); 973 | assert!(self.entries.len() <= 0xFF); 974 | encoder.data(&(self.entries.len() as u8).to_le_bytes()); 975 | encoder.done(); 976 | 977 | let mut buffer: [u8; PAGESIZE as usize] = [0; PAGESIZE as usize]; 978 | for entry in self.entries.iter() { 979 | entry.encode(&mut buffer, &mut encoder.writer); 980 | } 981 | encoder.writer.flush().unwrap(); 982 | } 983 | } 984 | 985 | #[derive(Debug, PartialEq, Clone)] 986 | struct AppendEntriesResponse { 987 | request_id: u64, 988 | term: u64, 989 | success: bool, 990 | 991 | // When `success == true`, `match_index` is the value the leader 992 | // should set this server's `match_index` to. 
`next_index` should 993 | // be set to `match_index + 1`. 994 | // 995 | // When `success == false`, `match_index` is the value the leader 996 | // should set this server's `next_index` to. 997 | // 998 | // This isn't something the Raft paper prescribes, though it is 999 | // used in the TLA+ spec. It seems necessary if you are supposed 1000 | // to allow multiple in-flight requests per follower. 1001 | match_index: u64, 1002 | } 1003 | 1004 | impl AppendEntriesResponse { 1005 | fn decode<T: std::io::Read>( 1006 | mut reader: BufReader<T>, 1007 | metadata: [u8; 33], 1008 | request_id: u64, 1009 | term: u64, 1010 | ) -> Result<RPCBody, String> { 1011 | let mut buffer: [u8; 46] = [0; 46]; 1012 | buffer[0..metadata.len()].copy_from_slice(&metadata); 1013 | reader.read_exact(&mut buffer[metadata.len()..]).unwrap(); 1014 | 1015 | let match_index = u64::from_le_bytes(buffer[34..42].try_into().unwrap()); 1016 | 1017 | let checksum = u32::from_le_bytes(buffer[42..46].try_into().unwrap()); 1018 | if checksum != crc32c(&buffer[0..42]) { 1019 | return Err("Bad checksum.".into()); 1020 | } 1021 | 1022 | Ok(RPCBody::AppendEntriesResponse(AppendEntriesResponse { 1023 | request_id, 1024 | term, 1025 | success: buffer[33] == 1, 1026 | match_index, 1027 | })) 1028 | } 1029 | 1030 | fn encode<T: std::io::Write>(&self, encoder: &mut RPCMessageEncoder<T>) { 1031 | encoder.metadata(RPCBodyKind::AppendEntriesResponse as u8, self.term); 1032 | encoder.data(&[self.success as u8]); 1033 | encoder.data(&self.match_index.to_le_bytes()); 1034 | encoder.done(); 1035 | } 1036 | } 1037 | 1038 | enum RPCBodyKind { 1039 | RequestVoteRequest = 0, 1040 | RequestVoteResponse = 1, 1041 | AppendEntriesRequest = 2, 1042 | AppendEntriesResponse = 3, 1043 | } 1044 | 1045 | #[derive(Debug, PartialEq)] 1046 | enum RPCBody { 1047 | RequestVoteRequest(RequestVoteRequest), 1048 | RequestVoteResponse(RequestVoteResponse), 1049 | AppendEntriesRequest(AppendEntriesRequest), 1050 | AppendEntriesResponse(AppendEntriesResponse), 1051 | } 1052 | 1053 | impl RPCBody { 1054 | fn term(&self) -> u64 { 1055 | match self { 1056 | RPCBody::RequestVoteRequest(r) => r.term, 1057 | RPCBody::RequestVoteResponse(r) => r.term, 1058 | RPCBody::AppendEntriesRequest(r) => r.term, 1059 | RPCBody::AppendEntriesResponse(r) => r.term, 1060 | } 1061 | } 1062 | 1063 | fn request_id(&self) -> u64 { 1064 | match self { 1065 | RPCBody::RequestVoteRequest(r) => r.request_id, 1066 | RPCBody::RequestVoteResponse(r) => r.request_id, 1067 | RPCBody::AppendEntriesRequest(r) => r.request_id, 1068 | RPCBody::AppendEntriesResponse(r) => r.request_id, 1069 | } 1070 | } 1071 | } 1072 | 1073 | #[derive(Debug, PartialEq)] 1074 | struct RPCMessage { 1075 | from: u128, 1076 | body: RPCBody, 1077 | } 1078 | 1079 | impl RPCMessage { 1080 | fn term(&self) -> u64 { 1081 | self.body.term() 1082 | } 1083 | 1084 | fn request_id(&self) -> u64 { 1085 | self.body.request_id() 1086 | } 1087 | 1088 | fn decode<T: std::io::Read>(mut reader: BufReader<T>) -> Result<RPCMessage, String> { 1089 | let mut metadata: [u8; 33] = [0; 33]; 1090 | if reader.read_exact(&mut metadata).is_err() { 1091 | return Err("Could not read metadata.".into()); 1092 | } 1093 | 1094 | let request_id = u64::from_le_bytes(metadata[0..8].try_into().unwrap()); 1095 | let server_id = u128::from_le_bytes(metadata[8..24].try_into().unwrap()); 1096 | 1097 | let message_type = metadata[24]; 1098 | let term = u64::from_le_bytes(metadata[25..33].try_into().unwrap()); 1099 | let body = if message_type == RPCBodyKind::RequestVoteRequest as u8 { 1100 | RequestVoteRequest::decode(reader, metadata, request_id, term) 1101 | } else if 
message_type == RPCBodyKind::RequestVoteResponse as u8 { 1102 | RequestVoteResponse::decode(reader, metadata, request_id, term) 1103 | } else if message_type == RPCBodyKind::AppendEntriesRequest as u8 { 1104 | AppendEntriesRequest::decode(reader, metadata, request_id, term) 1105 | } else if message_type == RPCBodyKind::AppendEntriesResponse as u8 { 1106 | AppendEntriesResponse::decode(reader, metadata, request_id, term) 1107 | } else { 1108 | return Err(format!("Unknown request type: {}.", message_type)); 1109 | }; 1110 | 1111 | Ok(RPCMessage { 1112 | from: server_id, 1113 | body: body?, 1114 | }) 1115 | } 1116 | 1117 | fn encode<T: std::io::Write>(&self, sender_id: u128, writer: BufWriter<T>) { 1118 | let encoder = &mut RPCMessageEncoder::new(self.request_id(), sender_id, writer); 1119 | match &self.body { 1120 | RPCBody::RequestVoteRequest(rvr) => rvr.encode(encoder), 1121 | RPCBody::RequestVoteResponse(rvr) => rvr.encode(encoder), 1122 | RPCBody::AppendEntriesRequest(aer) => aer.encode(encoder), 1123 | RPCBody::AppendEntriesResponse(aer) => aer.encode(encoder), 1124 | }; 1125 | } 1126 | } 1127 | 1128 | #[derive(Clone)] 1129 | struct Logger { 1130 | server_id: u128, 1131 | debug: bool, 1132 | } 1133 | 1134 | impl Logger { 1135 | fn log<S: AsRef<str> + std::fmt::Display>( 1136 | &self, 1137 | term: u64, 1138 | log_length: u64, 1139 | condition: Condition, 1140 | msg: S, 1141 | ) { 1142 | if !self.debug { 1143 | return; 1144 | } 1145 | 1146 | println!( 1147 | "[S: {: <3} T: {: <3} L: {: <3} C: {}] {}", 1148 | self.server_id, 1149 | term, 1150 | log_length, 1151 | match condition { 1152 | Condition::Leader => "L", 1153 | Condition::Candidate => "C", 1154 | Condition::Follower => "F", 1155 | }, 1156 | msg 1157 | ); 1158 | } 1159 | } 1160 | 1161 | struct RPCManager { 1162 | cluster: Vec<ServerConfig>, 1163 | server_id: u128, 1164 | stream_sender: mpsc::Sender<RPCMessage>, 1165 | stream_receiver: mpsc::Receiver<RPCMessage>, 1166 | stop_mutex: Arc<Mutex<bool>>, 1167 | logger: Logger, 1168 | } 1169 | 1170 | impl RPCManager { 1171 | fn new(server_id: u128, cluster: Vec<ServerConfig>, logger: Logger) -> RPCManager { 1172 | let (stream_sender, stream_receiver): ( 1173 | mpsc::Sender<RPCMessage>, 1174 | mpsc::Receiver<RPCMessage>, 1175 | ) = mpsc::channel(); 1176 | RPCManager { 1177 | logger, 1178 | cluster, 1179 | server_id, 1180 | stream_sender, 1181 | stream_receiver, 1182 | stop_mutex: Arc::new(Mutex::new(false)), 1183 | } 1184 | } 1185 | 1186 | fn address_from_id(&self, id: u128) -> SocketAddr { 1187 | for server in self.cluster.iter() { 1188 | if server.id == id { 1189 | return server.address; 1190 | } 1191 | } 1192 | 1193 | panic!("Bad Server Id for configuration.") 1194 | } 1195 | 1196 | fn start(&mut self) { 1197 | let address = self.address_from_id(self.server_id); 1198 | 1199 | let thread_stop = self.stop_mutex.clone(); 1200 | let thread_stream_sender = self.stream_sender.clone(); 1201 | std::thread::spawn(move || { 1202 | loop { 1203 | let listener = match std::net::TcpListener::bind(address) { 1204 | Ok(l) => l, 1205 | Err(e) => panic!("Could not bind to {address}: {e}."), 1206 | }; 1207 | 1208 | for stream in listener.incoming().flatten() { 1209 | // For this logic to be triggered, we must create a 1210 | // connection to our own server after setting 1211 | // `thread_stop` to `true`. 1212 | let stop = thread_stop.lock().unwrap(); 1213 | if *stop { 1214 | return; 1215 | } 1216 | 1217 | let bufreader = BufReader::new(stream); 1218 | match RPCMessage::decode(bufreader) { 1219 | Ok(msg) => thread_stream_sender.send(msg).unwrap(), 1220 | Err(msg) => panic!("Could not read request. 
Error: {}.", msg), 1221 | } 1222 | } 1223 | } 1224 | }); 1225 | } 1226 | 1227 | fn send( 1228 | &mut self, 1229 | log_length: u64, 1230 | condition: Condition, 1231 | to_server_id: u128, 1232 | message: &RPCMessage, 1233 | ) { 1234 | let address = self.address_from_id(to_server_id); 1235 | let server_id = self.server_id; 1236 | 1237 | self.logger.log( 1238 | message.term(), 1239 | log_length, 1240 | condition, 1241 | format!("Sending {:?} to {}.", message.body, to_server_id), 1242 | ); 1243 | let stream = if let Ok(stream) = std::net::TcpStream::connect(address) { 1244 | stream 1245 | } else { 1246 | self.logger.log( 1247 | message.term(), 1248 | log_length, 1249 | condition, 1250 | format!("Could not connect to {to_server_id}."), 1251 | ); 1252 | return; 1253 | }; 1254 | let bufwriter = BufWriter::new(stream.try_clone().unwrap()); 1255 | message.encode(server_id, bufwriter); 1256 | } 1257 | } 1258 | 1259 | #[derive(Debug, PartialEq)] 1260 | pub enum ApplyResult { 1261 | NotALeader, 1262 | Ok, 1263 | } 1264 | 1265 | pub trait StateMachine { 1266 | fn apply(&self, messages: Vec>) -> Vec>; 1267 | } 1268 | 1269 | #[derive(Copy, Clone)] 1270 | pub struct ServerConfig { 1271 | id: u128, 1272 | address: SocketAddr, 1273 | } 1274 | 1275 | pub struct Config { 1276 | // Cluster configuration. 1277 | server_index: usize, 1278 | server_id: u128, 1279 | cluster: Vec, 1280 | 1281 | // Timing configuration. 1282 | election_frequency: Duration, 1283 | 1284 | // Random. 1285 | random_seed: [u64; 4], 1286 | 1287 | // Logger. 1288 | logger_debug: bool, 1289 | 1290 | page_cache_size: usize, 1291 | } 1292 | 1293 | pub struct Server { 1294 | config: Config, 1295 | 1296 | sm: SM, 1297 | rpc_manager: RPCManager, 1298 | 1299 | state: Mutex, 1300 | 1301 | client_id: u128, 1302 | apply_sender: mpsc::Sender>, 1303 | } 1304 | 1305 | impl Drop for Server { 1306 | fn drop(&mut self) { 1307 | self.stop(); 1308 | } 1309 | } 1310 | 1311 | impl Server { 1312 | pub fn apply(&mut self, commands: Vec>, command_ids: Vec) -> ApplyResult { 1313 | assert_eq!(commands.len(), command_ids.len()); 1314 | 1315 | // Append commands to local durable state if leader. 1316 | let mut state = self.state.lock().unwrap(); 1317 | if state.volatile.condition != Condition::Leader { 1318 | return ApplyResult::NotALeader; 1319 | } 1320 | 1321 | let mut entries = Vec::with_capacity(commands.len()); 1322 | for (i, &id) in command_ids.iter().enumerate() { 1323 | assert_ne!(id, 0); 1324 | 1325 | entries.push(LogEntry { 1326 | index: 0, 1327 | term: state.durable.current_term, 1328 | command: commands[i].clone(), 1329 | client_serial_id: id, 1330 | client_id: self.client_id, 1331 | }); 1332 | } 1333 | 1334 | state.durable.append(&mut entries); 1335 | 1336 | // TODO: How to handle timeouts? 1337 | ApplyResult::Ok 1338 | } 1339 | 1340 | fn handle_request_vote_request( 1341 | &mut self, 1342 | request: RequestVoteRequest, 1343 | _: u128, 1344 | ) -> Option { 1345 | let mut state = self.state.lock().unwrap(); 1346 | let term = state.durable.current_term; 1347 | let false_request = RPCBody::RequestVoteResponse(RequestVoteResponse { 1348 | request_id: request.request_id, 1349 | term, 1350 | vote_granted: false, 1351 | }); 1352 | 1353 | if request.term < term { 1354 | return Some(false_request); 1355 | } 1356 | // If it isn't less than, local state would already have been 1357 | // modified so me.term == request.term in handle_message. 
1358 | assert_eq!(request.term, term); 1359 | 1360 | let canvote = 1361 | state.durable.voted_for == 0 || state.durable.voted_for == request.candidate_id; 1362 | if !canvote { 1363 | return Some(false_request); 1364 | } 1365 | 1366 | // "2. If votedFor is null or candidateId, and candidate’s log 1367 | // is at least as up-to-date as receiver’s log, grant vote 1368 | // (§5.2, §5.4)." - Reference [0] Page 4 1369 | // 1370 | // "Raft determines which of two logs is more up-to-date 1371 | // by comparing the index and term of the last entries in the 1372 | // logs. If the logs have last entries with different terms, then 1373 | // the log with the later term is more up-to-date. If the logs 1374 | // end with the same term, then whichever log is longer is 1375 | // more up-to-date." - Reference [0] Page 8 1376 | 1377 | let log_length = state.durable.next_log_index; 1378 | let last_log_term = state.durable.last_log_term; 1379 | let vote_granted = request.last_log_term > last_log_term 1380 | || (request.last_log_term == last_log_term 1381 | && (request.last_log_index >= log_length - 1)); 1382 | state.log(format!( 1383 | "RVR mll: {log_length}, mlt: {}; rll: {}, rlt: {}", 1384 | last_log_term, request.last_log_index, request.last_log_term 1385 | )); 1386 | 1387 | if vote_granted { 1388 | state.durable.update(term, request.candidate_id); 1389 | 1390 | // Reset election timer. 1391 | // 1392 | // "If election timeout elapses without receiving AppendEntries 1393 | // RPC from current leader or granting vote to candidate: 1394 | // convert to candidate" - Reference [0] Page 4 1395 | state.reset_election_timeout(); 1396 | } 1397 | 1398 | let msg = RPCBody::RequestVoteResponse(RequestVoteResponse { 1399 | request_id: request.request_id, 1400 | term, 1401 | vote_granted, 1402 | }); 1403 | Some(msg) 1404 | } 1405 | 1406 | fn handle_request_vote_response( 1407 | &mut self, 1408 | response: RequestVoteResponse, 1409 | _: u128, 1410 | ) -> Option<RPCBody> { 1411 | let mut state = self.state.lock().unwrap(); 1412 | if state.volatile.condition != Condition::Candidate { 1413 | return None; 1414 | } 1415 | 1416 | let quorum = self.config.cluster.len() / 2; 1417 | assert!(quorum > 0 || (self.config.cluster.len() == 1 && quorum == 0)); 1418 | 1419 | if response.vote_granted { 1420 | state.volatile.votes += 1; 1421 | // This will not handle the case where a single-server 1422 | // cluster needs to become the leader. 1423 | if state.volatile.votes == quorum { 1424 | drop(state); 1425 | self.candidate_become_leader(); 1426 | } 1427 | } 1428 | 1429 | None 1430 | } 1431 | 1432 | fn handle_append_entries_request( 1433 | &mut self, 1434 | mut request: AppendEntriesRequest, 1435 | from: u128, 1436 | ) -> Option<RPCBody> { 1437 | let mut state = self.state.lock().unwrap(); 1438 | let term = state.durable.current_term; 1439 | 1440 | let false_response = |match_index| -> Option<RPCBody> { 1441 | Some(RPCBody::AppendEntriesResponse(AppendEntriesResponse { 1442 | request_id: request.request_id, 1443 | term, 1444 | success: false, 1445 | match_index, 1446 | })) 1447 | }; 1448 | 1449 | // "1. Reply false if term < currentTerm (§5.1)." - Reference [0] Page 4 1450 | if request.term < term { 1451 | state.log(format!( 1452 | "Cannot accept AppendEntries from {} because it is out of date ({} < {}).", 1453 | from, request.term, term 1454 | )); 1455 | return false_response(0); 1456 | } 1457 | 1458 | // "If AppendEntries RPC received from new leader: convert to 1459 | // follower." 
- Reference [0] Page 4 1460 | if state.volatile.condition == Condition::Candidate { 1461 | state.transition(Condition::Follower, 0, 0); 1462 | } 1463 | 1464 | if state.volatile.condition != Condition::Follower { 1465 | assert_eq!(state.volatile.condition, Condition::Leader); 1466 | state.log(format!( 1467 | "Cannot accept AppendEntries from {} because I am a leader.", 1468 | from 1469 | )); 1470 | return false_response(0); 1471 | } 1472 | 1473 | // Reset heartbeat timer because we've now heard from a valid leader. 1474 | state.reset_election_timeout(); 1475 | 1476 | // "Reply false if log doesn’t contain an entry at prevLogIndex 1477 | // whose term matches prevLogTerm (§5.3)." - Reference [0] Page 4 1478 | if request.prev_log_index > 0 { 1479 | if request.prev_log_index >= state.durable.next_log_index { 1480 | state.log(format!("Cannot accept AppendEntries from {} because prev_log_index ({}) is ahead of my log ({}).", from, request.prev_log_index, state.durable.next_log_index)); 1481 | return false_response(std::cmp::max(state.durable.next_log_index, 1) - 1); 1482 | } 1483 | 1484 | let e = state.durable.log_at_index(request.prev_log_index); 1485 | if e.term != request.prev_log_term { 1486 | assert!(request.prev_log_index > 0); 1487 | state.log(format!("Cannot accept AppendEntries from {} because prev_log_term ({}) does not match mine ({}).", from, request.prev_log_term, e.term)); 1488 | return false_response(request.prev_log_index - 1); 1489 | } 1490 | } 1491 | 1492 | // "If an existing entry conflicts with a new one (same index 1493 | // but different terms), delete the existing entry and all that 1494 | // follow it (§5.3)." - Reference [0] Page 4 1495 | let mut append_offset = 0; 1496 | let mut real_index = request.prev_log_index + 1; 1497 | for entry in request.entries.iter() { 1498 | if real_index == state.durable.next_log_index { 1499 | // Found a new entry, no need to look it up. 1500 | break; 1501 | } 1502 | 1503 | let e = state.durable.log_at_index(real_index); 1504 | if e.term != entry.term { 1505 | break; 1506 | } 1507 | 1508 | real_index += 1; 1509 | append_offset += 1; 1510 | } 1511 | 1512 | // 4. Append any new entries not already in the log 1513 | state 1514 | .durable 1515 | .append_from_index(&mut request.entries[append_offset..], real_index); 1516 | 1517 | // "If leaderCommit > commitIndex, set commitIndex = 1518 | // min(leaderCommit, index of last new entry)." 
- Reference [0] Page 4 1519 | if request.leader_commit > state.volatile.commit_index { 1520 | state.volatile.commit_index = std::cmp::min( 1521 | request.leader_commit, 1522 | std::cmp::max(state.durable.next_log_index, 1) - 1, 1523 | ); 1524 | } 1525 | 1526 | Some(RPCBody::AppendEntriesResponse(AppendEntriesResponse { 1527 | request_id: request.request_id, 1528 | term, 1529 | success: true, 1530 | match_index: request.prev_log_index + request.entries.len() as u64, 1531 | })) 1532 | } 1533 | 1534 | fn handle_append_entries_response( 1535 | &mut self, 1536 | response: AppendEntriesResponse, 1537 | from: u128, 1538 | ) -> Option<RPCBody> { 1539 | let mut state = self.state.lock().unwrap(); 1540 | if state.volatile.condition != Condition::Leader { 1541 | return None; 1542 | } 1543 | 1544 | let server_index = self 1545 | .config 1546 | .cluster 1547 | .iter() 1548 | .position(|server| server.id == from) 1549 | .unwrap(); 1550 | if response.success { 1551 | let new_next_index = response.match_index + 1; 1552 | assert!(new_next_index >= state.volatile.next_index[server_index]); 1553 | state.volatile.next_index[server_index] = new_next_index; 1554 | 1555 | assert!(response.match_index >= state.volatile.match_index[server_index]); 1556 | state.volatile.match_index[server_index] = response.match_index; 1557 | state.log(format!( 1558 | "AppendEntries request to {from} successful. New match index: {}.", 1559 | response.match_index 1560 | )); 1561 | } else { 1562 | // If the request is false, match_index serves as the 1563 | // next index to try. 1564 | state.volatile.next_index[server_index] = std::cmp::max(1, response.match_index); 1565 | state.log(format!( 1566 | "AppendEntries request to {from} not successful. New next index: {}.", 1567 | state.volatile.next_index[server_index] 1568 | )); 1569 | } 1570 | 1571 | None 1572 | } 1573 | 1574 | fn handle_message(&mut self, message: RPCMessage) -> Option<(RPCMessage, u128)> { 1575 | // "If RPC request or response contains term T > currentTerm: 1576 | // set currentTerm = T, convert to follower (§5.1)." 1577 | // - Reference [0] Page 4 1578 | let mut state = self.state.lock().unwrap(); 1579 | if message.term() > state.durable.current_term { 1580 | if state.volatile.condition != Condition::Follower { 1581 | let term_diff = message.term() - state.durable.current_term; 1582 | state.transition(Condition::Follower, term_diff, 0); 1583 | } else { 1584 | state.durable.update(message.term(), 0); 1585 | } 1586 | } 1587 | drop(state); 1588 | 1589 | let rsp = match message.body { 1590 | RPCBody::RequestVoteRequest(r) => self.handle_request_vote_request(r, message.from), 1591 | RPCBody::RequestVoteResponse(r) => self.handle_request_vote_response(r, message.from), 1592 | RPCBody::AppendEntriesRequest(r) => self.handle_append_entries_request(r, message.from), 1593 | RPCBody::AppendEntriesResponse(r) => { 1594 | self.handle_append_entries_response(r, message.from) 1595 | } 1596 | }; 1597 | 1598 | Some(( 1599 | RPCMessage { 1600 | body: rsp?, 1601 | from: self.config.server_id, 1602 | }, 1603 | message.from, 1604 | )) 1605 | } 1606 | 1607 | fn leader_maybe_new_quorum(&mut self) { 1608 | // "If there exists an N such that N > commitIndex, a majority 1609 | // of matchIndex[i] ≥ N, and log[N].term == currentTerm: 1610 | // set commitIndex = N (§5.3, §5.4)." 
- Reference [0] Page 4 1611 | let quorum_needed = self.config.cluster.len() / 2; 1612 | let mut state = self.state.lock().unwrap(); 1613 | if state.volatile.condition != Condition::Leader { 1614 | return; 1615 | } 1616 | 1617 | let log_length = state.durable.next_log_index; 1618 | // NOTE: Maybe the starting point to check can be optimized 1619 | // but be careful. One iteration tried to do the highest 1620 | // common match_index. The problem is that if one node is 1621 | // down, the highest common index may even be 0 even though 1622 | // the cluster *could* make progress. 1623 | 1624 | // Check from the back (newest entries) to the front. 1625 | for i in (0..log_length).rev() { 1626 | let mut quorum = quorum_needed; 1627 | 1628 | // We don't need to keep checking backwards for a quorum 1629 | // once we reach the current commit index. 1630 | if state.volatile.commit_index == i { 1631 | break; 1632 | } 1633 | 1634 | state.log(format!("Checking for quorum ({quorum}) at index: {i}.")); 1635 | 1636 | assert!(quorum > 0 || (self.config.cluster.len() == 1 && quorum == 0)); 1637 | let e = state.durable.log_at_index(i); 1638 | for (server_index, &match_index) in state.volatile.match_index.iter().enumerate() { 1639 | // self always counts as part of quorum, so skip it in 1640 | // the count. quorum_needed already takes self into 1641 | // consideration (`len() / 2` not `len() / 2 + 1`). 1642 | if self.config.server_index == server_index { 1643 | continue; 1644 | } 1645 | 1646 | // "If there exists an N such that N > commitIndex, a majority 1647 | // of matchIndex[i] ≥ N, and log[N].term == currentTerm: 1648 | // set commitIndex = N (§5.3, §5.4)." - Reference [0] Page 4 1649 | if match_index >= i && e.term == state.durable.current_term { 1650 | state.log(format!("Exists for ({}) at {i}.", server_index)); 1651 | quorum -= 1; 1652 | if quorum == 0 { 1653 | break; 1654 | } 1655 | } else if match_index >= i { 1656 | state.log(format!( 1657 | "Does not exist for ({}) at {i} (term is {}, our term is {}).", 1658 | server_index, e.term, state.durable.current_term, 1659 | )); 1660 | } else { 1661 | state.log(format!("Does not exist for ({}) at {i}.", server_index)); 1662 | } 1663 | } 1664 | 1665 | if quorum == 0 { 1666 | state.volatile.commit_index = i; 1667 | state.log(format!("New quorum at index: {i}.")); 1668 | break; 1669 | } else { 1670 | state.log(format!("No quorum yet at index: {i}.")); 1671 | } 1672 | } 1673 | } 1674 | 1675 | fn leader_send_heartbeat(&mut self) { 1676 | // "Upon election: send initial empty AppendEntries RPCs 1677 | // (heartbeat) to each server; repeat during idle periods to 1678 | // prevent election timeouts (§5.2)." - Reference [0] Page 4 1679 | let mut state = self.state.lock().unwrap(); 1680 | if state.volatile.condition != Condition::Leader { 1681 | return; 1682 | } 1683 | 1684 | let time_for_heartbeat = state.volatile.election_timeout > Instant::now(); 1685 | // "The broadcast time should be an order of magnitude less 1686 | // than the election timeout so that leaders can reliably send 1687 | // the heartbeat messages required to keep followers from 1688 | // starting elections" - Reference [0] Page 10 1689 | state.volatile.election_timeout = Instant::now() + (self.config.election_frequency / 10); 1690 | 1691 | let my_server_id = self.config.server_id; 1692 | for (i, server) in self.config.cluster.iter().enumerate() { 1693 | // Skip self. 
1694 | if server.id == my_server_id { 1695 | continue; 1696 | } 1697 | 1698 | let next_index = state.volatile.next_index[i]; 1699 | let prev_log_index = std::cmp::max(next_index, 1) - 1; 1700 | 1701 | // Handle common case where we don't need to look up a log 1702 | // from pagecache if we're at the latest entry. 1703 | let prev_log_term = 1704 | if prev_log_index == std::cmp::max(state.durable.next_log_index, 1) - 1 { 1705 | state.durable.last_log_term 1706 | } else { 1707 | let prev_log = state.durable.log_at_index(prev_log_index); 1708 | prev_log.term 1709 | }; 1710 | 1711 | let mut entries = vec![]; 1712 | if state.durable.next_log_index > next_index { 1713 | let max_entries = 0xFF; // Length must fit into a byte. 1714 | for i in 0..max_entries { 1715 | let index = i + next_index; 1716 | // At most as many logs as we currently have. 1717 | if index == state.durable.next_log_index { 1718 | break; 1719 | } 1720 | 1721 | entries.push(state.durable.log_at_index(index)); 1722 | } 1723 | } else if !time_for_heartbeat { 1724 | // No need to send a blank request at this time. 1725 | continue; 1726 | } 1727 | 1728 | state.log(format!( 1729 | "Sending AppendEntries request. Logs: {}.", 1730 | entries.len() 1731 | )); 1732 | 1733 | let msg = RPCMessage { 1734 | from: my_server_id, 1735 | body: RPCBody::AppendEntriesRequest(AppendEntriesRequest { 1736 | request_id: state.next_request_id(), 1737 | term: state.durable.current_term, 1738 | leader_id: my_server_id, 1739 | prev_log_index, 1740 | prev_log_term, 1741 | entries, 1742 | leader_commit: state.volatile.commit_index, 1743 | }), 1744 | }; 1745 | self.rpc_manager.send( 1746 | state.durable.next_log_index, 1747 | state.volatile.condition, 1748 | server.id, 1749 | &msg, 1750 | ); 1751 | } 1752 | } 1753 | 1754 | fn follower_maybe_become_candidate(&mut self) { 1755 | // "If election timeout elapses without receiving AppendEntries 1756 | // RPC from current leader or granting vote to candidate: 1757 | // convert to candidate." - Reference [0] Page 4 1758 | let state = self.state.lock().unwrap(); 1759 | if state.volatile.condition != Condition::Follower { 1760 | return; 1761 | } 1762 | 1763 | if Instant::now() > state.volatile.election_timeout { 1764 | drop(state); 1765 | self.follower_become_candidate(); 1766 | } 1767 | } 1768 | 1769 | fn candidate_maybe_timeout(&mut self) { 1770 | let mut state = self.state.lock().unwrap(); 1771 | if state.volatile.condition != Condition::Candidate { 1772 | return; 1773 | } 1774 | 1775 | // Election timed out. Revert to follower and restart election. 
1776 | if Instant::now() > state.volatile.election_timeout { 1777 | state.transition(Condition::Follower, 0, 0); 1778 | } 1779 | } 1780 | 1781 | fn candidate_become_leader(&mut self) { 1782 | let mut state = self.state.lock().unwrap(); 1783 | if state.volatile.condition != Condition::Candidate { 1784 | return; 1785 | } 1786 | 1787 | state.volatile.reset(); 1788 | state.transition(Condition::Leader, 0, 0); 1789 | state.volatile.election_timeout = Instant::now(); 1790 | 1791 | for i in 0..self.config.cluster.len() { 1792 | // "for each server, index of the next log entry 1793 | // to send to that server (initialized to leader 1794 | // last log index + 1)" - Reference [0] Page 4 1795 | state.volatile.next_index[i] = state.durable.next_log_index; 1796 | 1797 | // "for each server, index of highest log entry 1798 | // known to be replicated on server 1799 | // (initialized to 0, increases monotonically)" - Reference [0] Page 4 1800 | state.volatile.match_index[i] = 0; 1801 | } 1802 | 1803 | // "First, a leader must have the latest information on which 1804 | // entries are committed. The Leader Completeness Property 1805 | // guarantees that a leader has all committed entries, but at 1806 | // the start of its term, it may not know which those are. To 1807 | // find out, it needs to commit an entry from its term. Raft 1808 | // handles this by having each leader commit a blank no-op 1809 | // entry into the log at the start of its term." - Reference 1810 | // [0] Page 13 1811 | let term = state.durable.current_term; 1812 | state.durable.append(&mut [LogEntry { 1813 | index: 0, 1814 | command: vec![], 1815 | term, 1816 | client_serial_id: 0, 1817 | client_id: 0, 1818 | }]); 1819 | 1820 | drop(state); 1821 | 1822 | self.leader_send_heartbeat(); 1823 | } 1824 | 1825 | fn follower_become_candidate(&mut self) { 1826 | let mut state = self.state.lock().unwrap(); 1827 | if state.volatile.condition != Condition::Follower { 1828 | return; 1829 | } 1830 | 1831 | // "On conversion to candidate, start election: 1832 | // • Increment currentTerm 1833 | // • Vote for self 1834 | // • Reset election timer 1835 | // • Send RequestVote RPCs to all other servers" - Reference [0] Page 4 1836 | state.transition(Condition::Candidate, 1, self.config.server_id); 1837 | state.reset_election_timeout(); 1838 | 1839 | // Trivial case. In a single-server cluster, the server is the 1840 | // leader. 1841 | if self.config.cluster.len() == 1 { 1842 | drop(state); 1843 | self.candidate_become_leader(); 1844 | return; 1845 | } 1846 | 1847 | for server in self.config.cluster.iter() { 1848 | // Skip self. 
1849 |             if server.id == self.config.server_id {
1850 |                 continue;
1851 |             }
1852 |
1853 |             let msg = &RPCMessage {
1854 |                 body: RPCBody::RequestVoteRequest(RequestVoteRequest {
1855 |                     request_id: state.next_request_id(),
1856 |                     term: state.durable.current_term,
1857 |                     candidate_id: self.config.server_id,
1858 |                     last_log_index: std::cmp::max(state.durable.next_log_index, 1) - 1,
1859 |                     last_log_term: state.durable.last_log_term,
1860 |                 }),
1861 |                 from: self.config.server_id,
1862 |             };
1863 |
1864 |             self.rpc_manager.send(
1865 |                 state.durable.next_log_index,
1866 |                 state.volatile.condition,
1867 |                 server.id,
1868 |                 msg,
1869 |             );
1870 |         }
1871 |     }
1872 |
1873 |     fn apply_entries(&mut self) {
1874 |         let mut state = self.state.lock().unwrap();
1875 |         let mut to_apply = Vec::<Vec<u8>>::new();
1876 |         let starting_index = state.volatile.last_applied + 1;
1877 |         while state.volatile.last_applied < state.volatile.commit_index {
1878 |             state.volatile.last_applied += 1;
1879 |             let last_applied = state.volatile.last_applied;
1880 |             to_apply.push(state.durable.log_at_index(last_applied).command);
1881 |         }
1882 |
1883 |         if !to_apply.is_empty() {
1884 |             let results = self.sm.apply(to_apply);
1885 |             for (i, result) in results.into_iter().enumerate() {
1886 |                 let e = state.durable.log_at_index(starting_index + i as u64);
1887 |                 if e.client_id == self.client_id {
1888 |                     self.apply_sender.send(result).unwrap();
1889 |                 }
1890 |             }
1891 |
1892 |             state.log(format!("Entries applied: {}.", state.volatile.last_applied));
1893 |         }
1894 |     }
1895 |
1896 |     pub fn init(&mut self) {
1897 |         self.restore();
1898 |
1899 |         self.rpc_manager.start();
1900 |
1901 |         let mut state = self.state.lock().unwrap();
1902 |         state.reset_election_timeout();
1903 |         if self.config.cluster.len() == 1 {
1904 |             state.transition(Condition::Leader, 0, 0);
1905 |         }
1906 |     }
1907 |
1908 |     pub fn stop(&mut self) {
1909 |         let mut state = match self.state.lock() {
1910 |             Ok(s) => s,
1911 |             Err(p) => p.into_inner(),
1912 |         };
1913 |         state.log("Stopping.");
1914 |         // Prevent server from accepting any more log entries.
1915 |         if state.volatile.condition != Condition::Follower {
1916 |             state.transition(Condition::Follower, 0, 0);
1917 |         }
1918 |         state.stopped = true;
1919 |
1920 |         // Stop RPCManager.
1921 |         let mut stop = self.rpc_manager.stop_mutex.lock().unwrap();
1922 |         *stop = true;
1923 |         drop(stop);
1924 |
1925 |         // Make an empty connection to self so the stop_mutex logic gets triggered.
1926 |         let address = self.rpc_manager.address_from_id(self.config.server_id);
1927 |         _ = std::net::TcpStream::connect(address);
1928 |     }
1929 |
1930 |     pub fn tick(&mut self) {
1931 |         let t1 = Instant::now();
1932 |         let state = self.state.lock().unwrap();
1933 |         if state.stopped {
1934 |             return;
1935 |         }
1936 |         state.log(format!(
1937 |             "Tick start. Log length: {}, commit index: {}. Timeout in: {}ms.",
1938 |             state.durable.next_log_index,
1939 |             state.volatile.commit_index,
1940 |             (state.volatile.election_timeout - Instant::now()).as_millis(),
1941 |         ));
1942 |         drop(state);
1943 |
1944 |         // Leader operations.
1945 |         self.leader_send_heartbeat();
1946 |         self.leader_maybe_new_quorum();
1947 |
1948 |         // Follower operations.
1949 |         self.follower_maybe_become_candidate();
1950 |
1951 |         // Candidate operations.
1952 |         self.candidate_maybe_timeout();
1953 |
1954 |         // All condition operations.
1955 |         self.apply_entries();
1956 |
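        // Note that handling a message below can change this server's
        // condition (e.g. stepping down after seeing a higher term);
        // the role-specific operations above will pick that up on the
        // next tick.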
1957 |         // Read from the backlog at least once and for at most 5ms.
1958 |         let until = Instant::now() + Duration::from_millis(5);
1959 |         while let Ok(msg) = self.rpc_manager.stream_receiver.try_recv() {
1960 |             let state = self.state.lock().unwrap();
1961 |
1962 |             // "If a server receives a request with a stale term
1963 |             // number, it rejects the request." - Reference [0] Page 5.
1964 |             // Also: https://github.com/ongardie/raft.tla/blob/974fff7236545912c035ff8041582864449d0ffe/raft.tla#L413.
1965 |             let stale = msg.term() < state.durable.current_term;
1966 |
1967 |             state.log(format!(
1968 |                 "{} message from {}: {:?}.",
1969 |                 if stale { "Dropping stale" } else { "Received" },
1970 |                 msg.from,
1971 |                 msg.body,
1972 |             ));
1973 |             drop(state);
1974 |             if stale {
1975 |                 continue;
1976 |             }
1977 |
1978 |             if let Some((response, from)) = self.handle_message(msg) {
1979 |                 let state = self.state.lock().unwrap();
1980 |                 let condition = state.volatile.condition;
1981 |                 let log_length = state.durable.next_log_index;
1982 |                 drop(state);
1983 |
1984 |                 self.rpc_manager
1985 |                     .send(log_length, condition, from, &response);
1986 |             }
1987 |
1988 |             if Instant::now() > until {
1989 |                 break;
1990 |             }
1991 |         }
1992 |
1993 |         let took = (Instant::now() - t1).as_millis();
1994 |         if took > 0 {
1995 |             let state = self.state.lock().unwrap();
1996 |             state.log(format!("WARNING! Tick completed in {}ms.", took));
1997 |         }
1998 |     }
1999 |
2000 |     pub fn restore(&self) {
2001 |         let mut state = self.state.lock().unwrap();
2002 |         state.durable.restore();
2003 |     }
2004 |
2005 |     pub fn new(
2006 |         client_id: u128,
2007 |         data_directory: &std::path::Path,
2008 |         sm: SM,
2009 |         config: Config,
2010 |     ) -> (Server<SM>, mpsc::Receiver<Vec<u8>>) {
2011 |         for server in config.cluster.iter() {
2012 |             assert_ne!(server.id, 0);
2013 |         }
2014 |
2015 |         // 0 is reserved for control messages.
2016 |         assert!(client_id > 0);
2017 |
2018 |         let cluster_size = config.cluster.len();
2019 |         let logger = Logger {
2020 |             server_id: config.server_id,
2021 |             debug: config.logger_debug,
2022 |         };
2023 |         let rpc_manager = RPCManager::new(config.server_id, config.cluster.clone(), logger.clone());
2024 |         let election_frequency = config.election_frequency;
2025 |
2026 |         let rand = Random::new(config.random_seed);
2027 |         let id = config.server_id;
2028 |         let page_cache_size = config.page_cache_size;
2029 |
2030 |         let (apply_sender, apply_receiver): (mpsc::Sender<Vec<u8>>, mpsc::Receiver<Vec<u8>>) =
2031 |             mpsc::channel();
2032 |
2033 |         (
2034 |             Server {
2035 |                 client_id,
2036 |
2037 |                 rpc_manager,
2038 |                 config,
2039 |                 sm,
2040 |
2041 |                 apply_sender,
2042 |
2043 |                 state: Mutex::new(State {
2044 |                     durable: DurableState::new(data_directory, id, page_cache_size),
2045 |                     volatile: VolatileState::new(cluster_size, election_frequency, rand),
2046 |                     logger,
2047 |                     stopped: false,
2048 |                 }),
2049 |             },
2050 |             apply_receiver,
2051 |         )
2052 |     }
2053 | }
2054 |
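// A sketch of how these pieces fit together (hypothetical usage,
// assuming a `sm: impl StateMachine`, a `config: Config` and a data
// directory; the tests below do essentially this):
//
//     let (mut server, results) = Server::new(client_id, &dir, sm, config);
//     server.init();
//     loop {
//         server.tick();
//         // After a successful `apply`, results for this client's
//         // committed commands eventually show up on `results`.
//         if let Ok(result) = results.try_recv() { /* ... */ }
//         std::thread::sleep(std::time::Duration::from_millis(1));
//     }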
2055 | #[cfg(test)]
2056 | mod server_tests {
2057 |     use super::*;
2058 |
2059 |     pub struct TmpDir {
2060 |         pub dir: std::path::PathBuf,
2061 |     }
2062 |
2063 |     impl TmpDir {
2064 |         pub fn new() -> TmpDir {
2065 |             let mut counter: u32 = 0;
2066 |             loop {
2067 |                 let dir = format!("test{}", counter);
2068 |                 // Atomically try to create a new directory.
2069 |                 if std::fs::create_dir(&dir).is_ok() {
2070 |                     return TmpDir { dir: dir.into() };
2071 |                 }
2072 |
2073 |                 counter += 1;
2074 |             }
2075 |         }
2076 |     }
2077 |
2078 |     // Delete the temp directory when it goes out of scope.
2079 |     impl Drop for TmpDir {
2080 |         fn drop(&mut self) {
2081 |             std::fs::remove_dir_all(&self.dir).unwrap();
2082 |         }
2083 |     }
2084 |
2085 |     #[test]
2086 |     fn test_update_and_restore() {
2087 |         let tmp = TmpDir::new();
2088 |
2089 |         let mut durable = DurableState::new(&tmp.dir, 1, 1);
2090 |         durable.restore();
2091 |         assert_eq!(durable.current_term, 0);
2092 |         assert_eq!(durable.voted_for, 0);
2093 |         assert_eq!(durable.next_log_index, 1);
2094 |         assert_eq!(
2095 |             durable.log_at_index(0),
2096 |             LogEntry {
2097 |                 index: 0,
2098 |                 term: 0,
2099 |                 command: vec![],
2100 |                 client_serial_id: 0,
2101 |                 client_id: 0,
2102 |             }
2103 |         );
2104 |
2105 |         durable.update(10234, 40592);
2106 |         assert_eq!(durable.current_term, 10234);
2107 |         assert_eq!(durable.voted_for, 40592);
2108 |         assert_eq!(durable.next_log_index, 1);
2109 |         assert_eq!(
2110 |             durable.log_at_index(0),
2111 |             LogEntry {
2112 |                 index: 0,
2113 |                 term: 0,
2114 |                 command: vec![],
2115 |                 client_serial_id: 0,
2116 |                 client_id: 0,
2117 |             }
2118 |         );
2119 |         drop(durable);
2120 |
2121 |         let mut durable = DurableState::new(&tmp.dir, 1, 1);
2122 |         assert_eq!(durable.current_term, 0);
2123 |         assert_eq!(durable.voted_for, 0);
2124 |         assert_eq!(durable.next_log_index, 0);
2125 |         assert_eq!(
2126 |             durable.log_at_index(0),
2127 |             LogEntry {
2128 |                 index: 0,
2129 |                 term: 0,
2130 |                 command: vec![],
2131 |                 client_serial_id: 0,
2132 |                 client_id: 0,
2133 |             }
2134 |         );
2135 |
2136 |         durable.restore();
2137 |         assert_eq!(durable.current_term, 10234);
2138 |         assert_eq!(durable.voted_for, 40592);
2139 |         assert_eq!(durable.next_log_index, 1);
2140 |         assert_eq!(
2141 |             durable.log_at_index(0),
2142 |             LogEntry {
2143 |                 index: 0,
2144 |                 term: 0,
2145 |                 command: vec![],
2146 |                 client_serial_id: 0,
2147 |                 client_id: 0,
2148 |             }
2149 |         );
2150 |     }
2151 |
2152 |     #[test]
2153 |     fn test_log_append() {
2154 |         let tmp = TmpDir::new();
2155 |
2156 |         let mut v = Vec::<LogEntry>::new();
2157 |         v.push(LogEntry {
2158 |             index: 1,
2159 |             term: 0,
2160 |             command: "abcdef123456".as_bytes().to_vec(),
2161 |             client_serial_id: 0,
2162 |             client_id: 0,
2163 |         });
2164 |         v.push(LogEntry {
2165 |             index: 2,
2166 |             term: 0,
2167 |             command: "foobar".as_bytes().to_vec(),
2168 |             client_serial_id: 0,
2169 |             client_id: 0,
2170 |         });
2171 |
2172 |         // Write two entries and shut down.
2173 |         let mut durable = DurableState::new(&tmp.dir, 1, 1);
2174 |         durable.restore();
2175 |         assert_eq!(durable.next_log_index, 1);
2176 |         durable.append(&mut v);
2177 |         assert_eq!(durable.next_log_index, 3);
2178 |         let prev_offset = durable.next_log_offset;
2179 |         drop(durable);
2180 |
2181 |         // Start up and restore. Should have three entries.
2182 |         let mut durable = DurableState::new(&tmp.dir, 1, 1);
2183 |         durable.restore();
2184 |         assert_eq!(prev_offset, durable.next_log_offset);
2185 |         assert_eq!(durable.next_log_index, 3);
2186 |
2187 |         for (i, entry) in v.iter().enumerate() {
2188 |             assert_eq!(durable.log_at_index(1 + i as u64), *entry);
2189 |         }
2190 |         // Append five more entries (the two originals, reversed, plus three long commands) and shut down.
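        // (Note: `append` takes `&mut` because it appears to rewrite
        // each entry's `index` field to the entry's actual position in
        // the log; that is why the reversed copies still compare equal
        // after insertion below.)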
2191 | let before_reverse = v.clone(); 2192 | v.reverse(); 2193 | let longcommand = b"a".repeat(10_000); 2194 | let longcommand2 = b"a".repeat(PAGESIZE as usize); 2195 | let longcommand3 = b"a".repeat(1 + PAGESIZE as usize); 2196 | v.push(LogEntry { 2197 | index: 3, 2198 | command: longcommand.to_vec(), 2199 | term: 0, 2200 | client_serial_id: 0, 2201 | client_id: 0, 2202 | }); 2203 | v.push(LogEntry { 2204 | index: 4, 2205 | command: longcommand2.to_vec(), 2206 | term: 0, 2207 | client_serial_id: 0, 2208 | client_id: 0, 2209 | }); 2210 | v.push(LogEntry { 2211 | index: 5, 2212 | command: longcommand3.to_vec(), 2213 | term: 0, 2214 | client_serial_id: 0, 2215 | client_id: 0, 2216 | }); 2217 | durable.append(&mut v); 2218 | 2219 | let mut all = before_reverse; 2220 | all.append(&mut v); 2221 | for (i, entry) in all.iter().enumerate() { 2222 | assert_eq!(durable.log_at_index(1 + i as u64), *entry); 2223 | } 2224 | 2225 | drop(durable); 2226 | 2227 | // Start up and restore. Should now have 8 entries. 2228 | let mut durable = DurableState::new(&tmp.dir, 1, 1); 2229 | durable.restore(); 2230 | assert_eq!(durable.next_log_index, 8); 2231 | for (i, entry) in all.iter().enumerate() { 2232 | assert_eq!(durable.log_at_index(1 + i as u64), *entry); 2233 | } 2234 | 2235 | // Check in reverse as well. 2236 | for (i, entry) in all.iter().rev().enumerate() { 2237 | let real_index = all.len() - i; 2238 | assert_eq!(durable.log_at_index(real_index as u64), *entry); 2239 | } 2240 | } 2241 | 2242 | #[test] 2243 | fn test_rpc_message_encode_decode() { 2244 | let tests = vec![ 2245 | RPCMessage { 2246 | body: RPCBody::AppendEntriesRequest(AppendEntriesRequest { 2247 | request_id: 1948233, 2248 | term: 88, 2249 | leader_id: 2132, 2250 | prev_log_index: 1823, 2251 | prev_log_term: 193, 2252 | entries: vec![ 2253 | LogEntry { 2254 | index: 0, 2255 | term: 88, 2256 | command: "hey there".into(), 2257 | client_serial_id: 102, 2258 | client_id: 1, 2259 | }, 2260 | LogEntry { 2261 | index: 0, 2262 | term: 90, 2263 | command: "blub".into(), 2264 | client_serial_id: 19, 2265 | client_id: 1, 2266 | }, 2267 | ], 2268 | leader_commit: 95, 2269 | }), 2270 | from: 9999, 2271 | }, 2272 | RPCMessage { 2273 | body: RPCBody::AppendEntriesRequest(AppendEntriesRequest { 2274 | request_id: 9234742, 2275 | term: 91, 2276 | leader_id: 2132, 2277 | prev_log_index: 1823, 2278 | prev_log_term: 193, 2279 | entries: vec![], 2280 | leader_commit: 95, 2281 | }), 2282 | from: 9999, 2283 | }, 2284 | RPCMessage { 2285 | body: RPCBody::AppendEntriesResponse(AppendEntriesResponse { 2286 | request_id: 123813, 2287 | term: 10, 2288 | success: true, 2289 | match_index: 0, 2290 | }), 2291 | from: 9999, 2292 | }, 2293 | RPCMessage { 2294 | body: RPCBody::AppendEntriesResponse(AppendEntriesResponse { 2295 | request_id: 983911002, 2296 | term: 10, 2297 | success: false, 2298 | match_index: 1230984, 2299 | }), 2300 | from: 9999, 2301 | }, 2302 | RPCMessage { 2303 | body: RPCBody::RequestVoteRequest(RequestVoteRequest { 2304 | request_id: 1241, 2305 | term: 1023, 2306 | candidate_id: 2132, 2307 | last_log_index: 1823, 2308 | last_log_term: 193, 2309 | }), 2310 | from: 9999, 2311 | }, 2312 | RPCMessage { 2313 | body: RPCBody::RequestVoteResponse(RequestVoteResponse { 2314 | request_id: 1912390, 2315 | term: 1023, 2316 | vote_granted: true, 2317 | }), 2318 | from: 9999, 2319 | }, 2320 | RPCMessage { 2321 | body: RPCBody::RequestVoteResponse(RequestVoteResponse { 2322 | request_id: 12309814, 2323 | term: 1023, 2324 | vote_granted: false, 2325 | }), 2326 | 
from: 9999,
2327 |             },
2328 |         ];
2329 |
2330 |         for rpcmessage in tests.into_iter() {
2331 |             let mut file = Vec::new();
2332 |             let mut cursor = std::io::Cursor::new(&mut file);
2333 |             let bufwriter = BufWriter::new(&mut cursor);
2334 |             println!("Testing transformation of {:?}.", rpcmessage);
2335 |             rpcmessage.encode(rpcmessage.from, bufwriter);
2336 |
2337 |             drop(cursor);
2338 |
2339 |             let mut cursor = std::io::Cursor::new(&mut file);
2340 |             let bufreader = BufReader::new(&mut cursor);
2341 |             let result = RPCMessage::decode(bufreader);
2342 |             assert_eq!(result, Ok(rpcmessage));
2343 |         }
2344 |     }
2345 |
2346 |     pub fn test_server(
2347 |         tmp: &TmpDir,
2348 |         port: u16,
2349 |         servers: usize,
2350 |         debug: bool,
2351 |     ) -> (Server<TestStateMachine>, mpsc::Receiver<Vec<u8>>) {
2352 |         let mut cluster = vec![];
2353 |         for i in 0..servers {
2354 |             cluster.push(ServerConfig {
2355 |                 id: 1 + i as u128,
2356 |                 address: format!("127.0.0.1:{}", port + i as u16).parse().unwrap(),
2357 |             })
2358 |         }
2359 |         let config = Config {
2360 |             server_id: 1,
2361 |             server_index: 0,
2362 |             cluster,
2363 |
2364 |             election_frequency: Duration::from_secs(10),
2365 |
2366 |             random_seed: [0; 4],
2367 |             logger_debug: debug,
2368 |             page_cache_size: 100,
2369 |         };
2370 |
2371 |         let sm = TestStateMachine {};
2372 |         return Server::new(1, &tmp.dir, sm, config);
2373 |     }
2374 |
2375 |     #[test]
2376 |     fn test_rpc_manager() {
2377 |         let tmpdir = &TmpDir::new();
2378 |         let debug = false;
2379 |         let (server, _) = test_server(&tmpdir, 20010, 2, debug);
2380 |
2381 |         let server_id = server.config.cluster[0].id;
2382 |         let logger = Logger { server_id, debug };
2383 |         let mut rpcm = RPCManager::new(server_id, server.config.cluster.clone(), logger);
2384 |         rpcm.start();
2385 |
2386 |         let msg = RPCMessage {
2387 |             body: RPCBody::RequestVoteRequest(RequestVoteRequest {
2388 |                 request_id: 0,
2389 |                 term: 1023,
2390 |                 candidate_id: 2,
2391 |                 last_log_index: 1823,
2392 |                 last_log_term: 193,
2393 |             }),
2394 |             from: 1,
2395 |         };
2396 |         rpcm.send(0, Condition::Follower, server.config.server_id, &msg);
2397 |         let received = rpcm.stream_receiver.recv().unwrap();
2398 |         assert_eq!(msg, received);
2399 |
2400 |         let mut stop = rpcm.stop_mutex.lock().unwrap();
2401 |         *stop = true;
2402 |     }
2403 |
2404 |     pub struct TestStateMachine {}
2405 |
2406 |     impl StateMachine for TestStateMachine {
2407 |         fn apply(&self, messages: Vec<Vec<u8>>) -> Vec<Vec<u8>> {
2408 |             return messages;
2409 |         }
2410 |     }
2411 |
2412 |     #[test]
2413 |     fn test_single_server_apply_end_to_end() {
2414 |         let tmpdir = &TmpDir::new();
2415 |         let debug = false;
2416 |         let (mut server, result_receiver) = test_server(&tmpdir, 20002, 1, debug);
2417 |
2418 |         // First test that apply doesn't work when not a leader.
2419 |         let apply_result = server.apply(vec![vec![]], vec![1]);
2420 |         // Use try_recv() not recv() since recv() would block so the
2421 |         // test would hang if the logic were ever wrong. try_recv() is
2422 |         // correct since the message *must* at this point be available
2423 |         // to read.
2424 |         assert_eq!(apply_result, ApplyResult::NotALeader);
2425 |
2426 |         // Now after initializing (realizing there's only one server, so it is the leader), try again.
2427 |         server.init();
2428 |
2429 |         let apply_result = server.apply(vec!["abc".as_bytes().to_vec()], vec![1]);
2430 |         assert_eq!(apply_result, ApplyResult::Ok);
2431 |
2432 |         server.tick();
2433 |
2434 |         // See above note about try_recv() vs recv().
2435 | let result = result_receiver.try_recv().unwrap(); 2436 | assert_eq!(result, "abc".as_bytes().to_vec()); 2437 | } 2438 | 2439 | #[test] 2440 | fn test_handle_request_vote_request() { 2441 | let tmpdir = &TmpDir::new(); 2442 | let debug = false; 2443 | let (mut server, _) = test_server(&tmpdir, 20003, 1, debug); 2444 | server.init(); 2445 | 2446 | let msg = RequestVoteRequest { 2447 | request_id: 88, 2448 | term: 1, 2449 | candidate_id: 2, 2450 | last_log_index: 2, 2451 | last_log_term: 1, 2452 | }; 2453 | let response = server.handle_message(RPCMessage { 2454 | body: RPCBody::RequestVoteRequest(msg), 2455 | from: 9999, 2456 | }); 2457 | assert_eq!( 2458 | response, 2459 | Some(( 2460 | RPCMessage { 2461 | body: RPCBody::RequestVoteResponse(RequestVoteResponse { 2462 | request_id: 88, 2463 | term: 1, 2464 | vote_granted: true, 2465 | }), 2466 | from: server.config.server_id 2467 | }, 2468 | 9999, 2469 | )), 2470 | ); 2471 | } 2472 | 2473 | #[test] 2474 | fn test_handle_request_vote_response() { 2475 | let tmpdir = &TmpDir::new(); 2476 | let debug = false; 2477 | let (mut server, _) = test_server(&tmpdir, 20004, 1, debug); 2478 | server.init(); 2479 | 2480 | let msg = RequestVoteResponse { 2481 | request_id: 0, 2482 | term: 1, 2483 | vote_granted: false, 2484 | }; 2485 | let response = server.handle_message(RPCMessage { 2486 | body: RPCBody::RequestVoteResponse(msg), 2487 | from: server.config.server_id, 2488 | }); 2489 | assert_eq!(response, None); 2490 | } 2491 | 2492 | #[test] 2493 | fn test_handle_append_entries_request_all_new_data() { 2494 | let tmpdir = &TmpDir::new(); 2495 | let debug = false; 2496 | let (mut server, _) = test_server(&tmpdir, 20007, 1, debug); 2497 | server.init(); 2498 | 2499 | // Must be a follower to accept entries. 2500 | let mut state = server.state.lock().unwrap(); 2501 | state.transition(Condition::Follower, 0, 0); 2502 | drop(state); 2503 | 2504 | let e = LogEntry { 2505 | index: 0, 2506 | term: 0, 2507 | command: "hey there".as_bytes().to_vec(), 2508 | client_serial_id: 0, 2509 | client_id: 1, 2510 | }; 2511 | let msg = AppendEntriesRequest { 2512 | request_id: 90, 2513 | term: 0, 2514 | leader_id: 2132, 2515 | prev_log_index: 0, 2516 | prev_log_term: 0, 2517 | entries: vec![e.clone()], 2518 | leader_commit: 95, 2519 | }; 2520 | let response = server.handle_message(RPCMessage { 2521 | body: RPCBody::AppendEntriesRequest(msg), 2522 | from: 2132, 2523 | }); 2524 | assert_eq!( 2525 | response, 2526 | Some(( 2527 | RPCMessage { 2528 | body: RPCBody::AppendEntriesResponse(AppendEntriesResponse { 2529 | request_id: 90, 2530 | term: 0, 2531 | success: true, 2532 | match_index: 1, 2533 | }), 2534 | from: server.config.server_id, 2535 | }, 2536 | 2132, 2537 | )) 2538 | ); 2539 | 2540 | let mut state = server.state.lock().unwrap(); 2541 | assert_eq!(state.durable.log_at_index(1), e); 2542 | } 2543 | 2544 | #[test] 2545 | fn test_handle_append_entries_request_overwrite() { 2546 | let tmpdir = &TmpDir::new(); 2547 | let debug = false; 2548 | let (mut server, _) = test_server(&tmpdir, 20008, 1, debug); 2549 | server.init(); 2550 | 2551 | // Must be a follower to accept entries. 
2552 |         let mut state = server.state.lock().unwrap();
2553 |         state.transition(Condition::Follower, 0, 0);
2554 |         drop(state);
2555 |
2556 |         let mut entries = vec![LogEntry {
2557 |             index: 0,
2558 |             term: 0,
2559 |             command: "abc".as_bytes().to_vec(),
2560 |             client_serial_id: 0,
2561 |             client_id: 1,
2562 |         }];
2563 |
2564 |         let mut state = server.state.lock().unwrap();
2565 |         state.durable.append(&mut entries);
2566 |         assert_eq!(state.durable.log_at_index(1), entries[0]);
2567 |         drop(state);
2568 |
2569 |         let e = LogEntry {
2570 |             index: 0,
2571 |             // A term that differs from the entry already stored at index
2572 |             // `1` should cause the existing entry to be overwritten.
2573 |             term: 1,
2574 |             command: "hey there".as_bytes().to_vec(),
2575 |             client_serial_id: 0,
2576 |             client_id: 1,
2577 |         };
2578 |         let msg = AppendEntriesRequest {
2579 |             request_id: 100,
2580 |             term: 0,
2581 |             leader_id: 2132,
2582 |             prev_log_index: 0,
2583 |             prev_log_term: 0,
2584 |             entries: vec![e.clone()],
2585 |             leader_commit: 95,
2586 |         };
2587 |         let response = server.handle_message(RPCMessage {
2588 |             body: RPCBody::AppendEntriesRequest(msg),
2589 |             from: 2132,
2590 |         });
2591 |         assert_eq!(
2592 |             response,
2593 |             Some((
2594 |                 RPCMessage {
2595 |                     body: RPCBody::AppendEntriesResponse(AppendEntriesResponse {
2596 |                         request_id: 100,
2597 |                         term: 0,
2598 |                         success: true,
2599 |                         match_index: 1,
2600 |                     }),
2601 |                     from: server.config.server_id,
2602 |                 },
2603 |                 2132,
2604 |             ))
2605 |         );
2606 |
2607 |         let mut state = server.state.lock().unwrap();
2608 |         assert_eq!(state.durable.log_at_index(1), e);
2609 |     }
2610 |
2611 |     #[test]
2612 |     fn test_handle_append_entries_request() {
2613 |         let tmpdir = &TmpDir::new();
2614 |         let debug = false;
2615 |         let (mut server, _) = test_server(&tmpdir, 20005, 1, debug);
2616 |         server.init();
2617 |
2618 |         let msg = AppendEntriesRequest {
2619 |             request_id: 0,
2620 |             term: 91,
2621 |             leader_id: 2132,
2622 |             prev_log_index: 1823,
2623 |             prev_log_term: 193,
2624 |             entries: vec![],
2625 |             leader_commit: 95,
2626 |         };
2627 |         let response = server.handle_message(RPCMessage {
2628 |             body: RPCBody::AppendEntriesRequest(msg),
2629 |             from: 2132,
2630 |         });
2631 |         assert_eq!(
2632 |             response,
2633 |             Some((
2634 |                 RPCMessage {
2635 |                     body: RPCBody::AppendEntriesResponse(AppendEntriesResponse {
2636 |                         request_id: 0,
2637 |                         term: 91,
2638 |                         success: false,
2639 |                         match_index: 0,
2640 |                     }),
2641 |                     from: server.config.server_id,
2642 |                 },
2643 |                 2132
2644 |             ))
2645 |         );
2646 |     }
2647 |
2648 |     #[test]
2649 |     fn test_handle_append_entries_response() {
2650 |         let tmpdir = &TmpDir::new();
2651 |         let debug = false;
2652 |         let (mut server, _) = test_server(&tmpdir, 20006, 2, debug);
2653 |         server.init();
2654 |
2655 |         let mut state = server.state.lock().unwrap();
2656 |         // Multiple servers in the cluster so all start as follower.
2657 |         assert_eq!(state.volatile.condition, Condition::Follower);
2658 |         // Set to leader.
2659 |         state.volatile.condition = Condition::Leader;
2660 |         assert_eq!(state.volatile.condition, Condition::Leader);
2661 |         assert_eq!(state.durable.current_term, 0);
2662 |         drop(state);
2663 |
2664 |         let msg = AppendEntriesResponse {
2665 |             request_id: 0,
2666 |             // Newer term than the server's, so the server will become
2667 |             // a follower and not process the request.
2668 |             term: 1,
2669 |             success: true,
2670 |             match_index: 12,
2671 |         };
2672 |         let response = server.handle_message(RPCMessage {
2673 |             body: RPCBody::AppendEntriesResponse(msg.clone()),
2674 |             from: 2,
2675 |         });
2676 |         assert_eq!(response, None);
2677 |
2678 |         // Term has been updated, server is now follower.
2679 |         let mut state = server.state.lock().unwrap();
2680 |         assert_eq!(state.volatile.condition, Condition::Follower);
2681 |         assert_eq!(state.durable.current_term, 1);
2682 |
2683 |         // Reset state to leader for next tests.
2684 |         state.volatile.condition = Condition::Leader;
2685 |         drop(state);
2686 |
2687 |         // This time the existing `term: 1` is for the same term so no
2688 |         // transition to follower.
2689 |         let response = server.handle_message(RPCMessage {
2690 |             body: RPCBody::AppendEntriesResponse(msg.clone()),
2691 |             from: 2,
2692 |         });
2693 |         assert_eq!(response, None);
2694 |
2695 |         // Since the response was marked as successful, the
2696 |         // `match_index` and `next_index` for this server should have
2697 |         // been updated according to `msg.match_index`.
2698 |         let state = server.state.lock().unwrap();
2699 |         assert_eq!(state.volatile.next_index[1], msg.match_index + 1);
2700 |         assert_eq!(state.volatile.match_index[1], msg.match_index);
2701 |         drop(state);
2702 |
2703 |         // Let's do another check for `match_index` if the response is
2704 |         // marked as not successful.
2705 |         let msg = AppendEntriesResponse {
2706 |             request_id: 0,
2707 |             // Same term as the server this time, so the response is
2708 |             // processed rather than triggering a step-down.
2709 |             term: 1,
2710 |             success: false,
2711 |             match_index: 12,
2712 |         };
2713 |         let response = server.handle_message(RPCMessage {
2714 |             body: RPCBody::AppendEntriesResponse(msg.clone()),
2715 |             from: 2,
2716 |         });
2717 |         assert_eq!(response, None);
2718 |
2719 |         let state = server.state.lock().unwrap();
2720 |         assert_eq!(state.volatile.next_index[1], msg.match_index);
2721 |     }
2722 | }
2723 |
2724 | // CRC32C is a port of (what seems to be) FreeBSD's public-domain
2725 | // implementation:
2726 | // https://web.mit.edu/freebsd/head/sys/libkern/crc32.c.
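//
// For example (expected values taken from the tests below):
//
//     assert_eq!(crc32c(b""), 0);
//     assert_eq!(crc32c("What a great little message.".as_bytes()), 0x165AD1D7);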
2727 | 2728 | struct CRC32C { 2729 | result: u32, 2730 | } 2731 | 2732 | const CRC32C_TABLE: [u32; 256] = [ 2733 | 0x00000000, 0xF26B8303, 0xE13B70F7, 0x1350F3F4, 0xC79A971F, 0x35F1141C, 0x26A1E7E8, 0xD4CA64EB, 2734 | 0x8AD958CF, 0x78B2DBCC, 0x6BE22838, 0x9989AB3B, 0x4D43CFD0, 0xBF284CD3, 0xAC78BF27, 0x5E133C24, 2735 | 0x105EC76F, 0xE235446C, 0xF165B798, 0x030E349B, 0xD7C45070, 0x25AFD373, 0x36FF2087, 0xC494A384, 2736 | 0x9A879FA0, 0x68EC1CA3, 0x7BBCEF57, 0x89D76C54, 0x5D1D08BF, 0xAF768BBC, 0xBC267848, 0x4E4DFB4B, 2737 | 0x20BD8EDE, 0xD2D60DDD, 0xC186FE29, 0x33ED7D2A, 0xE72719C1, 0x154C9AC2, 0x061C6936, 0xF477EA35, 2738 | 0xAA64D611, 0x580F5512, 0x4B5FA6E6, 0xB93425E5, 0x6DFE410E, 0x9F95C20D, 0x8CC531F9, 0x7EAEB2FA, 2739 | 0x30E349B1, 0xC288CAB2, 0xD1D83946, 0x23B3BA45, 0xF779DEAE, 0x05125DAD, 0x1642AE59, 0xE4292D5A, 2740 | 0xBA3A117E, 0x4851927D, 0x5B016189, 0xA96AE28A, 0x7DA08661, 0x8FCB0562, 0x9C9BF696, 0x6EF07595, 2741 | 0x417B1DBC, 0xB3109EBF, 0xA0406D4B, 0x522BEE48, 0x86E18AA3, 0x748A09A0, 0x67DAFA54, 0x95B17957, 2742 | 0xCBA24573, 0x39C9C670, 0x2A993584, 0xD8F2B687, 0x0C38D26C, 0xFE53516F, 0xED03A29B, 0x1F682198, 2743 | 0x5125DAD3, 0xA34E59D0, 0xB01EAA24, 0x42752927, 0x96BF4DCC, 0x64D4CECF, 0x77843D3B, 0x85EFBE38, 2744 | 0xDBFC821C, 0x2997011F, 0x3AC7F2EB, 0xC8AC71E8, 0x1C661503, 0xEE0D9600, 0xFD5D65F4, 0x0F36E6F7, 2745 | 0x61C69362, 0x93AD1061, 0x80FDE395, 0x72966096, 0xA65C047D, 0x5437877E, 0x4767748A, 0xB50CF789, 2746 | 0xEB1FCBAD, 0x197448AE, 0x0A24BB5A, 0xF84F3859, 0x2C855CB2, 0xDEEEDFB1, 0xCDBE2C45, 0x3FD5AF46, 2747 | 0x7198540D, 0x83F3D70E, 0x90A324FA, 0x62C8A7F9, 0xB602C312, 0x44694011, 0x5739B3E5, 0xA55230E6, 2748 | 0xFB410CC2, 0x092A8FC1, 0x1A7A7C35, 0xE811FF36, 0x3CDB9BDD, 0xCEB018DE, 0xDDE0EB2A, 0x2F8B6829, 2749 | 0x82F63B78, 0x709DB87B, 0x63CD4B8F, 0x91A6C88C, 0x456CAC67, 0xB7072F64, 0xA457DC90, 0x563C5F93, 2750 | 0x082F63B7, 0xFA44E0B4, 0xE9141340, 0x1B7F9043, 0xCFB5F4A8, 0x3DDE77AB, 0x2E8E845F, 0xDCE5075C, 2751 | 0x92A8FC17, 0x60C37F14, 0x73938CE0, 0x81F80FE3, 0x55326B08, 0xA759E80B, 0xB4091BFF, 0x466298FC, 2752 | 0x1871A4D8, 0xEA1A27DB, 0xF94AD42F, 0x0B21572C, 0xDFEB33C7, 0x2D80B0C4, 0x3ED04330, 0xCCBBC033, 2753 | 0xA24BB5A6, 0x502036A5, 0x4370C551, 0xB11B4652, 0x65D122B9, 0x97BAA1BA, 0x84EA524E, 0x7681D14D, 2754 | 0x2892ED69, 0xDAF96E6A, 0xC9A99D9E, 0x3BC21E9D, 0xEF087A76, 0x1D63F975, 0x0E330A81, 0xFC588982, 2755 | 0xB21572C9, 0x407EF1CA, 0x532E023E, 0xA145813D, 0x758FE5D6, 0x87E466D5, 0x94B49521, 0x66DF1622, 2756 | 0x38CC2A06, 0xCAA7A905, 0xD9F75AF1, 0x2B9CD9F2, 0xFF56BD19, 0x0D3D3E1A, 0x1E6DCDEE, 0xEC064EED, 2757 | 0xC38D26C4, 0x31E6A5C7, 0x22B65633, 0xD0DDD530, 0x0417B1DB, 0xF67C32D8, 0xE52CC12C, 0x1747422F, 2758 | 0x49547E0B, 0xBB3FFD08, 0xA86F0EFC, 0x5A048DFF, 0x8ECEE914, 0x7CA56A17, 0x6FF599E3, 0x9D9E1AE0, 2759 | 0xD3D3E1AB, 0x21B862A8, 0x32E8915C, 0xC083125F, 0x144976B4, 0xE622F5B7, 0xF5720643, 0x07198540, 2760 | 0x590AB964, 0xAB613A67, 0xB831C993, 0x4A5A4A90, 0x9E902E7B, 0x6CFBAD78, 0x7FAB5E8C, 0x8DC0DD8F, 2761 | 0xE330A81A, 0x115B2B19, 0x020BD8ED, 0xF0605BEE, 0x24AA3F05, 0xD6C1BC06, 0xC5914FF2, 0x37FACCF1, 2762 | 0x69E9F0D5, 0x9B8273D6, 0x88D28022, 0x7AB90321, 0xAE7367CA, 0x5C18E4C9, 0x4F48173D, 0xBD23943E, 2763 | 0xF36E6F75, 0x0105EC76, 0x12551F82, 0xE03E9C81, 0x34F4F86A, 0xC69F7B69, 0xD5CF889D, 0x27A40B9E, 2764 | 0x79B737BA, 0x8BDCB4B9, 0x988C474D, 0x6AE7C44E, 0xBE2DA0A5, 0x4C4623A6, 0x5F16D052, 0xAD7D5351, 2765 | ]; 2766 | 2767 | impl CRC32C { 2768 | fn new() -> CRC32C { 2769 | CRC32C { result: !0 } 2770 | } 2771 | 2772 | fn update(&mut self, input: 
&[u8]) { 2773 | for &byte in input.iter() { 2774 | self.result = 2775 | CRC32C_TABLE[((self.result ^ byte as u32) & 0xFF) as usize] ^ (self.result >> 8); 2776 | } 2777 | } 2778 | 2779 | fn sum(&self) -> u32 { 2780 | self.result ^ !0 2781 | } 2782 | } 2783 | 2784 | fn crc32c(input: &[u8]) -> u32 { 2785 | let mut c = CRC32C::new(); 2786 | c.update(input); 2787 | c.sum() 2788 | } 2789 | 2790 | #[cfg(test)] 2791 | mod crc32c_tests { 2792 | use super::*; 2793 | 2794 | #[test] 2795 | fn test_crc32c() { 2796 | let input = vec![ 2797 | ("", 0), 2798 | ( 2799 | "sadkjflksadfjsdklfjsdlkfjasdflaksdjfalskdfjasldkfjasdlfasdf", 2800 | 0xDE647747, 2801 | ), 2802 | ("What a great little message.", 0x165AD1D7), 2803 | ("f;lkjasdf;lkasdfasd", 0x4EA35847), 2804 | ]; 2805 | for (input, output) in input.iter() { 2806 | assert_eq!(crc32c(input.as_bytes()), *output); 2807 | 2808 | // Test streaming (*multiple* calls to update()) too. 2809 | let mut c = CRC32C::new(); 2810 | for &byte in input.as_bytes().iter() { 2811 | c.update(&[byte]); 2812 | } 2813 | assert_eq!(c.sum(), *output); 2814 | } 2815 | } 2816 | } 2817 | 2818 | struct Random { 2819 | state: [u64; 4], 2820 | } 2821 | 2822 | impl Random { 2823 | fn new(seed: [u64; 4]) -> Random { 2824 | Random { state: seed } 2825 | } 2826 | 2827 | #[allow(dead_code)] 2828 | fn seed() -> [u64; 4] { 2829 | let os = std::env::consts::OS; 2830 | assert!(os == "linux" || os == "macos"); 2831 | 2832 | let mut file = std::fs::File::options() 2833 | .read(true) 2834 | .open("/dev/urandom") 2835 | .unwrap(); 2836 | let mut seed = [0; 4]; 2837 | let mut bytes = vec![0; 8 * seed.len()]; 2838 | file.read_exact(bytes.as_mut_slice()).unwrap(); 2839 | for i in 0..seed.len() { 2840 | seed[i] = u64::from_le_bytes(bytes[i * 8..(i + 1) * 8].try_into().unwrap()); 2841 | } 2842 | seed 2843 | } 2844 | 2845 | // Port of https://prng.di.unimi.it/xoshiro256plusplus.c. 2846 | fn next(&mut self) -> u64 { 2847 | let result: u64 = self.state[0].wrapping_add(self.state[3]); 2848 | 2849 | let t: u64 = self.state[1] << 17; 2850 | 2851 | self.state[2] ^= self.state[0]; 2852 | self.state[3] ^= self.state[1]; 2853 | self.state[1] ^= self.state[2]; 2854 | self.state[0] ^= self.state[3]; 2855 | 2856 | self.state[2] ^= t; 2857 | 2858 | self.state[3] = self.state[3].rotate_left(45); 2859 | 2860 | result 2861 | } 2862 | 2863 | fn generate_u64(&mut self) -> u64 { 2864 | self.next() 2865 | } 2866 | 2867 | fn generate_bool(&mut self) -> bool { 2868 | self.generate_f64() < 0.0 2869 | } 2870 | 2871 | fn generate_f64(&mut self) -> f64 { 2872 | let i = self.next(); 2873 | 2874 | // Reinterpret integer bytes as f64 bytes. 2875 | f64::from_le_bytes(i.to_le_bytes()) 2876 | } 2877 | 2878 | fn generate_u32(&mut self) -> u32 { 2879 | let i = self.next(); 2880 | 2881 | // Reinterpret integer bytes as u32 bytes. 
2882 |         u32::from_le_bytes(i.to_le_bytes()[0..4].try_into().unwrap())
2883 |     }
2884 |
2885 |     fn generate_percent(&mut self) -> f32 {
2886 |         let u = self.generate_u32();
2887 |         (u as f64 / u32::MAX as f64) as f32
2888 |     }
2889 |
2890 |     #[allow(dead_code)]
2891 |     fn generate_seed(&mut self) -> [u64; 4] {
2892 |         [self.next(), self.next(), self.next(), self.next()]
2893 |     }
2894 | }
2895 |
2896 | #[cfg(test)]
2897 | mod random_tests {
2898 |     use super::*;
2899 |
2900 |     #[test]
2901 |     fn test_random() {
2902 |         let mut r = Random::new([0; 4]);
2903 |         let _ = r.generate_f64();
2904 |         let _ = r.generate_bool();
2905 |         let _ = r.generate_u32();
2906 |
2907 |         let p = r.generate_percent();
2908 |         assert!(p >= 0.0);
2909 |         assert!(p <= 1.0);
2910 |     }
2911 | }
2912 |
2913 | #[cfg(test)]
2914 | mod e2e_tests {
2915 |     use super::*;
2916 |
2917 |     fn assert_leader_election_duration_state(
2918 |         server: &Server<server_tests::TestStateMachine>,
2919 |         leader_elected: &mut u128,
2920 |         post_election_ticks: &mut usize,
2921 |     ) {
2922 |         let state = server.state.lock().unwrap();
2923 |         // If it's a leader, it should be the same leader as before.
2924 |         if state.volatile.condition == Condition::Leader {
2925 |             if *leader_elected == 0 {
2926 |                 *leader_elected = server.config.server_id;
2927 |             } else {
2928 |                 // Once one is elected it shouldn't change.
2929 |                 assert_eq!(*leader_elected, server.config.server_id);
2930 |                 *post_election_ticks -= 1;
2931 |             }
2932 |         }
2933 |
2934 |         // And the other way around. If it was a leader, it should still be a leader.
2935 |         if *leader_elected == server.config.server_id {
2936 |             assert_eq!(state.volatile.condition, Condition::Leader);
2937 |         }
2938 |     }
2939 |
2940 |     fn assert_leader_election_final_state(
2941 |         servers: &Vec<Server<server_tests::TestStateMachine>>,
2942 |         leader_elected: u128,
2943 |     ) {
2944 |         // A leader should have been elected.
2945 |         assert_ne!(leader_elected, 0);
2946 |
2947 |         for server in servers.iter() {
2948 |             let state = server.state.lock().unwrap();
2949 |             if server.config.server_id == leader_elected {
2950 |                 // Leader should be a leader.
2951 |                 assert_eq!(state.volatile.condition, Condition::Leader);
2952 |             } else {
2953 |                 // All other servers should not be a leader.
2954 |                 assert_ne!(state.volatile.condition, Condition::Leader);
2955 |             }
2956 |         }
2957 |     }
2958 |
2959 |     fn get_seed() -> [u64; 4] {
2960 |         let mut seed = Random::seed();
2961 |         let seed_to_string = |s: [u64; 4]| -> String {
2962 |             let mut string = String::new();
2963 |             for chunk in s.iter() {
2964 |                 for byte in chunk.to_le_bytes().iter() {
2965 |                     string = format!("{}{:02X?}", string, byte);
2966 |                 }
2967 |             }
2968 |             assert_eq!(string.len(), 8 * 4 * 2);
2969 |             return string;
2970 |         };
2971 |         if let Ok(s) = std::env::var("RAFT_SEED") {
2972 |             assert_eq!(s.len(), 8 * 4 * 2);
2973 |             let mut i = 0;
2974 |             while i < s.len() {
2975 |                 let mut bytes = [0; 8];
2976 |                 let mut j = 0;
2977 |                 while j < bytes.len() * 2 {
2978 |                     bytes[j / 2] = u8::from_str_radix(&s[i + j..i + j + 2], 16).unwrap();
2979 |                     j += 2;
2980 |                 }
2981 |
2982 |                 seed[i / 16] = u64::from_le_bytes(bytes);
2983 |                 i += 16;
2984 |             }
2985 |
2986 |             assert_eq!(seed_to_string(seed), s);
2987 |         }
2988 |
2989 |         println!("RAFT_SEED={}", seed_to_string(seed));
2990 |
2991 |         seed
2992 |     }
2993 |
2994 |     fn test_cluster(
2995 |         tmpdir: &server_tests::TmpDir,
2996 |         port: u16,
2997 |         debug: bool,
2998 |     ) -> (
2999 |         Vec<Server<server_tests::TestStateMachine>>,
3000 |         Vec<mpsc::Receiver<Vec<u8>>>,
3001 |         Duration,
3002 |     ) {
3003 |         let random_seed = get_seed();
3004 |         let tick_freq = Duration::from_millis(1);
3005 |
3006 |         let mut cluster = vec![];
3007 |         const SERVERS: u8 = 3;
3008 |         for i in 0..SERVERS {
3009 |             cluster.push(ServerConfig {
3010 |                 id: 1 + i as u128,
3011 |                 address: format!("127.0.0.1:{}", port + i as u16).parse().unwrap(),
3012 |             })
3013 |         }
3014 |
3015 |         let page_cache_size = match std::env::var("PAGECACHE") {
3016 |             Ok(var) => match var.parse::<usize>() {
3017 |                 Ok(size) => size,
3018 |                 _ => 1000,
3019 |             },
3020 |             _ => 1000,
3021 |         };
3022 |
3023 |         let mut servers = vec![];
3024 |         let mut results_receivers = vec![];
3025 |         let mut per_server_random_seed_generator = Random::new(random_seed);
3026 |         for i in 0..SERVERS {
3027 |             let config = Config {
3028 |                 server_id: 1 + i as u128,
3029 |                 server_index: i as usize,
3030 |                 cluster: cluster.clone(),
3031 |
3032 |                 election_frequency: 500 * tick_freq,
3033 |
3034 |                 random_seed: per_server_random_seed_generator.generate_seed(),
3035 |
3036 |                 logger_debug: debug,
3037 |                 page_cache_size,
3038 |             };
3039 |
3040 |             let sm = server_tests::TestStateMachine {};
3041 |             let (server, results_receiver) = Server::new(1, &tmpdir.dir, sm, config);
3042 |             servers.push(server);
3043 |             results_receivers.push(results_receiver);
3044 |             servers[i as usize].init();
3045 |         }
3046 |
3047 |         return (servers, results_receivers, tick_freq);
3048 |     }
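    // The e2e tests are deterministic given a seed, and each run prints
    // the seed it used. A failing run can be replayed by exporting that
    // seed before re-running the tests, e.g.:
    //
    //     RAFT_SEED=<the 64 hex characters printed by the failing run> cargo test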
3049 |
3050 |     fn assert_leader_converge(
3051 |         servers: &mut Vec<Server<server_tests::TestStateMachine>>,
3052 |         tick_freq: Duration,
3053 |         skip_id: u128,
3054 |     ) -> u128 {
3055 |         let mut post_election_ticks = 50;
3056 |         let mut leader_elected = 0;
3057 |         let mut max_ticks = 500;
3058 |         while leader_elected == 0 || post_election_ticks > 0 {
3059 |             max_ticks -= 1;
3060 |             if max_ticks == 0 {
3061 |                 panic!("Ran too long without leader election. Something is wrong.");
3062 |             }
3063 |
3064 |             for server in servers.iter_mut() {
3065 |                 if server.config.server_id == skip_id {
3066 |                     continue;
3067 |                 }
3068 |
3069 |                 server.tick();
3070 |
3071 |                 assert_leader_election_duration_state(
3072 |                     &server,
3073 |                     &mut leader_elected,
3074 |                     &mut post_election_ticks,
3075 |                 );
3076 |             }
3077 |             assert!(tick_freq > Duration::from_millis(0));
3078 |             std::thread::sleep(tick_freq);
3079 |         }
3080 |
3081 |         assert_leader_election_final_state(&servers, leader_elected);
3082 |         return leader_elected;
3083 |     }
3084 |
3085 |     #[test]
3086 |     fn test_converge_leader_no_entries() {
3087 |         let tmpdir = server_tests::TmpDir::new();
3088 |         let debug = false;
3089 |         let (mut servers, _, tick_freq) = test_cluster(&tmpdir, 20030, debug);
3090 |
3091 |         let old_leader = assert_leader_converge(&mut servers, tick_freq, 0);
3092 |
3093 |         println!("\n\n----- EPOCH -----\n\n");
3094 |
3095 |         // Now what happens if the old leader stops doing anything?
3096 |         let mut leader_elected = 0;
3097 |         let mut post_election_ticks = 20;
3098 |         while leader_elected == 0 {
3099 |             for server in servers.iter_mut() {
3100 |                 if server.config.server_id == old_leader {
3101 |                     let mut state = server.state.lock().unwrap();
3102 |                     if state.volatile.condition == Condition::Leader {
3103 |                         state.transition(Condition::Follower, 0, 0);
3104 |                     }
3105 |                     continue;
3106 |                 }
3107 |
3108 |                 server.tick();
3109 |
3110 |                 assert_leader_election_duration_state(
3111 |                     &server,
3112 |                     &mut leader_elected,
3113 |                     &mut post_election_ticks,
3114 |                 );
3115 |             }
3116 |
3117 |             std::thread::sleep(tick_freq);
3118 |         }
3119 |
3120 |         assert_leader_election_final_state(&servers, leader_elected);
3121 |
3122 |         println!("\n\n----- EPOCH -----\n\n");
3123 |
3124 |         // And if all are back up do we converge again?
3125 |
3126 |         _ = assert_leader_converge(&mut servers, tick_freq, 0);
3127 |     }
3128 |
3129 |     fn wait_for_all_applied(
3130 |         servers: &mut Vec<Server<server_tests::TestStateMachine>>,
3131 |         tick_freq: Duration,
3132 |         skip_id: u128,
3133 |         waiting_for: &Vec<u8>,
3134 |     ) {
3135 |         let mut applied = vec![false; servers.len()];
3136 |         let mut applied_at = vec![0; servers.len()];
3137 |         for _ in 0..100 {
3138 |             for (i, server) in servers.iter_mut().enumerate() {
3139 |                 if server.config.server_id == skip_id {
3140 |                     continue;
3141 |                 }
3142 |
3143 |                 server.tick();
3144 |
3145 |                 let mut state = server.state.lock().unwrap();
3146 |                 let mut checked = state.volatile.commit_index + 1;
3147 |                 while checked > 0 {
3148 |                     checked -= 1;
3149 |                     println!("Checking index: {checked}.");
3150 |
3151 |                     let log = state.durable.log_at_index(checked);
3152 |                     let exists_in_log = log.command == *waiting_for;
3153 |                     if exists_in_log {
3154 |                         println!("Exists for {i} in log at entry: {checked}.");
3155 |                         // It should not exist twice in a different location.
3156 |                         if applied[i] {
3157 |                             assert_eq!(applied_at[i], checked);
3158 |                         }
3159 |                         applied[i] = true;
3160 |                         applied_at[i] = checked;
3161 |                     } else {
3162 |                         // There shouldn't be any other non-empty data.
3163 |                         assert_eq!(log.command.len(), 0);
3164 |                     }
3165 |                 }
3166 |             }
3167 |
3168 |             // End the check as soon as we can so tests don't take
3169 |             // unnecessarily long.
3170 |             let mut all_applied = true;
3171 |             for (i, server) in servers.iter().enumerate() {
3172 |                 if server.config.server_id == skip_id {
3173 |                     continue;
3174 |                 }
3175 |
3176 |                 if !applied[i] {
3177 |                     all_applied = false;
3178 |                     break;
3179 |                 }
3180 |             }
3181 |             if all_applied {
3182 |                 break;
3183 |             }
3184 |
3185 |             std::thread::sleep(tick_freq);
3186 |         }
3187 |
3188 |         for i in 0..applied.len() {
3189 |             if servers[i].config.server_id == skip_id {
3190 |                 continue;
3191 |             }
3192 |
3193 |             assert!(applied[i]);
3194 |         }
3195 |     }
3196 |
3197 |     fn test_apply_skip_id(skip_id: u128, port: u16) {
3198 |         let tmpdir = server_tests::TmpDir::new();
3199 |         let debug = std::env::var("RAFT_DEBUG").unwrap_or("".into()) == "true";
3200 |         let (mut servers, results_receivers, tick_freq) = test_cluster(&tmpdir, port, debug);
3201 |
3202 |         for server in servers.iter_mut() {
3203 |             if server.config.server_id == skip_id {
3204 |                 server.stop();
3205 |             }
3206 |         }
3207 |
3208 |         let leader_id = assert_leader_converge(&mut servers, tick_freq, skip_id);
3209 |         assert_ne!(leader_id, skip_id);
3210 |
3211 |         let msg = "abc".as_bytes().to_vec();
3212 |         let cmds = vec![msg.clone()];
3213 |         let cmd_ids = vec![1];
3214 |         let (apply_result, result_receiver) = 'apply_result: {
3215 |             for (i, server) in servers.iter_mut().enumerate() {
3216 |                 if server.config.server_id == leader_id {
3217 |                     break 'apply_result (server.apply(cmds, cmd_ids), &results_receivers[i]);
3218 |                 }
3219 |             }
3220 |
3221 |             unreachable!("Invalid leader.");
3222 |         };
3223 |
3224 |         // Message should be applied in cluster within 20 ticks.
3225 |         for _ in 0..20 {
3226 |             if let ApplyResult::Ok = apply_result {
3227 |                 if let Ok(received) = result_receiver.try_recv() {
3228 |                     assert_eq!(received, msg.clone());
3229 |                     break;
3230 |                 }
3231 |             } else {
3232 |                 panic!("Expected ok result.");
3233 |             }
3234 |
3235 |             for server in servers.iter_mut() {
3236 |                 if server.config.server_id == skip_id {
3237 |                     continue;
3238 |                 }
3239 |
3240 |                 server.tick();
3241 |             }
3242 |
3243 |             std::thread::sleep(tick_freq);
3244 |         }
3245 |
3246 |         // Within another 100 ticks, all servers (except the skipped
3247 |         // one) should have applied the same message. (Remember: only
3248 |         // 2/3 are required for committing, so the skipped-server case
3249 |         // would only be meaningful with a 5-server cluster, where a
3250 |         // quorum is 3.)
3251 |         wait_for_all_applied(&mut servers, tick_freq, skip_id, &msg);
3252 |
3253 |         println!("\n\nBringing skipped server back.\n\n");
3254 |
3255 |         drop(servers);
3256 |         let (mut servers, receivers, tick_freq) = test_cluster(&tmpdir, port, debug);
3257 |
3258 |         _ = assert_leader_converge(&mut servers, tick_freq, skip_id);
3259 |
3260 |         // And within another 100 ticks where we DON'T SKIP skip_id,
3261 |         // ALL servers in the cluster should have the message.
3262 |         // That is, a downed server should catch up on messages
3263 |         // it missed when it does come back up.
3264 |         let skip_id = 0; // 0 is so that none are skipped since 0 isn't a valid id.
3265 |         wait_for_all_applied(&mut servers, tick_freq, skip_id, &msg);
3266 |
3267 |         // Explicitly keep the result receivers around so sends from the cluster don't panic.
3268 |         drop(receivers);
3269 |     }
3270 |
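    // A client-side pattern worth noting, used above and again in
    // test_bulk below: since a non-leader rejects apply() with
    // ApplyResult::NotALeader, a client simply retries the same command
    // batch against each server until one accepts it as the leader.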
3271 |     #[test]
3272 |     fn test_apply_none_down() {
3273 |         let port = 20033;
3274 |         // Skipping server 0 does nothing since 0 is not a valid
3275 |         // server id.
3276 |         let skip_id = 0;
3277 |         test_apply_skip_id(skip_id, port);
3278 |     }
3279 |
3280 |     #[test]
3281 |     fn test_apply_one_down() {
3282 |         let port = 20036;
3283 |         // Skipping `1` checks to make sure application
3284 |         // still happens even with 2/3 servers up.
3285 |         let skip_id = 1;
3286 |         test_apply_skip_id(skip_id, port);
3287 |     }
3288 |
3289 |     #[test]
3290 |     fn test_bulk() {
3291 |         let port = 20039;
3292 |         let tmpdir = server_tests::TmpDir::new();
3293 |         let debug = false;
3294 |         let (mut servers, mut result_receivers, tick_freq) = test_cluster(&tmpdir, port, debug);
3295 |
3296 |         let mut input_senders = vec![];
3297 |         let mut output_receivers = vec![];
3298 |
3299 |         const BATCHES: usize = 10;
3300 |         const BATCH_SIZE: usize = 10;
3301 |         const INNER_BATCH: usize = 10;
3302 |
3303 |         while servers.len() > 0 {
3304 |             let (input_sender, input_receiver): (
3305 |                 mpsc::Sender<(Vec<Vec<u8>>, Vec<u128>)>,
3306 |                 mpsc::Receiver<(Vec<Vec<u8>>, Vec<u128>)>,
3307 |             ) = mpsc::channel();
3308 |             input_senders.push(input_sender);
3309 |
3310 |             let (output_sender, output_receiver): (
3311 |                 mpsc::Sender<ApplyResult>,
3312 |                 mpsc::Receiver<ApplyResult>,
3313 |             ) = mpsc::channel();
3314 |             output_receivers.push(output_receiver);
3315 |
3316 |             let mut server = servers.pop().unwrap();
3317 |
3318 |             std::thread::spawn(move || {
3319 |                 loop {
3320 |                     for (msgs, ids) in input_receiver.try_iter() {
3321 |                         // println!("Server received message: {:?}, {:?}.", msgs, ids);
3322 |                         // Gracefully shut down when we receive an
3323 |                         // empty message, so that we can gracefully
3324 |                         // shut down when we're done in this test.
3325 |                         if msgs.len() == 0 {
3326 |                             println!("Shutting server down.");
3327 |                             server.stop();
3328 |                             drop(server);
3329 |                             return;
3330 |                         }
3331 |
3332 |                         let result = server.apply(msgs, ids);
3333 |                         output_sender.send(result).unwrap();
3334 |                     }
3335 |
3336 |                     server.tick();
3337 |
3338 |                     std::thread::sleep(tick_freq);
3339 |                 }
3340 |             });
3341 |         }
3342 |
3343 |         println!("Started servers.");
3344 |
3345 |         // Preallocate all batches before inserting into the cluster
3346 |         // so that we don't measure allocation time.
3347 |         let mut batches = vec![];
3348 |         let mut msg: u64 = 0;
3349 |         for _ in 0..BATCHES {
3350 |             let mut batch = vec![vec![]; BATCH_SIZE];
3351 |             for j in 0..BATCH_SIZE {
3352 |                 batch[j] = vec![];
3353 |                 for _ in 0..INNER_BATCH {
3354 |                     batch[j].extend(msg.to_le_bytes().to_vec());
3355 |                     msg += 1;
3356 |                 }
3357 |             }
3358 |             batches.push(batch);
3359 |         }
3360 |
3361 |         println!("Created batches.");
3362 |
3363 |         // Insert batches.
3364 |         let t1 = Instant::now();
3365 |         let mut client_serial_id: u128 = 1;
3366 |         let mut ids = vec![0; BATCH_SIZE];
3367 |         for batch in batches.iter() {
3368 |             for i in 0..ids.len() {
3369 |                 ids[i] = client_serial_id + i as u128;
3370 |             }
3371 |             // Need to keep submitting each individual batch until it
3372 |             // is handled by someone who is a leader.
3373 |             'batch: loop {
3374 |                 for input_sender in input_senders.iter() {
3375 |                     // TODO: Could we restructure this so the clone isn't needed?
3376 |                     input_sender.send((batch.clone(), ids.clone())).unwrap();
3377 |                 }
3378 |
3379 |                 for receiver in output_receivers.iter() {
3380 |                     'inner: loop {
3381 |                         if let Ok(result) = receiver.try_recv() {
3382 |                             match result {
3383 |                                 ApplyResult::NotALeader => {
3384 |                                     break 'inner;
3385 |                                 }
3386 |                                 // Otherwise keep checking until we hear it's ok.
3387 | ApplyResult::Ok => { 3388 | client_serial_id += BATCH_SIZE as u128; 3389 | // println!("Submitted: {}.", client_serial_id - 1); 3390 | break 'batch; 3391 | } 3392 | } 3393 | } 3394 | } 3395 | } 3396 | } 3397 | } 3398 | 3399 | println!("Submitted batches."); 3400 | 3401 | // Wait for completion. 3402 | let (sender, receiver) = mpsc::channel(); 3403 | while result_receivers.len() > 0 { 3404 | let receiver = result_receivers.pop().unwrap(); 3405 | let sender_clone = sender.clone(); 3406 | std::thread::spawn(move || loop { 3407 | if let Ok(_) = receiver.recv() { 3408 | sender_clone.send(1).unwrap(); 3409 | continue; 3410 | } 3411 | 3412 | return; 3413 | }); 3414 | } 3415 | 3416 | let mut seen = 0; 3417 | while seen < BATCHES * BATCH_SIZE { 3418 | if let Ok(_) = receiver.recv() { 3419 | seen += INNER_BATCH; 3420 | } 3421 | } 3422 | 3423 | let t = (Instant::now() - t1).as_secs_f64(); 3424 | println!( 3425 | "All batches (total entries: {}) complete in {}s. Throughput: {}/s.", 3426 | BATCHES * BATCH_SIZE * INNER_BATCH, 3427 | t, 3428 | (BATCHES as f64 * BATCH_SIZE as f64 * INNER_BATCH as f64) / t, 3429 | ); 3430 | 3431 | if let Ok(skip_check) = std::env::var("SKIP_CHECK") { 3432 | if skip_check == "1" { 3433 | return; 3434 | } 3435 | } 3436 | 3437 | // Give them time to all apply logs. 3438 | std::thread::sleep(Duration::from_millis(10000)); 3439 | 3440 | // Now shut down all servers. 3441 | for sender in input_senders.iter() { 3442 | sender.send((vec![], vec![])).unwrap(); 3443 | } 3444 | 3445 | // Each thread ticks for 1ms so give ours 2s to wait. 3446 | std::thread::sleep(Duration::from_millis(2000)); 3447 | 3448 | // Now check that batches are in all servers and in the 3449 | // correct order and with nothing else. 3450 | let (servers, _, _) = test_cluster(&tmpdir, port, debug); 3451 | for server in servers.iter() { 3452 | let mut state = server.state.lock().unwrap(); 3453 | let mut match_index: u64 = 0; 3454 | let mut checked_index = 0; 3455 | 3456 | println!("Checking for {}.", server.config.server_id); 3457 | assert_eq!( 3458 | state.durable.debug_client_entry_count(), 3459 | BATCH_SIZE as u64 * BATCHES as u64 3460 | ); 3461 | 3462 | while match_index < BATCH_SIZE as u64 * BATCHES as u64 * INNER_BATCH as u64 { 3463 | let mut expected_msg = vec![]; 3464 | for i in 0..INNER_BATCH { 3465 | expected_msg.extend((i as u64 + match_index).to_le_bytes().to_vec()); 3466 | } 3467 | let e = state.durable.log_at_index(checked_index); 3468 | 3469 | // It must only EITHER be 1) the one we expect or 2) an empty command. 3470 | if e.command == expected_msg { 3471 | println!( 3472 | "Found {checked_index}: {:?} ?= {:?}", 3473 | e.command, expected_msg 3474 | ); 3475 | match_index += INNER_BATCH as u64; 3476 | } else { 3477 | println!( 3478 | "Checking {checked_index}: {:?} ?= {:?}", 3479 | e.command, expected_msg 3480 | ); 3481 | assert_eq!(e.command.len(), 0); 3482 | } 3483 | 3484 | checked_index += 1; 3485 | } 3486 | 3487 | // All remaining entries must be empty messages. 3488 | while checked_index < state.durable.next_log_index - 1 { 3489 | let e = state.durable.log_at_index(checked_index); 3490 | assert_eq!(e.command.len(), 0); 3491 | checked_index += 1; 3492 | } 3493 | } 3494 | } 3495 | } 3496 | --------------------------------------------------------------------------------