├── .gitignore ├── .travis.yml ├── Cargo.toml ├── README.rst ├── examples ├── .gitignore └── demo1.rs ├── rustfmt.toml ├── src ├── lib.rs └── wal.rs └── tests ├── common └── mod.rs └── rand_fail.rs /.gitignore: -------------------------------------------------------------------------------- 1 | /target 2 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: rust 2 | rust: 3 | - stable 4 | - beta 5 | - nightly 6 | matrix: 7 | allow_failures: 8 | - rust: nightly 9 | fast_finish: true 10 | cache: cargo 11 | before_install: 12 | - sudo apt-get -y install libaio-dev 13 | script: 14 | - cargo build --verbose --all 15 | - cargo test --release --verbose --all 16 | -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "growth-ring" 3 | version = "0.3.1" 4 | authors = ["Determinant "] 5 | edition = "2018" 6 | homepage = "https://github.com/Determinant/growth-ring" 7 | keywords = ["wal", "db", "futures"] 8 | license = "MIT" 9 | description = "Simple and modular write-ahead-logging implementation." 
10 | 11 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html 12 | 13 | [dependencies] 14 | crc = "3.0.1" 15 | lru = "0.12.2" 16 | scan_fmt = "0.2.6" 17 | regex = "1.10.3" 18 | async-trait = "0.1.77" 19 | futures = "0.3.30" 20 | libaio-futures = "0.2.3" 21 | nix = { version = "0.27.1", features = ["dir"] } 22 | libc = "0.2.153" 23 | 24 | [dev-dependencies] 25 | hex = "0.4.3" 26 | rand = "0.8.5" 27 | indexmap = "2.2.2" 28 | 29 | [patch.crates-io] 30 | #libaio-futures = { path = "/home/ymf/work/current/libaio-futures" } 31 | 32 | [lib] 33 | name = "growthring" 34 | path = "src/lib.rs" 35 | crate-type = ["dylib", "rlib", "staticlib"] 36 | 37 | 38 | [profile.release] 39 | lto = true 40 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | growth-ring 2 | =========== 3 | 4 | .. image:: https://travis-ci.com/ava-labs/growth-ring.svg?token=EbLxqxy3qxjHrZKkyoP4&branch=master 5 | :target: https://travis-ci.com/Determinant/growth-ring 6 | 7 | Documentation 8 | ------------- 9 | - Latest_ 10 | 11 | .. 
_Latest: https://docs.rs/growth-ring 12 | -------------------------------------------------------------------------------- /examples/.gitignore: -------------------------------------------------------------------------------- 1 | demo 2 | testdb 3 | -------------------------------------------------------------------------------- /examples/demo1.rs: -------------------------------------------------------------------------------- 1 | use futures::executor::block_on; 2 | use growthring::{ 3 | wal::{WALBytes, WALLoader, WALRingId, WALWriter}, 4 | WALStoreAIO, 5 | }; 6 | use rand::{seq::SliceRandom, Rng, SeedableRng}; 7 | 8 | fn test( 9 | records: Vec, 10 | wal: &mut WALWriter, 11 | ) -> Vec { 12 | let mut res = Vec::new(); 13 | for r in wal.grow(records).into_iter() { 14 | let ring_id = futures::executor::block_on(r).unwrap().1; 15 | println!("got ring id: {:?}", ring_id); 16 | res.push(ring_id); 17 | } 18 | res 19 | } 20 | 21 | fn recover(payload: WALBytes, ringid: WALRingId) -> Result<(), ()> { 22 | println!( 23 | "recover(payload={}, ringid={:?}", 24 | std::str::from_utf8(&payload).unwrap(), 25 | ringid 26 | ); 27 | Ok(()) 28 | } 29 | 30 | fn main() { 31 | let wal_dir = "./wal_demo1"; 32 | let mut rng = rand::rngs::StdRng::seed_from_u64(0); 33 | let mut loader = WALLoader::new(); 34 | loader.file_nbit(9).block_nbit(8); 35 | 36 | let store = WALStoreAIO::new(&wal_dir, true, None, None).unwrap(); 37 | let mut wal = block_on(loader.load(store, recover, 0)).unwrap(); 38 | for _ in 0..3 { 39 | test( 40 | ["hi", "hello", "lol"] 41 | .iter() 42 | .map(|s| s.to_string()) 43 | .collect::>(), 44 | &mut wal, 45 | ); 46 | } 47 | for _ in 0..3 { 48 | test( 49 | vec!["a".repeat(10), "b".repeat(100), "c".repeat(1000)], 50 | &mut wal, 51 | ); 52 | } 53 | 54 | let store = WALStoreAIO::new(&wal_dir, false, None, None).unwrap(); 55 | let mut wal = block_on(loader.load(store, recover, 0)).unwrap(); 56 | for _ in 0..3 { 57 | test( 58 | vec![ 59 | "a".repeat(10), 60 | "b".repeat(100), 61 
| "c".repeat(300), 62 | "d".repeat(400), 63 | ], 64 | &mut wal, 65 | ); 66 | } 67 | 68 | let store = WALStoreAIO::new(&wal_dir, false, None, None).unwrap(); 69 | let mut wal = block_on(loader.load(store, recover, 100)).unwrap(); 70 | let mut history = std::collections::VecDeque::new(); 71 | for _ in 0..3 { 72 | let mut ids = Vec::new(); 73 | for _ in 0..3 { 74 | let mut records = Vec::new(); 75 | for _ in 0..100 { 76 | let rec = "a".repeat(rng.gen_range(1..1000)); 77 | history.push_back(rec.clone()); 78 | if history.len() > 100 { 79 | history.pop_front(); 80 | } 81 | records.push(rec) 82 | } 83 | for id in test(records, &mut wal).iter() { 84 | ids.push(*id) 85 | } 86 | } 87 | ids.shuffle(&mut rng); 88 | for e in ids.chunks(20) { 89 | println!("peel(20)"); 90 | futures::executor::block_on(wal.peel(e, 100)).unwrap(); 91 | } 92 | } 93 | for (rec, ans) in block_on( 94 | wal.read_recent_records(100, &growthring::wal::RecoverPolicy::Strict), 95 | ) 96 | .unwrap() 97 | .into_iter() 98 | .zip(history.into_iter().rev()) 99 | { 100 | assert_eq!(std::str::from_utf8(&rec).unwrap(), &ans); 101 | println!("{}", std::str::from_utf8(&rec).unwrap()); 102 | } 103 | } 104 | -------------------------------------------------------------------------------- /rustfmt.toml: -------------------------------------------------------------------------------- 1 | edition = "2018" 2 | unstable_features = true 3 | max_width = 80 4 | binop_separator = "Back" 5 | inline_attribute_width = 80 6 | fn_params_layout = "Compressed" 7 | hard_tabs = false 8 | tab_spaces = 4 9 | trailing_semicolon = false 10 | -------------------------------------------------------------------------------- /src/lib.rs: -------------------------------------------------------------------------------- 1 | //! Simple and modular write-ahead-logging implementation. 2 | //! 3 | //! # Examples 4 | //! 5 | //! ``` 6 | //! use growthring::{WALStoreAIO, wal::WALLoader}; 7 | //! use futures::executor::block_on; 8 | //! 
let mut loader = WALLoader::new(); 9 | //! loader.file_nbit(9).block_nbit(8); 10 | //! 11 | //! 12 | //! // Start with empty WAL (truncate = true). 13 | //! let store = WALStoreAIO::new("./walfiles", true, None, None).unwrap(); 14 | //! let mut wal = block_on(loader.load(store, |_, _| {Ok(())}, 0)).unwrap(); 15 | //! // Write a vector of records to WAL. 16 | //! for f in wal.grow(vec!["record1(foo)", "record2(bar)", "record3(foobar)"]).into_iter() { 17 | //! let ring_id = block_on(f).unwrap().1; 18 | //! println!("WAL recorded record to {:?}", ring_id); 19 | //! } 20 | //! 21 | //! 22 | //! // Load from WAL (truncate = false). 23 | //! let store = WALStoreAIO::new("./walfiles", false, None, None).unwrap(); 24 | //! let mut wal = block_on(loader.load(store, |payload, ringid| { 25 | //! // redo the operations in your application 26 | //! println!("recover(payload={}, ringid={:?})", 27 | //! std::str::from_utf8(&payload).unwrap(), 28 | //! ringid); 29 | //! Ok(()) 30 | //! }, 0)).unwrap(); 31 | //! // We saw some log playback, even though there was no failure. 32 | //! // Let's try to grow the WAL to create many files. 33 | //! let ring_ids = wal.grow((1..100).into_iter().map(|i| "a".repeat(i)).collect::<Vec<_>>()) 34 | //! .into_iter().map(|f| block_on(f).unwrap().1).collect::<Vec<_>>(); 35 | //! // Then assume all these records are no longer needed. We can tell WALWriter by the `peel` 36 | //! // method. 37 | //! block_on(wal.peel(ring_ids, 0)).unwrap(); 38 | //! // There will only be one remaining file in ./walfiles. 39 | //! 40 | //! let store = WALStoreAIO::new("./walfiles", false, None, None).unwrap(); 41 | //! let wal = block_on(loader.load(store, |payload, _| { 42 | //! println!("payload.len() = {}", payload.len()); 43 | //! Ok(()) 44 | //! }, 0)).unwrap(); 45 | //! // After each recovery, the ./walfiles is empty. 46 | //! 
``` 47 | 48 | #[macro_use] extern crate scan_fmt; 49 | pub mod wal; 50 | 51 | use aiofut::{AIOBuilder, AIOManager}; 52 | use async_trait::async_trait; 53 | use libc::off_t; 54 | use nix::fcntl::{fallocate, open, openat, FallocateFlags, OFlag}; 55 | use nix::sys::stat::Mode; 56 | use nix::unistd::{ftruncate, mkdir, unlinkat, UnlinkatFlags}; 57 | use std::os::fd::{AsFd, AsRawFd, BorrowedFd, FromRawFd, OwnedFd}; 58 | use std::sync::Arc; 59 | use wal::{WALBytes, WALFile, WALPos, WALStore}; 60 | 61 | pub struct WALFileAIO { 62 | fd: OwnedFd, 63 | aiomgr: Arc, 64 | } 65 | 66 | impl WALFileAIO { 67 | pub fn new( 68 | rootfd: BorrowedFd, filename: &str, aiomgr: Arc, 69 | ) -> Result { 70 | openat( 71 | rootfd.as_raw_fd(), 72 | filename, 73 | OFlag::O_CREAT | OFlag::O_RDWR, 74 | Mode::S_IRUSR | Mode::S_IWUSR, 75 | ) 76 | .and_then(|fd| { 77 | Ok(WALFileAIO { 78 | fd: unsafe { OwnedFd::from_raw_fd(fd) }, 79 | aiomgr, 80 | }) 81 | }) 82 | .or_else(|_| Err(())) 83 | } 84 | } 85 | 86 | #[async_trait(?Send)] 87 | impl WALFile for WALFileAIO { 88 | async fn allocate(&self, offset: WALPos, length: usize) -> Result<(), ()> { 89 | // TODO: is there any async version of fallocate? 
90 | fallocate( 91 | self.fd.as_raw_fd(), 92 | FallocateFlags::FALLOC_FL_ZERO_RANGE, 93 | offset as off_t, 94 | length as off_t, 95 | ) 96 | .and_then(|_| Ok(())) 97 | .or_else(|_| Err(())) 98 | } 99 | 100 | fn truncate(&self, length: usize) -> Result<(), ()> { 101 | ftruncate(&self.fd, length as off_t).or_else(|_| Err(())) 102 | } 103 | 104 | async fn write(&self, offset: WALPos, data: WALBytes) -> Result<(), ()> { 105 | let (res, data) = self 106 | .aiomgr 107 | .write(self.fd.as_raw_fd(), offset, data, None) 108 | .await; 109 | res.or_else(|_| Err(())).and_then(|nwrote| { 110 | if nwrote == data.len() { 111 | Ok(()) 112 | } else { 113 | Err(()) 114 | } 115 | }) 116 | } 117 | 118 | async fn read( 119 | &self, offset: WALPos, length: usize, 120 | ) -> Result, ()> { 121 | let (res, data) = self 122 | .aiomgr 123 | .read(self.fd.as_raw_fd(), offset, length, None) 124 | .await; 125 | res.or_else(|_| Err(())).and_then(|nread| { 126 | Ok(if nread == length { Some(data) } else { None }) 127 | }) 128 | } 129 | } 130 | 131 | pub struct WALStoreAIO { 132 | rootfd: OwnedFd, 133 | aiomgr: Arc, 134 | } 135 | 136 | unsafe impl Send for WALStoreAIO {} 137 | 138 | impl WALStoreAIO { 139 | pub fn new( 140 | wal_dir: &str, truncate: bool, rootfd: Option, 141 | aiomgr: Option, 142 | ) -> Result { 143 | let aiomgr = Arc::new(aiomgr.ok_or(Err(())).or_else( 144 | |_: Result| { 145 | AIOBuilder::default().build().or(Err(())) 146 | }, 147 | )?); 148 | 149 | if truncate { 150 | let _ = std::fs::remove_dir_all(wal_dir); 151 | } 152 | let walfd; 153 | match rootfd { 154 | None => { 155 | match mkdir( 156 | wal_dir, 157 | Mode::S_IRUSR | Mode::S_IWUSR | Mode::S_IXUSR, 158 | ) { 159 | Err(e) => { 160 | if truncate { 161 | panic!("error while creating directory: {}", e) 162 | } 163 | } 164 | Ok(_) => (), 165 | } 166 | walfd = match open( 167 | wal_dir, 168 | OFlag::O_DIRECTORY | OFlag::O_PATH, 169 | Mode::empty(), 170 | ) { 171 | Ok(fd) => fd, 172 | Err(_) => panic!("error while opening the 
WAL directory"), 173 | } 174 | } 175 | Some(fd) => { 176 | let dirstr = std::ffi::CString::new(wal_dir).unwrap(); 177 | let ret = unsafe { 178 | libc::mkdirat( 179 | fd.as_raw_fd(), 180 | dirstr.as_ptr(), 181 | libc::S_IRUSR | libc::S_IWUSR | libc::S_IXUSR, 182 | ) 183 | }; 184 | if ret != 0 { 185 | if truncate { 186 | panic!("error while creating directory") 187 | } 188 | } 189 | walfd = match nix::fcntl::openat( 190 | fd.as_raw_fd(), 191 | wal_dir, 192 | OFlag::O_DIRECTORY | OFlag::O_PATH, 193 | Mode::empty(), 194 | ) { 195 | Ok(fd) => fd, 196 | Err(_) => panic!("error while opening the WAL directory"), 197 | } 198 | } 199 | } 200 | Ok(WALStoreAIO { 201 | rootfd: unsafe { OwnedFd::from_raw_fd(walfd) }, 202 | aiomgr, 203 | }) 204 | } 205 | } 206 | 207 | #[async_trait(?Send)] 208 | impl WALStore for WALStoreAIO { 209 | type FileNameIter = std::vec::IntoIter; 210 | 211 | async fn open_file( 212 | &self, filename: &str, _touch: bool, 213 | ) -> Result, ()> { 214 | let filename = filename.to_string(); 215 | WALFileAIO::new(self.rootfd.as_fd(), &filename, self.aiomgr.clone()) 216 | .and_then(|f| Ok(Box::new(f) as Box)) 217 | } 218 | 219 | async fn remove_file(&self, filename: String) -> Result<(), ()> { 220 | unlinkat( 221 | Some(self.rootfd.as_raw_fd()), 222 | filename.as_str(), 223 | UnlinkatFlags::NoRemoveDir, 224 | ) 225 | .or_else(|_| Err(())) 226 | } 227 | 228 | fn enumerate_files(&self) -> Result { 229 | let mut logfiles = Vec::new(); 230 | for ent in nix::dir::Dir::openat( 231 | self.rootfd.as_raw_fd(), 232 | "./", 233 | OFlag::empty(), 234 | Mode::empty(), 235 | ) 236 | .unwrap() 237 | .iter() 238 | { 239 | logfiles 240 | .push(ent.unwrap().file_name().to_str().unwrap().to_string()) 241 | } 242 | Ok(logfiles.into_iter()) 243 | } 244 | } 245 | -------------------------------------------------------------------------------- /src/wal.rs: -------------------------------------------------------------------------------- 1 | use async_trait::async_trait; 2 | use 
futures::{ 3 | future::{self, FutureExt, TryFutureExt}, 4 | stream::StreamExt, 5 | Future, 6 | }; 7 | 8 | use std::cell::{RefCell, UnsafeCell}; 9 | use std::collections::{hash_map, BinaryHeap, HashMap, VecDeque}; 10 | use std::convert::{TryFrom, TryInto}; 11 | use std::mem::MaybeUninit; 12 | use std::num::NonZeroUsize; 13 | use std::pin::Pin; 14 | 15 | const FILENAME_FMT: &str = r"[0-9a-f]+\.log"; 16 | 17 | enum WALRingType { 18 | #[allow(dead_code)] 19 | Null = 0x0, 20 | Full, 21 | First, 22 | Middle, 23 | Last, 24 | } 25 | 26 | #[repr(packed)] 27 | struct WALRingBlob { 28 | counter: u32, 29 | crc32: u32, 30 | rsize: u32, 31 | rtype: u8, 32 | // payload follows 33 | } 34 | 35 | impl TryFrom for WALRingType { 36 | type Error = (); 37 | fn try_from(v: u8) -> Result { 38 | match v { 39 | x if x == WALRingType::Null as u8 => Ok(WALRingType::Null), 40 | x if x == WALRingType::Full as u8 => Ok(WALRingType::Full), 41 | x if x == WALRingType::First as u8 => Ok(WALRingType::First), 42 | x if x == WALRingType::Middle as u8 => Ok(WALRingType::Middle), 43 | x if x == WALRingType::Last as u8 => Ok(WALRingType::Last), 44 | _ => Err(()), 45 | } 46 | } 47 | } 48 | 49 | type WALFileId = u64; 50 | pub type WALBytes = Box<[u8]>; 51 | pub type WALPos = u64; 52 | 53 | fn get_fid(fname: &str) -> WALFileId { 54 | scan_fmt!(fname, "{x}.log", [hex WALFileId]).unwrap() 55 | } 56 | 57 | fn get_fname(fid: WALFileId) -> String { 58 | format!("{:08x}.log", fid) 59 | } 60 | 61 | fn sort_fids(file_nbit: u64, mut fids: Vec) -> Vec<(u8, u64)> { 62 | let (min, max) = fids.iter().fold((u64::MAX, u64::MIN), |acc, fid| { 63 | ((*fid).min(acc.0), (*fid).max(acc.1)) 64 | }); 65 | let fid_half = u64::MAX >> (file_nbit + 1); 66 | if max > min && max - min > fid_half { 67 | // we treat this as u64 overflow has happened, take proper care here 68 | let mut aux: Vec<_> = fids 69 | .into_iter() 70 | .map(|fid| (if fid < fid_half { 1 } else { 0 }, fid)) 71 | .collect(); 72 | aux.sort(); 73 | aux 74 | } else { 
75 | fids.sort(); 76 | fids.into_iter().map(|fid| (0, fid)).collect() 77 | } 78 | } 79 | 80 | fn counter_lt(a: u32, b: u32) -> bool { 81 | if u32::abs_diff(a, b) > u32::MAX / 2 { 82 | b < a 83 | } else { 84 | a < b 85 | } 86 | } 87 | 88 | #[repr(C)] 89 | struct Header { 90 | /// all preceding files ((); 95 | 96 | #[repr(C)] 97 | #[derive(Eq, PartialEq, Copy, Clone, Debug, Hash)] 98 | pub struct WALRingId { 99 | start: WALPos, 100 | end: WALPos, 101 | counter: u32, 102 | } 103 | 104 | impl WALRingId { 105 | pub fn empty_id() -> Self { 106 | WALRingId { 107 | start: 0, 108 | end: 0, 109 | counter: 0, 110 | } 111 | } 112 | pub fn get_start(&self) -> WALPos { 113 | self.start 114 | } 115 | pub fn get_end(&self) -> WALPos { 116 | self.end 117 | } 118 | } 119 | 120 | impl Ord for WALRingId { 121 | fn cmp(&self, other: &WALRingId) -> std::cmp::Ordering { 122 | other 123 | .start 124 | .cmp(&self.start) 125 | .then_with(|| other.end.cmp(&self.end)) 126 | } 127 | } 128 | 129 | impl PartialOrd for WALRingId { 130 | fn partial_cmp(&self, other: &WALRingId) -> Option { 131 | Some(self.cmp(other)) 132 | } 133 | } 134 | 135 | pub trait Record { 136 | fn serialize(&self) -> WALBytes; 137 | } 138 | 139 | impl Record for WALBytes { 140 | fn serialize(&self) -> WALBytes { 141 | self[..].into() 142 | } 143 | } 144 | 145 | impl Record for String { 146 | fn serialize(&self) -> WALBytes { 147 | self.as_bytes().into() 148 | } 149 | } 150 | 151 | impl Record for &str { 152 | fn serialize(&self) -> WALBytes { 153 | self.as_bytes().into() 154 | } 155 | } 156 | 157 | /// the state for a WAL writer 158 | struct WALState { 159 | /// the next position for a record, addressed in the entire WAL space 160 | next: WALPos, 161 | /// number of bits for a file 162 | file_nbit: u64, 163 | next_complete: WALRingId, 164 | counter: u32, 165 | io_complete: BinaryHeap, 166 | pending_removal: VecDeque<(WALFileId, u32)>, 167 | } 168 | 169 | #[async_trait(?Send)] 170 | pub trait WALFile { 171 | /// Initialize 
the file space in [offset, offset + length) to zero. 172 | async fn allocate(&self, offset: WALPos, length: usize) -> Result<(), ()>; 173 | /// Write `data` at the given offset. We assume all previous `allocate`/`truncate` invocations are visible 174 | /// if ordered earlier (should be guaranteed by most OSes). Additionally, the write caused 175 | /// by each invocation of this function should be _atomic_ (the entire single write should be 176 | /// all or nothing). 177 | async fn write(&self, offset: WALPos, data: WALBytes) -> Result<(), ()>; 178 | /// Read `length` bytes starting at the given offset. Return `Ok(None)` when it reaches EOF. 179 | async fn read( 180 | &self, offset: WALPos, length: usize, 181 | ) -> Result, ()>; 182 | /// Truncate a file to a specified length. 183 | fn truncate(&self, length: usize) -> Result<(), ()>; 184 | } 185 | 186 | #[async_trait(?Send)] 187 | pub trait WALStore { 188 | type FileNameIter: Iterator; 189 | 190 | /// Open a file given the filename, creating the file if it does not exist when `touch` is `true`. 191 | async fn open_file( 192 | &self, filename: &str, touch: bool, 193 | ) -> Result, ()>; 194 | /// Unlink a file given the filename. 195 | async fn remove_file(&self, filename: String) -> Result<(), ()>; 196 | /// Enumerate all WAL filenames. It should include all WAL files that were previously opened 197 | /// (created) but not removed. The list could be unordered. 
198 | fn enumerate_files(&self) -> Result; 199 | } 200 | 201 | struct WALFileHandle<'a, F: WALStore> { 202 | fid: WALFileId, 203 | handle: &'a dyn WALFile, 204 | pool: *const WALFilePool, 205 | } 206 | 207 | impl<'a, F: WALStore> std::ops::Deref for WALFileHandle<'a, F> { 208 | type Target = dyn WALFile + 'a; 209 | fn deref(&self) -> &Self::Target { 210 | self.handle 211 | } 212 | } 213 | 214 | impl<'a, F: WALStore> Drop for WALFileHandle<'a, F> { 215 | fn drop(&mut self) { 216 | unsafe { 217 | (&*self.pool).release_file(self.fid); 218 | } 219 | } 220 | } 221 | 222 | /// The middle layer that manages WAL file handles and invokes public trait functions to actually 223 | /// manipulate files and their contents. 224 | struct WALFilePool { 225 | store: F, 226 | header_file: Box, 227 | handle_cache: RefCell>>, 228 | handle_used: 229 | RefCell, usize)>>>, 230 | last_write: 231 | UnsafeCell>>>>>, 232 | last_peel: 233 | UnsafeCell>>>>>, 234 | file_nbit: u64, 235 | file_size: u64, 236 | block_nbit: u64, 237 | } 238 | 239 | impl WALFilePool { 240 | async fn new( 241 | store: F, file_nbit: u64, block_nbit: u64, cache_size: NonZeroUsize, 242 | ) -> Result { 243 | let file_nbit = file_nbit as u64; 244 | let block_nbit = block_nbit as u64; 245 | let header_file = store.open_file("HEAD", true).await?; 246 | header_file.truncate(HEADER_SIZE)?; 247 | Ok(WALFilePool { 248 | store, 249 | header_file, 250 | handle_cache: RefCell::new(lru::LruCache::new(cache_size)), 251 | handle_used: RefCell::new(HashMap::new()), 252 | last_write: UnsafeCell::new(MaybeUninit::new(Box::pin( 253 | future::ready(Ok(())), 254 | ))), 255 | last_peel: UnsafeCell::new(MaybeUninit::new(Box::pin( 256 | future::ready(Ok(())), 257 | ))), 258 | file_nbit, 259 | file_size: 1 << (file_nbit as u64), 260 | block_nbit, 261 | }) 262 | } 263 | 264 | async fn read_header(&self) -> Result { 265 | let bytes = self.header_file.read(0, HEADER_SIZE).await?.unwrap(); 266 | let bytes: [u8; HEADER_SIZE] = 
(&*bytes).try_into().unwrap(); 267 | let header = unsafe { std::mem::transmute::<_, Header>(bytes) }; 268 | Ok(header) 269 | } 270 | 271 | async fn write_header(&self, header: &Header) -> Result<(), ()> { 272 | let base = header as *const Header as usize as *const u8; 273 | let bytes = unsafe { std::slice::from_raw_parts(base, HEADER_SIZE) }; 274 | self.header_file.write(0, bytes.into()).await?; 275 | Ok(()) 276 | } 277 | 278 | fn get_file<'a>( 279 | &'a self, fid: u64, touch: bool, 280 | ) -> impl Future, ()>> { 281 | async move { 282 | let pool = self as *const WALFilePool; 283 | if let Some(h) = self.handle_cache.borrow_mut().pop(&fid) { 284 | let handle = match self.handle_used.borrow_mut().entry(fid) { 285 | hash_map::Entry::Vacant(e) => unsafe { 286 | &*(*e.insert(UnsafeCell::new((h, 1))).get()).0 287 | }, 288 | _ => unreachable!(), 289 | }; 290 | Ok(WALFileHandle { fid, handle, pool }) 291 | } else { 292 | let v = unsafe { 293 | &mut *match self.handle_used.borrow_mut().entry(fid) { 294 | hash_map::Entry::Occupied(e) => e.into_mut(), 295 | hash_map::Entry::Vacant(e) => { 296 | e.insert(UnsafeCell::new(( 297 | self.store 298 | .open_file(&get_fname(fid), touch) 299 | .await?, 300 | 0, 301 | ))) 302 | } 303 | } 304 | .get() 305 | }; 306 | v.1 += 1; 307 | Ok(WALFileHandle { 308 | fid, 309 | handle: &*v.0, 310 | pool, 311 | }) 312 | } 313 | } 314 | } 315 | 316 | fn release_file(&self, fid: WALFileId) { 317 | match self.handle_used.borrow_mut().entry(fid) { 318 | hash_map::Entry::Occupied(e) => { 319 | let v = unsafe { &mut *e.get().get() }; 320 | v.1 -= 1; 321 | if v.1 == 0 { 322 | self.handle_cache 323 | .borrow_mut() 324 | .put(fid, e.remove().into_inner().0); 325 | } 326 | } 327 | _ => unreachable!(), 328 | } 329 | } 330 | 331 | fn write<'a>( 332 | &'a mut self, writes: Vec<(WALPos, WALBytes)>, 333 | ) -> Vec> + 'a>>> { 334 | if writes.is_empty() { 335 | return Vec::new() 336 | } 337 | let file_size = self.file_size; 338 | let file_nbit = self.file_nbit; 339 
| let meta: Vec<(u64, u64)> = writes 340 | .iter() 341 | .map(|(off, w)| ((*off) >> file_nbit, w.len() as u64)) 342 | .collect(); 343 | let mut files: Vec + 'a>>> = Vec::new(); 344 | for &(fid, _) in meta.iter() { 345 | files.push(Box::pin(self.get_file(fid, true)) 346 | as Pin + 'a>>) 347 | } 348 | let mut fid = writes[0].0 >> file_nbit; 349 | let mut alloc_start = writes[0].0 & (self.file_size - 1); 350 | let mut alloc_end = alloc_start + writes[0].1.len() as u64; 351 | let last_write = unsafe { 352 | std::mem::replace( 353 | &mut *self.last_write.get(), 354 | std::mem::MaybeUninit::uninit(), 355 | ) 356 | .assume_init() 357 | }; 358 | // pre-allocate the file space 359 | let alloc = async move { 360 | last_write.await?; 361 | let mut last_h: Option< 362 | Pin< 363 | Box< 364 | dyn Future, ()>> 365 | + 'a, 366 | >, 367 | >, 368 | > = None; 369 | for ((next_fid, wl), h) in meta.into_iter().zip(files.into_iter()) { 370 | if let Some(lh) = last_h.take() { 371 | if next_fid != fid { 372 | lh.await? 373 | .allocate( 374 | alloc_start, 375 | (alloc_end - alloc_start) as usize, 376 | ) 377 | .await?; 378 | last_h = Some(h); 379 | alloc_start = 0; 380 | alloc_end = alloc_start + wl; 381 | fid = next_fid; 382 | } else { 383 | last_h = Some(lh); 384 | alloc_end += wl; 385 | } 386 | } else { 387 | last_h = Some(h); 388 | } 389 | } 390 | if let Some(lh) = last_h { 391 | lh.await? 392 | .allocate(alloc_start, (alloc_end - alloc_start) as usize) 393 | .await? 
394 | } 395 | Ok(()) 396 | }; 397 | let mut res = Vec::new(); 398 | let mut prev = Box::pin(alloc) as Pin + 'a>>; 399 | for (off, w) in writes.into_iter() { 400 | let f = self.get_file(off >> file_nbit, true); 401 | let w = (async move { 402 | prev.await?; 403 | f.await?.write(off & (file_size - 1), w).await 404 | }) 405 | .shared(); 406 | prev = Box::pin(w.clone()); 407 | res.push(Box::pin(w) as Pin + 'a>>) 408 | } 409 | unsafe { 410 | (*self.last_write.get()) = MaybeUninit::new(std::mem::transmute::< 411 | Pin + 'a>>, 412 | Pin + 'static>>, 413 | >(prev)) 414 | } 415 | res 416 | } 417 | 418 | fn remove_files<'a>( 419 | &'a mut self, state: &mut WALState, keep_nrecords: u32, 420 | ) -> impl Future> + 'a { 421 | let last_peel = unsafe { 422 | std::mem::replace( 423 | &mut *self.last_peel.get(), 424 | std::mem::MaybeUninit::uninit(), 425 | ) 426 | .assume_init() 427 | }; 428 | 429 | let mut removes: Vec>>>> = 430 | Vec::new(); 431 | while state.pending_removal.len() > 1 { 432 | let (fid, counter) = state.pending_removal.front().unwrap(); 433 | if counter_lt(counter + keep_nrecords, state.counter) { 434 | removes.push(self.store.remove_file(get_fname(*fid)) 435 | as Pin + 'a>>); 436 | state.pending_removal.pop_front(); 437 | } else { 438 | break 439 | } 440 | } 441 | let p = async move { 442 | last_peel.await.ok(); 443 | for r in removes.into_iter() { 444 | r.await.ok(); 445 | } 446 | Ok(()) 447 | } 448 | .shared(); 449 | unsafe { 450 | (*self.last_peel.get()) = 451 | MaybeUninit::new(std::mem::transmute(Box::pin(p.clone()) 452 | as Pin + 'a>>)) 453 | } 454 | p 455 | } 456 | 457 | fn in_use_len(&self) -> usize { 458 | self.handle_used.borrow().len() 459 | } 460 | 461 | fn reset(&mut self) { 462 | self.handle_cache.borrow_mut().clear(); 463 | self.handle_used.borrow_mut().clear() 464 | } 465 | } 466 | 467 | pub struct WALWriter { 468 | state: WALState, 469 | file_pool: WALFilePool, 470 | block_buffer: WALBytes, 471 | block_size: u32, 472 | msize: usize, 473 | } 474 | 
475 | unsafe impl Send for WALWriter where F: WALStore + Send {} 476 | 477 | impl WALWriter { 478 | fn new(state: WALState, file_pool: WALFilePool) -> Self { 479 | let mut b = Vec::new(); 480 | let block_size = 1 << file_pool.block_nbit as u32; 481 | let msize = std::mem::size_of::(); 482 | b.resize(block_size as usize, 0); 483 | WALWriter { 484 | state, 485 | file_pool, 486 | block_buffer: b.into_boxed_slice(), 487 | block_size, 488 | msize, 489 | } 490 | } 491 | 492 | /// Submit a sequence of records to WAL. It returns a vector of futures, each of which 493 | /// corresponds to one record. When a future resolves to `WALRingId`, it is guaranteed the 494 | /// record is already logged. Then, after finalizing the changes encoded by that record to 495 | /// the persistent storage, the caller can recycle the WAL files by invoking the given 496 | /// `peel` with the given `WALRingId`s. Note: each serialized record should contain at least 1 497 | /// byte (empty record payload will result in assertion failure). 
498 | pub fn grow<'a, R: Record + 'a>( 499 | &'a mut self, records: Vec, 500 | ) -> Vec> + 'a> { 501 | let mut res = Vec::new(); 502 | let mut writes = Vec::new(); 503 | let msize = self.msize as u32; 504 | // offset within the current block (`self.state.next` masked by `block_size - 1`) 505 | // at which the unwritten data starts 506 | let mut bbuff_start = self.state.next as u32 & (self.block_size - 1); 507 | // the end of the unwritten data (also an offset into the block buffer) 508 | let mut bbuff_cur = bbuff_start; 509 | 510 | for rec in records.iter() { 511 | let bytes = rec.serialize(); 512 | let mut rec = &bytes[..]; 513 | let mut rsize = rec.len() as u32; 514 | let mut ring_start = None; 515 | assert!(rsize > 0); 516 | while rsize > 0 { 517 | let remain = self.block_size - bbuff_cur; 518 | if remain > msize { 519 | let d = remain - msize; 520 | let rs0 = 521 | self.state.next + (bbuff_cur - bbuff_start) as u64; 522 | let blob = unsafe { 523 | std::mem::transmute::<*mut u8, &mut WALRingBlob>( 524 | (&mut self.block_buffer[bbuff_cur as usize..]) 525 | .as_mut_ptr(), 526 | ) 527 | }; 528 | bbuff_cur += msize; 529 | if d >= rsize { 530 | // the remaining rec fits in the block 531 | let payload = rec; 532 | blob.counter = self.state.counter; 533 | blob.crc32 = CRC32.checksum(payload); 534 | blob.rsize = rsize; 535 | let (rs, rt) = if let Some(rs) = ring_start.take() { 536 | self.state.counter += 1; 537 | (rs, WALRingType::Last) 538 | } else { 539 | self.state.counter += 1; 540 | (rs0, WALRingType::Full) 541 | }; 542 | blob.rtype = rt as u8; 543 | self.block_buffer[bbuff_cur as usize.. 
544 | bbuff_cur as usize + payload.len()] 545 | .copy_from_slice(payload); 546 | bbuff_cur += rsize; 547 | rsize = 0; 548 | let end = 549 | self.state.next + (bbuff_cur - bbuff_start) as u64; 550 | res.push(( 551 | WALRingId { 552 | start: rs, 553 | end, 554 | counter: blob.counter, 555 | }, 556 | Vec::new(), 557 | )); 558 | } else { 559 | // the remaining block can only accommodate partial rec 560 | let payload = &rec[..d as usize]; 561 | blob.counter = self.state.counter; 562 | blob.crc32 = CRC32.checksum(payload); 563 | blob.rsize = d; 564 | blob.rtype = if ring_start.is_some() { 565 | WALRingType::Middle 566 | } else { 567 | ring_start = Some(rs0); 568 | WALRingType::First 569 | } as u8; 570 | self.block_buffer[bbuff_cur as usize.. 571 | bbuff_cur as usize + payload.len()] 572 | .copy_from_slice(payload); 573 | bbuff_cur += d; 574 | rsize -= d; 575 | rec = &rec[d as usize..]; 576 | } 577 | } else { 578 | // add padding space by moving the point to the end of the block 579 | bbuff_cur = self.block_size; 580 | } 581 | if bbuff_cur == self.block_size { 582 | writes.push(( 583 | self.state.next, 584 | self.block_buffer[bbuff_start as usize..] 
585 | .to_vec() 586 | .into_boxed_slice(), 587 | )); 588 | self.state.next += (self.block_size - bbuff_start) as u64; 589 | bbuff_start = 0; 590 | bbuff_cur = 0; 591 | } 592 | } 593 | } 594 | if bbuff_cur > bbuff_start { 595 | writes.push(( 596 | self.state.next, 597 | self.block_buffer[bbuff_start as usize..bbuff_cur as usize] 598 | .to_vec() 599 | .into_boxed_slice(), 600 | )); 601 | self.state.next += (bbuff_cur - bbuff_start) as u64; 602 | } 603 | 604 | // for each record, collect the indices of the writes whose byte range overlaps the record 605 | let mut i = 0; 606 | 'outer: for (j, (off, w)) in writes.iter().enumerate() { 607 | let blk_s = *off; 608 | let blk_e = blk_s + w.len() as u64; 609 | while res[i].0.end <= blk_s { 610 | i += 1; 611 | if i >= res.len() { 612 | break 'outer 613 | } 614 | } 615 | while res[i].0.start < blk_e { 616 | res[i].1.push(j); 617 | if res[i].0.end >= blk_e { 618 | break 619 | } 620 | i += 1; 621 | if i >= res.len() { 622 | break 'outer 623 | } 624 | } 625 | } 626 | 627 | let writes: Vec> = self 628 | .file_pool 629 | .write(writes) 630 | .into_iter() 631 | .map(move |f| async move { f.await }.shared()) 632 | .collect(); 633 | let res = res 634 | .into_iter() 635 | .zip(records.into_iter()) 636 | .map(|((ringid, blks), rec)| { 637 | future::try_join_all( 638 | blks.into_iter().map(|idx| writes[idx].clone()), 639 | ) 640 | .or_else(|_| future::ready(Err(()))) 641 | .and_then(move |_| future::ready(Ok((rec, ringid)))) 642 | }) 643 | .collect(); 644 | res 645 | } 646 | 647 | /// Inform the `WALWriter` that some data writes are complete so that it can automatically 648 | /// remove obsolete WAL files. The given list of `WALRingId` does not need to be ordered and 649 | /// could be of arbitrary length. Use `0` for `keep_nrecords` if all obsolete WAL files 650 | /// need to be removed (the obsolete files do not affect the speed of recovery or correctness). 
// NOTE: the methods below continue the enclosing `impl` block whose header precedes this span.
    pub async fn peel<'a, T: AsRef<[WALRingId]>>(
        &'a mut self, records: T, keep_nrecords: u32,
    ) -> Result<(), ()> {
        let msize = self.msize as u64;
        let block_size = self.block_size as u64;
        let state = &mut self.state;
        for rec in records.as_ref() {
            state.io_complete.push(*rec);
        }
        // Consume completed rings only while they are contiguous with the
        // already-completed prefix (`next_complete.end`).
        while let Some(s) = state.io_complete.peek().map(|e| e.start) {
            if s != state.next_complete.end {
                break
            }
            let mut m = state.io_complete.pop().unwrap();
            // Skip the block tail when it is too small to hold another header.
            let block_remain = block_size - (m.end & (block_size - 1));
            if block_remain <= msize {
                m.end += block_remain
            }
            let fid = m.start >> state.file_nbit;
            match state.pending_removal.back_mut() {
                Some(l) => {
                    if l.0 == fid {
                        l.1 = m.counter
                    } else {
                        for i in l.0 + 1..fid + 1 {
                            state.pending_removal.push_back((i, m.counter))
                        }
                    }
                }
                None => state.pending_removal.push_back((fid, m.counter)),
            }
            state.next_complete = m;
        }
        // The removal result is discarded on purpose: `peel` reports `Ok`
        // even if obsolete files could not be removed this time.
        self.file_pool.remove_files(state, keep_nrecords).await.ok();
        Ok(())
    }

    /// Number of file handles the file pool currently reports as in use.
    pub fn file_pool_in_use(&self) -> usize {
        self.file_pool.in_use_len()
    }

    /// Read up to `nrecords` of the latest records by walking the log files
    /// and the rings inside each file in reverse order, so the returned
    /// vector is ordered newest first.
    ///
    /// Multi-ring records are re-assembled from their `Last`, `Middle` and
    /// `First` parts (encountered in that order when scanning backwards).
    ///
    /// # Errors
    /// Returns `Err(())` on I/O failure, on a stream error, on a checksum
    /// mismatch, or (with `RecoverPolicy::Strict`) on an unknown ring type.
    ///
    /// # Panics
    /// Panics if the ring sequence is inconsistent (e.g. a `First`/`Middle`
    /// ring without a pending `Last`, or a `Full`/`Last` ring while a
    /// multi-ring record is pending).
    pub async fn read_recent_records<'a>(
        &'a self, nrecords: usize, recover_policy: &RecoverPolicy,
    ) -> Result<Vec<WALBytes>, ()> {
        let filename_fmt = regex::Regex::new(FILENAME_FMT).unwrap();
        let file_pool = &self.file_pool;
        let file_nbit = file_pool.file_nbit;
        let block_size = 1 << file_pool.block_nbit;
        let msize = std::mem::size_of::<WALRingBlob>();

        let logfiles = sort_fids(
            file_nbit,
            file_pool
                .store
                .enumerate_files()?
                .filter(|f| filename_fmt.is_match(f))
                .map(|s| get_fid(&s))
                .collect(),
        );

        let mut chunks: Option<Vec<WALBytes>> = None;
        let mut records = Vec::new();
        'outer: for (_, fid) in logfiles.into_iter().rev() {
            let f = file_pool.get_file(fid, false).await?;
            let ring_stream = WALLoader::read_rings(
                &f,
                true,
                file_pool.block_nbit,
                recover_policy,
            );
            let mut off = fid << file_nbit;
            let mut rings = Vec::new();
            futures::pin_mut!(ring_stream);
            while let Some(ring) = ring_stream.next().await {
                rings.push(ring);
            }
            for ring in rings.into_iter().rev() {
                let ring = ring.map_err(|_| ())?;
                let (header, payload) = ring;
                // `read_rings` was asked to read payloads, so this is `Some`.
                let payload = payload.unwrap();
                match header.rtype.try_into() {
                    Ok(WALRingType::Full) => {
                        assert!(chunks.is_none());
                        if !WALLoader::verify_checksum_(
                            &payload,
                            header.crc32,
                            recover_policy,
                        )? {
                            return Err(())
                        }
                        off += header.rsize as u64;
                        records.push(payload);
                    }
                    Ok(WALRingType::First) => {
                        if !WALLoader::verify_checksum_(
                            &payload,
                            header.crc32,
                            recover_policy,
                        )? {
                            return Err(())
                        }
                        if let Some(mut chunks) = chunks.take() {
                            chunks.push(payload);
                            // Chunks were gathered back-to-front; restore
                            // the original byte order.
                            let mut acc = Vec::new();
                            for chunk in chunks.into_iter().rev() {
                                acc.extend(chunk.iter());
                            }
                            records.push(acc.into());
                        } else {
                            unreachable!()
                        }
                        off += header.rsize as u64;
                    }
                    Ok(WALRingType::Middle) => {
                        if let Some(chunks) = &mut chunks {
                            chunks.push(payload);
                        } else {
                            unreachable!()
                        }
                        off += header.rsize as u64;
                    }
                    Ok(WALRingType::Last) => {
                        assert!(chunks.is_none());
                        chunks = Some(vec![payload]);
                        off += header.rsize as u64;
                    }
                    Ok(WALRingType::Null) => break,
                    Err(_) => match recover_policy {
                        RecoverPolicy::Strict => return Err(()),
                        RecoverPolicy::BestEffort => break 'outer,
                    },
                }
                let block_remain = block_size - (off & (block_size - 1));
                if block_remain <= msize as u64 {
                    off += block_remain;
                }
                if records.len() >= nrecords {
                    break 'outer
                }
            }
        }
        Ok(records)
    }
}

#[derive(Copy, Clone)]
pub enum RecoverPolicy {
    /// all checksums must be correct, otherwise recovery fails
    Strict,
    /// stop recovering when hitting the first corrupted record
    BestEffort,
}

/// Builder-style configuration for recovering a WAL from a store.
pub struct WALLoader {
    file_nbit: u64,
    block_nbit: u64,
    cache_size: NonZeroUsize,
    recover_policy: RecoverPolicy,
}

impl Default for WALLoader {
    fn default() -> Self {
        WALLoader {
            file_nbit: 22,  // 4MB
            block_nbit: 15, // 32KB,
            cache_size: NonZeroUsize::new(16).unwrap(),
            recover_policy: RecoverPolicy::Strict,
        }
    }
}

impl WALLoader {
    /// Create a loader with the default settings (see `Default`).
    pub fn new() -> Self {
        Default::default()
    }

    /// Set the number of bits of a file offset (file size is `1 << v`).
    pub fn file_nbit(&mut self, v: u64) -> &mut Self {
        self.file_nbit = v;
        self
    }

    /// Set the number of bits of a block offset (block size is `1 << v`).
    pub fn block_nbit(&mut self, v: u64) -> &mut Self {
        self.block_nbit = v;
        self
    }

    /// Set the size of the file-handle cache passed to the file pool.
    pub fn cache_size(&mut self, v: NonZeroUsize) -> &mut Self {
        self.cache_size = v;
        self
    }

    /// Set how corrupted data is treated during recovery.
    pub fn recover_policy(&mut self, p: RecoverPolicy) -> &mut Self {
        self.recover_policy = p;
        self
    }

    /// Compare `checksum` with the CRC32 of `data`.
    ///
    /// Returns `Ok(true)` on a match. On a mismatch the result depends on
    /// the policy: `Err(())` for `Strict`, `Ok(false)` for `BestEffort`.
    fn verify_checksum_(
        data: &[u8], checksum: u32, p: &RecoverPolicy,
    ) -> Result<bool, ()> {
        if checksum == CRC32.checksum(data) {
            Ok(true)
        } else {
            match p {
                RecoverPolicy::Strict => Err(()),
                RecoverPolicy::BestEffort => Ok(false),
            }
        }
    }

    /// `verify_checksum_` using this loader's own recover policy.
    fn verify_checksum(&self, data: &[u8], checksum: u32) -> Result<bool, ()> {
        Self::verify_checksum_(data, checksum, &self.recover_policy)
    }

    /// Stream the rings of `file` from offset 0 as `(header, payload)`.
    ///
    /// The payload is `None` unless `read_payload` is set. The stream ends
    /// at the first `Null` ring or when a header can no longer be read.
    /// An `Err(true)` item signals a hard failure; `Err(false)` signals an
    /// unknown ring type under `RecoverPolicy::BestEffort`. Either error is
    /// the last item of the stream.
    fn read_rings<'a, F: WALStore + 'a>(
        file: &'a WALFileHandle<'a, F>, read_payload: bool, block_nbit: u64,
        recover_policy: &'a RecoverPolicy,
    ) -> impl futures::Stream<Item = Result<(WALRingBlob, Option<WALBytes>), bool>>
           + 'a {
        let block_size = 1 << block_nbit;
        let msize = std::mem::size_of::<WALRingBlob>();

        struct Vars<'a, F: WALStore> {
            done: bool,
            off: u64,
            file: &'a WALFileHandle<'a, F>,
        }

        let vars = std::rc::Rc::new(std::cell::RefCell::new(Vars {
            done: false,
            off: 0,
            file,
        }));

        futures::stream::unfold((), move |_| {
            let v = vars.clone();
            async move {
                let mut v = v.borrow_mut();

                // Unwrap an `Ok`, or terminate the stream with a hard error.
                macro_rules! check {
                    ($res: expr) => {
                        match $res {
                            Ok(t) => t,
                            Err(_) => die!(),
                        }
                    };
                }

                macro_rules! die {
                    () => {{
                        v.done = true;
                        return Some((Err(true), ()))
                    }};
                }

                macro_rules! _yield {
                    () => {{
                        v.done = true;
                        return None
                    }};
                    ($v: expr) => {{
                        let v = $v;
                        catch_up!();
                        return Some((Ok(v), ()))
                    }};
                }

                // Skip the block tail when it cannot hold another header.
                macro_rules! catch_up {
                    () => {{
                        let block_remain =
                            block_size - (v.off & (block_size - 1));
                        if block_remain <= msize as u64 {
                            v.off += block_remain;
                        }
                    }};
                }

                if v.done {
                    return None
                }
                let header_raw =
                    match check!(v.file.read(v.off, msize as usize).await) {
                        Some(h) => h,
                        None => _yield!(),
                    };
                v.off += msize as u64;
                // NOTE(review): reinterprets the raw header bytes as
                // `WALRingBlob`; this is only sound if that type has no
                // alignment requirement beyond 1 -- its definition is
                // outside this span, confirm.
                let header = unsafe {
                    std::mem::transmute::<*const u8, &WALRingBlob>(
                        header_raw.as_ptr(),
                    )
                };
                // Copy the fields out so the header outlives `header_raw`.
                let header = WALRingBlob {
                    counter: header.counter,
                    crc32: header.crc32,
                    rsize: header.rsize,
                    rtype: header.rtype,
                };
                let payload;
                match header.rtype.try_into() {
                    Ok(WALRingType::Full) |
                    Ok(WALRingType::First) |
                    Ok(WALRingType::Middle) |
                    Ok(WALRingType::Last) => {
                        payload = if read_payload {
                            Some(check!(check!(
                                v.file.read(v.off, header.rsize as usize).await
                            )
                            .ok_or(())))
                        } else {
                            None
                        };
                        v.off += header.rsize as u64;
                    }
                    Ok(WALRingType::Null) => _yield!(),
                    Err(_) => match recover_policy {
                        RecoverPolicy::Strict => die!(),
                        RecoverPolicy::BestEffort => {
                            v.done = true;
                            return Some((Err(false), ()))
                        }
                    },
                }
                _yield!((header, payload))
            }
        })
    }

    /// Stream the complete records of `file` as
    /// `(payload, ring id, counter)`, verifying the checksum of every ring.
    ///
    /// `chunks` carries a partially assembled multi-ring record across
    /// files: a `First` ring starts it, `Middle` rings extend it and a
    /// `Last` ring completes and yields it. `Middle`/`Last` rings without a
    /// pending `First` are skipped. Error items follow the same convention
    /// as `read_rings` (`Err(true)` hard failure, `Err(false)` best-effort
    /// stop).
    fn read_records<'a, F: WALStore + 'a>(
        &'a self, file: &'a WALFileHandle<'a, F>,
        chunks: &'a mut Option<(Vec<WALBytes>, WALPos)>,
    ) -> impl futures::Stream<Item = Result<(WALBytes, WALRingId, u32), bool>> + 'a
    {
        let fid = file.fid;
        let file_nbit = self.file_nbit;
        let block_size = 1 << self.block_nbit;
        let msize = std::mem::size_of::<WALRingBlob>();

        struct Vars<'a, F: WALStore> {
            done: bool,
            chunks: &'a mut Option<(Vec<WALBytes>, WALPos)>,
            off: u64,
            file: &'a WALFileHandle<'a, F>,
        }

        let vars = std::rc::Rc::new(std::cell::RefCell::new(Vars {
            done: false,
            off: 0,
            chunks,
            file,
        }));

        futures::stream::unfold((), move |_| {
            let v = vars.clone();
            async move {
                let mut v = v.borrow_mut();

                // Unwrap an `Ok`, or terminate the stream with a hard error.
                macro_rules! check {
                    ($res: expr) => {
                        match $res {
                            Ok(t) => t,
                            Err(_) => die!(),
                        }
                    };
                }

                macro_rules! die {
                    () => {{
                        v.done = true;
                        return Some((Err(true), ()))
                    }};
                }

                macro_rules! _yield {
                    () => {{
                        v.done = true;
                        return None
                    }};
                    ($v: expr) => {{
                        let v = $v;
                        catch_up!();
                        return Some((Ok(v), ()))
                    }};
                }

                // Skip the block tail when it cannot hold another header.
                macro_rules! catch_up {
                    () => {{
                        let block_remain =
                            block_size - (v.off & (block_size - 1));
                        if block_remain <= msize as u64 {
                            v.off += block_remain;
                        }
                    }};
                }

                if v.done {
                    return None
                }
                loop {
                    let header_raw = match check!(
                        v.file.read(v.off, msize as usize).await
                    ) {
                        Some(h) => h,
                        None => _yield!(),
                    };
                    let ringid_start = (fid << file_nbit) + v.off;
                    v.off += msize as u64;
                    // NOTE(review): reinterprets the raw header bytes as
                    // `WALRingBlob`; this is only sound if that type has no
                    // alignment requirement beyond 1 -- its definition is
                    // outside this span, confirm.
                    let header = unsafe {
                        std::mem::transmute::<*const u8, &WALRingBlob>(
                            header_raw.as_ptr(),
                        )
                    };
                    let rsize = header.rsize;
                    match header.rtype.try_into() {
                        Ok(WALRingType::Full) => {
                            assert!(v.chunks.is_none());
                            let payload = check!(check!(
                                v.file.read(v.off, rsize as usize).await
                            )
                            .ok_or(()));
                            // TODO: improve the behavior when CRC32 fails
                            if !check!(
                                self.verify_checksum(&payload, header.crc32)
                            ) {
                                die!()
                            }
                            v.off += rsize as u64;
                            _yield!((
                                payload,
                                WALRingId {
                                    start: ringid_start,
                                    end: (fid << file_nbit) + v.off,
                                    counter: header.counter
                                },
                                header.counter
                            ))
                        }
                        Ok(WALRingType::First) => {
                            assert!(v.chunks.is_none());
                            let chunk = check!(check!(
                                v.file.read(v.off, rsize as usize).await
                            )
                            .ok_or(()));
                            if !check!(
                                self.verify_checksum(&chunk, header.crc32)
                            ) {
                                die!()
                            }
                            *v.chunks = Some((vec![chunk], ringid_start));
                            v.off += rsize as u64;
                        }
                        Ok(WALRingType::Middle) => {
                            let Vars {
                                chunks, off, file, ..
                            } = &mut *v;
                            if let Some((chunks, _)) = chunks {
                                let chunk = check!(check!(
                                    file.read(*off, rsize as usize).await
                                )
                                .ok_or(()));
                                if !check!(
                                    self.verify_checksum(&chunk, header.crc32)
                                ) {
                                    die!()
                                }
                                chunks.push(chunk);
                            } // otherwise ignore the leftover
                            *off += rsize as u64;
                        }
                        Ok(WALRingType::Last) => {
                            let v_off = v.off;
                            v.off += rsize as u64;
                            if let Some((mut chunks, ringid_start)) =
                                v.chunks.take()
                            {
                                let chunk = check!(check!(
                                    v.file.read(v_off, rsize as usize).await
                                )
                                .ok_or(()));
                                if !check!(
                                    self.verify_checksum(&chunk, header.crc32)
                                ) {
                                    die!()
                                }
                                chunks.push(chunk);
                                // Concatenate all chunks into one payload.
                                let mut payload = Vec::new();
                                payload.resize(
                                    chunks
                                        .iter()
                                        .fold(0, |acc, v| acc + v.len()),
                                    0,
                                );
                                let mut ps = &mut payload[..];
                                for c in chunks {
                                    ps[..c.len()].copy_from_slice(&*c);
                                    ps = &mut ps[c.len()..];
                                }
                                _yield!((
                                    payload.into_boxed_slice(),
                                    WALRingId {
                                        start: ringid_start,
                                        end: (fid << file_nbit) + v.off,
                                        counter: header.counter,
                                    },
                                    header.counter
                                ))
                            }
                        }
                        Ok(WALRingType::Null) => _yield!(),
                        Err(_) => match self.recover_policy {
                            RecoverPolicy::Strict => die!(),
                            RecoverPolicy::BestEffort => {
                                v.done = true;
                                return Some((Err(false), ()))
                            }
                        },
                    }
                    catch_up!()
                }
            }
        })
    }

    /// Recover by reading the WAL files.
    ///
    /// Steps, in order:
    /// 1. Replay every record found in the files starting from the one
    ///    named by the stored header's `recover_fid`, passing each record
    ///    to `recover_func`.
    /// 2. Scan the visited files backwards for the last `Full`/`Last` ring
    ///    to derive the next record counter.
    /// 3. Write a new header pointing at the file after the last one
    ///    scanned.
    /// 4. Truncate and remove scanned files from the front until the first
    ///    file whose last record is within `keep_nrecords` of the counter;
    ///    that file and all later ones are queued in `pending_removal`.
    /// 5. Return a `WALWriter` that appends from the new file onwards.
    ///
    /// # Errors
    /// Returns `Err(())` on any store failure, on a hard stream error, or
    /// when `recover_func` fails.
    ///
    /// # Panics
    /// Panics unless `file_nbit > block_nbit` and a ring header fits in a
    /// block.
    pub async fn load<
        S: WALStore,
        F: FnMut(WALBytes, WALRingId) -> Result<(), ()>,
    >(
        &self, store: S, mut recover_func: F, keep_nrecords: u32,
    ) -> Result<WALWriter<S>, ()> {
        let msize = std::mem::size_of::<WALRingBlob>();
        assert!(self.file_nbit > self.block_nbit);
        assert!(msize < 1 << self.block_nbit);
        let filename_fmt = regex::Regex::new(FILENAME_FMT).unwrap();
        let mut file_pool = WALFilePool::new(
            store,
            self.file_nbit,
            self.block_nbit,
            self.cache_size,
        )
        .await?;
        let logfiles = sort_fids(
            self.file_nbit,
            file_pool
                .store
                .enumerate_files()?
                .filter(|f| filename_fmt.is_match(f))
                .map(|s| get_fid(&s))
                .collect(),
        );

        let header = file_pool.read_header().await?;

        let mut chunks = None;
        let mut pre_skip = true;
        let mut scanned: Vec<(String, WALFileHandle<S>)> = Vec::new();
        let mut counter = 0;

        // TODO: check for missing logfiles
        'outer: for (_, fid) in logfiles.into_iter() {
            let fname = get_fname(fid);
            let f = file_pool.get_file(fid, false).await?;
            if header.recover_fid == fid {
                pre_skip = false;
            }
            // Files before `recover_fid` are not replayed, only remembered.
            if pre_skip {
                scanned.push((fname, f));
                continue
            }
            {
                let stream = self.read_records(&f, &mut chunks);
                futures::pin_mut!(stream);
                while let Some(res) = stream.next().await {
                    let (bytes, ring_id, _) = match res {
                        Err(e) => {
                            if e {
                                return Err(())
                            } else {
                                break 'outer
                            }
                        }
                        Ok(t) => t,
                    };
                    recover_func(bytes, ring_id)?;
                }
            }
            scanned.push((fname, f));
        }

        // Find the counter of the last complete record.
        'outer: for (_, f) in scanned.iter().rev() {
            let records: Vec<_> = Self::read_rings(
                f,
                false,
                self.block_nbit,
                &self.recover_policy,
            )
            .collect()
            .await;
            for e in records.into_iter().rev() {
                let (rec, _) = e.map_err(|_| ())?;
                if rec.rtype == WALRingType::Full as u8 ||
                    rec.rtype == WALRingType::Last as u8
                {
                    counter = rec.counter + 1;
                    break 'outer
                }
            }
        }

        let fid_mask = (!0) >> self.file_nbit;
        let mut pending_removal = VecDeque::new();
        let recover_fid = match scanned.last() {
            Some((_, f)) => (f.fid + 1) & fid_mask,
            None => 0,
        };

        file_pool.write_header(&Header { recover_fid }).await?;

        // Once one file has to be kept, every later file is kept as well.
        let mut skip_remove = false;
        for (fname, f) in scanned.into_iter() {
            let mut last = None;
            let stream = Self::read_rings(
                &f,
                false,
                self.block_nbit,
                &self.recover_policy,
            );
            futures::pin_mut!(stream);
            while let Some(r) = stream.next().await {
                last = Some(r.map_err(|_| ())?);
            }
            if let Some((last_rec, _)) = last {
                if !counter_lt(last_rec.counter + keep_nrecords, counter) {
                    skip_remove = true;
                }
                if skip_remove {
                    pending_removal.push_back((f.fid, last_rec.counter));
                }
            }
            if !skip_remove {
                f.truncate(0)?;
                file_pool.store.remove_file(fname).await?;
            }
        }

        file_pool.reset();

        let next = recover_fid << self.file_nbit;
        let next_complete = WALRingId {
            start: 0,
            end: next,
            counter,
        };
        Ok(WALWriter::new(
            WALState {
                counter,
                next_complete,
                next,
                file_nbit: self.file_nbit,
                io_complete: BinaryHeap::new(),
                pending_removal,
            },
            file_pool,
        ))
    }
}

/// CRC32 instance used for all record checksums.
pub const CRC32: crc::Crc<u32> = crc::Crc::<u32>::new(&crc::CRC_32_CKSUM);

--------------------------------------------------------------------------------
/tests/common/mod.rs:
--------------------------------------------------------------------------------
#[cfg(test)]
#[allow(dead_code)]
use async_trait::async_trait;
use futures::executor::block_on;
use growthring::wal::{
    WALBytes, WALFile, WALLoader, WALPos, WALRingId, WALStore,
};
use indexmap::{map::Entry, IndexMap};
use rand::Rng;
use std::cell::RefCell;
use std::collections::VecDeque;
use std::collections::{hash_map, HashMap};
use std::convert::TryInto;
use std::rc::Rc;

/// Source of injected failures: each call decides whether the next
/// emulated operation should fail.
pub trait FailGen {
    fn next_fail(&self) -> bool;
}

/// In-memory contents of one emulated file.
struct FileContentEmul(RefCell<Vec<u8>>);

impl FileContentEmul {
    pub fn new() -> Self {
        FileContentEmul(RefCell::new(Vec::new()))
    }
}

impl std::ops::Deref for FileContentEmul {
    type Target = RefCell<Vec<u8>>;
    fn deref(&self) -> &Self::Target {
        &self.0
    }
}

/// Emulate a virtual file handle.
pub struct WALFileEmul<G: FailGen> {
    file: Rc<FileContentEmul>,
    fgen: Rc<G>,
}

#[async_trait(?Send)]
impl<G: FailGen> WALFile for WALFileEmul<G> {
    /// Zero the range `[offset, offset + length)`, growing the file first
    /// if the range extends past its end.
    async fn allocate(&self, offset: WALPos, length: usize) -> Result<(), ()> {
        if self.fgen.next_fail() {
            return Err(())
        }
        let offset = offset as usize;
        if offset + length > self.file.borrow().len() {
            self.file.borrow_mut().resize(offset + length, 0)
        }
        self.file.borrow_mut()[offset..offset + length].fill(0);
        Ok(())
    }

    /// Resize the file to exactly `length` bytes (zero-filled if grown).
    fn truncate(&self, length: usize) -> Result<(), ()> {
        if self.fgen.next_fail() {
            return Err(())
        }
        self.file.borrow_mut().resize(length, 0);
        Ok(())
    }

    /// Overwrite the bytes at `offset` with `data`.
    ///
    /// Panics if the range is not already inside the file.
    async fn write(&self, offset: WALPos, data: WALBytes) -> Result<(), ()> {
        if self.fgen.next_fail() {
            return Err(())
        }
        let offset = offset as usize;
        self.file.borrow_mut()[offset..offset + data.len()]
            .copy_from_slice(&data);
        Ok(())
    }

    /// Read `length` bytes at `offset`; `Ok(None)` if the range runs past
    /// the end of the file.
    async fn read(
        &self,
        offset: WALPos,
        length: usize,
    ) -> Result<Option<WALBytes>, ()> {
        if self.fgen.next_fail() {
            return Err(())
        }

        let offset = offset as usize;
        let file = self.file.borrow();
        if offset + length > file.len() {
            Ok(None)
        } else {
            Ok(Some(
                file[offset..offset + length].to_vec().into_boxed_slice(),
            ))
        }
    }
}

/// The set of emulated files, keyed by file name.
pub struct WALStoreEmulState {
    files: HashMap<String, Rc<FileContentEmul>>,
}

impl WALStoreEmulState {
    pub fn new() -> Self {
        WALStoreEmulState {
            files: HashMap::new(),
        }
    }
    /// Copy the file table. The `Rc` handles are cloned, so the copy shares
    /// the contents of existing files with the original.
    pub fn clone(&self) -> Self {
        WALStoreEmulState {
            files: self.files.clone(),
        }
    }
}
112 | 113 | /// Emulate the persistent storage state. 114 | pub struct WALStoreEmul<'a, G> 115 | where 116 | G: FailGen, 117 | { 118 | state: RefCell<&'a mut WALStoreEmulState>, 119 | fgen: Rc, 120 | } 121 | 122 | impl<'a, G: FailGen> WALStoreEmul<'a, G> { 123 | pub fn new(state: &'a mut WALStoreEmulState, fgen: Rc) -> Self { 124 | let state = RefCell::new(state); 125 | WALStoreEmul { state, fgen } 126 | } 127 | } 128 | 129 | #[async_trait(?Send)] 130 | impl<'a, G> WALStore for WALStoreEmul<'a, G> 131 | where 132 | G: 'static + FailGen, 133 | { 134 | type FileNameIter = std::vec::IntoIter; 135 | 136 | async fn open_file( 137 | &self, 138 | filename: &str, 139 | touch: bool, 140 | ) -> Result, ()> { 141 | if self.fgen.next_fail() { 142 | return Err(()) 143 | } 144 | match self.state.borrow_mut().files.entry(filename.to_string()) { 145 | hash_map::Entry::Occupied(e) => Ok(Box::new(WALFileEmul { 146 | file: e.get().clone(), 147 | fgen: self.fgen.clone(), 148 | })), 149 | hash_map::Entry::Vacant(e) => { 150 | if touch { 151 | Ok(Box::new(WALFileEmul { 152 | file: e.insert(Rc::new(FileContentEmul::new())).clone(), 153 | fgen: self.fgen.clone(), 154 | })) 155 | } else { 156 | Err(()) 157 | } 158 | } 159 | } 160 | } 161 | 162 | async fn remove_file(&self, filename: String) -> Result<(), ()> { 163 | //println!("remove_file(filename={})", filename); 164 | if self.fgen.next_fail() { 165 | return Err(()) 166 | } 167 | self.state 168 | .borrow_mut() 169 | .files 170 | .remove(&filename) 171 | .ok_or(()) 172 | .and_then(|_| Ok(())) 173 | } 174 | 175 | fn enumerate_files(&self) -> Result { 176 | if self.fgen.next_fail() { 177 | return Err(()) 178 | } 179 | let mut logfiles = Vec::new(); 180 | for (fname, _) in self.state.borrow().files.iter() { 181 | logfiles.push(fname.clone()) 182 | } 183 | Ok(logfiles.into_iter()) 184 | } 185 | } 186 | 187 | pub struct SingleFailGen { 188 | cnt: std::cell::Cell, 189 | fail_point: usize, 190 | } 191 | 192 | impl SingleFailGen { 193 | pub fn 
new(fail_point: usize) -> Self { 194 | SingleFailGen { 195 | cnt: std::cell::Cell::new(0), 196 | fail_point, 197 | } 198 | } 199 | } 200 | 201 | impl FailGen for SingleFailGen { 202 | fn next_fail(&self) -> bool { 203 | let c = self.cnt.get(); 204 | self.cnt.set(c + 1); 205 | c == self.fail_point 206 | } 207 | } 208 | 209 | pub struct ZeroFailGen; 210 | 211 | impl FailGen for ZeroFailGen { 212 | fn next_fail(&self) -> bool { 213 | false 214 | } 215 | } 216 | 217 | pub struct CountFailGen(std::cell::Cell); 218 | 219 | impl CountFailGen { 220 | pub fn new() -> Self { 221 | CountFailGen(std::cell::Cell::new(0)) 222 | } 223 | pub fn get_count(&self) -> usize { 224 | self.0.get() 225 | } 226 | } 227 | 228 | impl FailGen for CountFailGen { 229 | fn next_fail(&self) -> bool { 230 | self.0.set(self.0.get() + 1); 231 | false 232 | } 233 | } 234 | 235 | /// An ordered list of intervals: `(begin, end, color)*`. 236 | #[derive(Clone)] 237 | pub struct PaintStrokes(Vec<(u32, u32, u32)>); 238 | 239 | impl PaintStrokes { 240 | pub fn new() -> Self { 241 | PaintStrokes(Vec::new()) 242 | } 243 | 244 | pub fn to_bytes(&self) -> WALBytes { 245 | let mut res: Vec = Vec::new(); 246 | let is = std::mem::size_of::(); 247 | let len = self.0.len() as u32; 248 | res.resize(is * (1 + 3 * self.0.len()), 0); 249 | let mut rs = &mut res[..]; 250 | rs[..is].copy_from_slice(&len.to_le_bytes()); 251 | rs = &mut rs[is..]; 252 | for (s, e, c) in self.0.iter() { 253 | rs[..is].copy_from_slice(&s.to_le_bytes()); 254 | rs[is..is * 2].copy_from_slice(&e.to_le_bytes()); 255 | rs[is * 2..is * 3].copy_from_slice(&c.to_le_bytes()); 256 | rs = &mut rs[is * 3..]; 257 | } 258 | res.into_boxed_slice() 259 | } 260 | 261 | pub fn from_bytes(raw: &[u8]) -> Self { 262 | assert!(raw.len() > 4); 263 | assert!(raw.len() & 3 == 0); 264 | let is = std::mem::size_of::(); 265 | let (len_raw, mut rest) = raw.split_at(is); 266 | let len = u32::from_le_bytes(len_raw.try_into().unwrap()); 267 | let mut res = Vec::new(); 268 | 
for _ in 0..len { 269 | let (s_raw, rest1) = rest.split_at(is); 270 | let (e_raw, rest2) = rest1.split_at(is); 271 | let (c_raw, rest3) = rest2.split_at(is); 272 | res.push(( 273 | u32::from_le_bytes(s_raw.try_into().unwrap()), 274 | u32::from_le_bytes(e_raw.try_into().unwrap()), 275 | u32::from_le_bytes(c_raw.try_into().unwrap()), 276 | )); 277 | rest = rest3 278 | } 279 | PaintStrokes(res) 280 | } 281 | 282 | pub fn gen_rand( 283 | max_pos: u32, 284 | max_len: u32, 285 | max_col: u32, 286 | n: usize, 287 | rng: &mut R, 288 | ) -> PaintStrokes { 289 | assert!(max_pos > 0); 290 | let mut strokes = Self::new(); 291 | for _ in 0..n { 292 | let pos = rng.gen_range(0..max_pos); 293 | let len = 294 | rng.gen_range(1..std::cmp::min(max_len, max_pos - pos + 1)); 295 | strokes.stroke(pos, pos + len, rng.gen_range(0..max_col)) 296 | } 297 | strokes 298 | } 299 | 300 | pub fn stroke(&mut self, start: u32, end: u32, color: u32) { 301 | self.0.push((start, end, color)) 302 | } 303 | 304 | pub fn into_vec(self) -> Vec<(u32, u32, u32)> { 305 | self.0 306 | } 307 | } 308 | 309 | impl growthring::wal::Record for PaintStrokes { 310 | fn serialize(&self) -> WALBytes { 311 | self.to_bytes() 312 | } 313 | } 314 | 315 | #[test] 316 | fn test_paint_strokes() { 317 | let mut p = PaintStrokes::new(); 318 | for i in 0..3 { 319 | p.stroke(i, i + 3, i + 10) 320 | } 321 | let pr = p.to_bytes(); 322 | for ((s, e, c), i) in PaintStrokes::from_bytes(&pr) 323 | .into_vec() 324 | .into_iter() 325 | .zip(0..) 
326 | { 327 | assert_eq!(s, i); 328 | assert_eq!(e, i + 3); 329 | assert_eq!(c, i + 10); 330 | } 331 | } 332 | 333 | pub struct Canvas { 334 | waiting: HashMap, 335 | queue: IndexMap>, 336 | canvas: Box<[u32]>, 337 | } 338 | 339 | impl Canvas { 340 | pub fn new(size: usize) -> Self { 341 | let mut canvas = Vec::new(); 342 | // fill the background color 0 343 | canvas.resize(size, 0); 344 | let canvas = canvas.into_boxed_slice(); 345 | Canvas { 346 | waiting: HashMap::new(), 347 | queue: IndexMap::new(), 348 | canvas, 349 | } 350 | } 351 | 352 | pub fn new_reference(&self, ops: &[PaintStrokes]) -> Self { 353 | let mut res = Self::new(self.canvas.len()); 354 | for op in ops { 355 | for (s, e, c) in op.0.iter() { 356 | for i in *s..*e { 357 | res.canvas[i as usize] = *c 358 | } 359 | } 360 | } 361 | res 362 | } 363 | 364 | fn get_waiting(&mut self, rid: WALRingId) -> &mut usize { 365 | match self.waiting.entry(rid) { 366 | hash_map::Entry::Occupied(e) => e.into_mut(), 367 | hash_map::Entry::Vacant(e) => e.insert(0), 368 | } 369 | } 370 | 371 | fn get_queued(&mut self, pos: u32) -> &mut VecDeque<(u32, WALRingId)> { 372 | match self.queue.entry(pos) { 373 | Entry::Occupied(e) => e.into_mut(), 374 | Entry::Vacant(e) => e.insert(VecDeque::new()), 375 | } 376 | } 377 | 378 | pub fn prepaint(&mut self, strokes: &PaintStrokes, rid: &WALRingId) { 379 | let rid = *rid; 380 | let mut nwait = 0; 381 | for (s, e, c) in strokes.0.iter() { 382 | for i in *s..*e { 383 | nwait += 1; 384 | self.get_queued(i).push_back((*c, rid)) 385 | } 386 | } 387 | *self.get_waiting(rid) = nwait 388 | } 389 | 390 | // TODO: allow customized scheduler 391 | /// Schedule to paint one position, randomly. It optionally returns a finished batch write 392 | /// identified by its start position of WALRingId. 
// NOTE: the methods below continue the `impl Canvas` block opened before this span.
    pub fn rand_paint<R: rand::Rng>(
        &mut self,
        rng: &mut R,
    ) -> Option<(Option<WALRingId>, u32)> {
        if self.is_empty() {
            return None
        }
        let idx = rng.gen_range(0..self.queue.len());
        let (pos, _) = self.queue.get_index(idx).unwrap();
        let pos = *pos;
        Some((self.paint(pos), pos))
    }

    /// Drop all queued paints and outstanding-cell counts.
    pub fn clear_queued(&mut self) {
        self.queue.clear();
        self.waiting.clear();
    }

    /// Apply, for every queued position, only the most recently queued
    /// color, then clear the queue.
    pub fn paint_all(&mut self) {
        for (pos, q) in self.queue.iter() {
            self.canvas[*pos as usize] = q.back().unwrap().0;
        }
        self.clear_queued()
    }

    pub fn is_empty(&self) -> bool {
        self.queue.is_empty()
    }

    /// Apply the oldest queued paint at `pos`.
    ///
    /// Returns the record id when this was the last outstanding cell of
    /// that record, `None` otherwise.
    ///
    /// # Panics
    /// Panics if nothing is queued at `pos`.
    pub fn paint(&mut self, pos: u32) -> Option<WALRingId> {
        let q = self.queue.get_mut(&pos).unwrap();
        let (c, rid) = q.pop_front().unwrap();
        if q.is_empty() {
            self.queue.swap_remove(&pos);
        }
        self.canvas[pos as usize] = c;
        if let Some(cnt) = self.waiting.get_mut(&rid) {
            *cnt -= 1;
            if *cnt == 0 {
                self.waiting.remove(&rid);
                Some(rid)
            } else {
                None
            }
        } else {
            None
        }
    }

    /// Whether both canvases have the same size and cell colors.
    pub fn is_same(&self, other: &Canvas) -> bool {
        self.canvas == other.canvas
    }

    /// Print the low byte of every cell in hex, `max_col` cells per row.
    pub fn print(&self, max_col: usize) {
        println!("# begin canvas");
        for r in self.canvas.chunks(max_col) {
            for c in r.iter() {
                print!("{:02x} ", c & 0xff);
            }
            println!();
        }
        println!("# end canvas");
    }
}

#[test]
fn test_canvas() {
    // NOTE(review): the concrete RNG type was lost in extraction; `StdRng`
    // is assumed here -- confirm against the repository.
    let mut rng = <rand::rngs::StdRng as rand::SeedableRng>::seed_from_u64(42);
    let mut canvas1 = Canvas::new(100);
    let mut canvas2 = Canvas::new(100);
    let canvas3 = Canvas::new(101);
    let dummy = WALRingId::empty_id();
    let s1 = PaintStrokes::gen_rand(100, 10, 256, 2, &mut rng);
    let s2 = PaintStrokes::gen_rand(100, 10, 256, 2, &mut rng);
    assert!(canvas1.is_same(&canvas2));
    assert!(!canvas2.is_same(&canvas3));
    canvas1.prepaint(&s1, &dummy);
    canvas1.prepaint(&s2, &dummy);
    canvas2.prepaint(&s1, &dummy);
    canvas2.prepaint(&s2, &dummy);
    assert!(canvas1.is_same(&canvas2));
    canvas1.rand_paint(&mut rng);
    assert!(!canvas1.is_same(&canvas2));
    while canvas1.rand_paint(&mut rng).is_some() {}
    while canvas2.rand_paint(&mut rng).is_some() {}
    assert!(canvas1.is_same(&canvas2));
    canvas1.print(10);
}

pub struct PaintingSim {
    pub block_nbit: u64,
    pub file_nbit: u64,
    pub file_cache: usize,
    /// number of PaintStrokes (WriteBatch)
    pub n: usize,
    /// number of strokes per PaintStrokes
    pub m: usize,
    /// number of scheduled ticks per PaintStroke submission
    pub k: usize,
    /// the size of canvas
    pub csize: usize,
    /// max length of a single stroke
    pub stroke_max_len: u32,
    /// max color value
    pub stroke_max_col: u32,
    /// max number of strokes per PaintStroke
    pub stroke_max_n: usize,
    /// random seed
    pub seed: u64,
}

impl PaintingSim {
    /// Run the simulation against `state`, consulting `fgen` before every
    /// emulated operation.
    ///
    /// For each of the `n` rounds: generate `m` random `PaintStrokes`,
    /// append them to the WAL, record them in `ops` / `ringid_map`, queue
    /// them on `canvas`, then run a random number of paint ticks (fewer
    /// than `k`), peeling every record whose cells are all painted.
    ///
    /// # Errors
    /// Returns `Err(())` as soon as any step fails, leaving `state`, `ops`
    /// and `ringid_map` as they were at the point of failure.
    pub fn run<G: 'static + FailGen>(
        &self,
        state: &mut WALStoreEmulState,
        canvas: &mut Canvas,
        loader: WALLoader,
        ops: &mut Vec<PaintStrokes>,
        ringid_map: &mut HashMap<WALRingId, usize>,
        fgen: Rc<G>,
    ) -> Result<(), ()> {
        // NOTE(review): the concrete RNG type was lost in extraction;
        // `StdRng` is assumed here -- confirm against the repository.
        let mut rng =
            <rand::rngs::StdRng as rand::SeedableRng>::seed_from_u64(self.seed);
        let mut wal = block_on(loader.load(
            WALStoreEmul::new(state, fgen.clone()),
            |_, _| {
                if fgen.next_fail() {
                    Err(())
                } else {
                    Ok(())
                }
            },
            0,
        ))?;
        for _ in 0..self.n {
            let pss = (0..self.m)
                .map(|_| {
                    PaintStrokes::gen_rand(
                        self.csize as u32,
                        self.stroke_max_len,
                        self.stroke_max_col,
                        rng.gen_range(1..self.stroke_max_n + 1),
                        &mut rng,
                    )
                })
                .collect::<Vec<PaintStrokes>>();
            let pss_ = pss.clone();
            // write ahead
            let rids = wal.grow(pss);
            assert_eq!(rids.len(), self.m);
            let recs = rids
                .into_iter()
                .zip(pss_.into_iter())
                .map(|(r, ps)| -> Result<_, _> {
                    ops.push(ps);
                    let (rec, rid) = futures::executor::block_on(r)?;
                    ringid_map.insert(rid, ops.len() - 1);
                    Ok((rec, rid))
                })
                .collect::<Result<Vec<_>, ()>>()?;
            // finish appending to WAL
            /*
            for rid in rids.iter() {
                println!("got ringid: {:?}", rid);
            }
            */
            // prepare data writes
            for (ps, rid) in recs.into_iter() {
                canvas.prepaint(&ps, &rid);
            }
            // run k ticks of the fine-grained scheduler
            for _ in 0..rng.gen_range(1..self.k) {
                // storage I/O could fail
                if fgen.next_fail() {
                    return Err(())
                }
                match canvas.rand_paint(&mut rng) {
                    Some((Some(rid), _)) => {
                        futures::executor::block_on(wal.peel(&[rid], 0))?
                    }
                    Some((None, _)) => {}
                    None => break,
                }
            }
        }
        //canvas.print(40);
        assert_eq!(wal.file_pool_in_use(), 0);
        Ok(())
    }

    /// Build a loader configured with this simulation's parameters.
    pub fn get_walloader(&self) -> WALLoader {
        let mut loader = WALLoader::new();
        loader
            .file_nbit(self.file_nbit)
            .block_nbit(self.block_nbit)
            .cache_size(std::num::NonZeroUsize::new(self.file_cache).unwrap());
        loader
    }

    /// Run the simulation without failures and return how many times the
    /// failure generator was consulted.
    pub fn get_nticks(&self, state: &mut WALStoreEmulState) -> usize {
        let mut canvas = Canvas::new(self.csize);
        let mut ops: Vec<PaintStrokes> = Vec::new();
        let mut ringid_map = HashMap::new();
        let fgen = Rc::new(CountFailGen::new());
        self.run(
            state,
            &mut canvas,
            self.get_walloader(),
            &mut ops,
            &mut ringid_map,
            fgen.clone(),
        )
        .unwrap();
        fgen.get_count()
    }

    /// Recover from `state` onto `canvas` and check that the result matches
    /// a reference canvas built from a prefix of `ops`.
    ///
    /// When recovery replays at least one record, the reference is `ops`
    /// up to and including the last replayed record. When nothing is
    /// replayed, the canvas must equal the blank canvas or `ops` with the
    /// last batch of `m` operations applied fully, partially (as a prefix),
    /// or not at all. On mismatch both canvases are printed.
    pub fn check(
        &self,
        state: &mut WALStoreEmulState,
        canvas: &mut Canvas,
        wal: WALLoader,
        ops: &Vec<PaintStrokes>,
        ringid_map: &HashMap<WALRingId, usize>,
    ) -> bool {
        if ops.is_empty() {
            return true
        }
        let mut last_idx = 0;
        let mut napplied = 0;
        canvas.clear_queued();
        block_on(wal.load(
            WALStoreEmul::new(state, Rc::new(ZeroFailGen)),
            |payload, ringid| {
                let s = PaintStrokes::from_bytes(&payload);
                canvas.prepaint(&s, &ringid);
                last_idx = *ringid_map.get(&ringid).unwrap() + 1;
                napplied += 1;
                Ok(())
            },
            0,
        ))
        .unwrap();
        println!("last = {}/{}, applied = {}", last_idx, ops.len(), napplied);
        canvas.paint_all();
        // recover complete
        let canvas0 = if last_idx > 0 {
            let canvas0 = canvas.new_reference(&ops[..last_idx]);
            if canvas.is_same(&canvas0) {
                None
            } else {
                Some(canvas0)
            }
        } else {
            let canvas0 = canvas.new_reference(&[]);
            if canvas.is_same(&canvas0) {
                None
            } else {
                // `saturating_sub` guards against fewer than `m` recorded
                // operations instead of underflowing.
                let i0 = ops.len().saturating_sub(self.m);
                let mut canvas0 = canvas0.new_reference(&ops[..i0]);
                let mut res = None;
                'outer: loop {
                    if canvas.is_same(&canvas0) {
                        break
                    }
                    for op in &ops[i0..] {
                        canvas0.prepaint(op, &WALRingId::empty_id());
                        canvas0.paint_all();
                        if canvas.is_same(&canvas0) {
                            break 'outer
                        }
                    }
                    res = Some(canvas0);
                    break
                }
                res
            }
        };
        if let Some(canvas0) = canvas0 {
            canvas.print(40);
            canvas0.print(40);
            false
        } else {
            true
        }
    }

    pub fn new_canvas(&self) -> Canvas {
        Canvas::new(self.csize)
    }
}

--------------------------------------------------------------------------------
/tests/rand_fail.rs:
--------------------------------------------------------------------------------
#[cfg(test)] mod common;

use std::collections::HashMap;
use std::rc::Rc;

/// Inject one failure per simulation in `sims`, trying every possible
/// failure point of the first simulation and recursing into the remaining
/// ones from the state each failure leaves behind. After the last
/// simulation fails, recovery is checked against the recorded operations.
fn _multi_point_failure(
    sims: &[common::PaintingSim],
    state: &common::WALStoreEmulState,
    f: usize,
) {
    let sim = &sims[0];
    // save the current state and start from there
    let mut state = state.clone();
    let mut state0 = state.clone();
    let nticks = sim.get_nticks(&mut state0);
    println!("fail = {}, nticks = {}", f, nticks);
    for i in 0..nticks {
        println!("fail = {}, pos = {}", f, i);
        let mut canvas = sim.new_canvas();
        let mut ops: Vec<common::PaintStrokes> = Vec::new();
        let mut ringid_map = HashMap::new();
        let fgen = common::SingleFailGen::new(i);
        if sim
            .run(
                &mut state,
                &mut canvas,
                sim.get_walloader(),
                &mut ops,
                &mut ringid_map,
                Rc::new(fgen),
            )
            .is_err()
        {
            if sims.len() > 1 {
                _multi_point_failure(&sims[1..], &state, f + 1)
            } else {
                assert!(sim.check(
                    &mut state,
                    &mut canvas,
                    sim.get_walloader(),
                    &ops,
                    &ringid_map,
                ))
            }
        }
    }
}

fn multi_point_failure(sims: &[common::PaintingSim]) {
    _multi_point_failure(sims, &common::WALStoreEmulState::new(), 1);
}

#[test]
fn single_point_failure1() {
    let sim = common::PaintingSim {
        block_nbit: 5,
        file_nbit: 6,
        file_cache: 1000,
        n: 100,
        m: 10,
        k: 1000,
        csize: 1000,
        stroke_max_len: 10,
        stroke_max_col: 256,
        stroke_max_n: 5,
        seed: 0,
    };
    multi_point_failure(&[sim]);
}

#[test]
fn two_failures() {
    let sims = [
        common::PaintingSim {
            block_nbit: 5,
            file_nbit: 6,
            file_cache: 1000,
            n: 10,
            m: 5,
            k: 100,
            csize: 1000,
            stroke_max_len: 10,
            stroke_max_col: 256,
            stroke_max_n: 3,
            seed: 0,
        },
        common::PaintingSim {
            block_nbit: 5,
            file_nbit: 6,
            file_cache: 1000,
            n: 10,
            m: 5,
            k: 100,
            csize: 1000,
            stroke_max_len: 10,
            stroke_max_col: 256,
            stroke_max_n: 3,
            seed: 0,
        },
    ];
    multi_point_failure(&sims);
}
--------------------------------------------------------------------------------