├── .github └── workflows │ └── rust.yml ├── .gitignore ├── Cargo.lock ├── Cargo.toml ├── README.md ├── rustfmt.toml └── src ├── database.rs ├── lib.rs ├── mem_table.rs ├── utils.rs ├── wal.rs └── wal_iterator.rs /.github/workflows/rust.yml: -------------------------------------------------------------------------------- 1 | name: Rust 2 | 3 | on: 4 | push: 5 | branches: [ "master" ] 6 | pull_request: 7 | branches: [ "master" ] 8 | 9 | env: 10 | CARGO_TERM_COLOR: always 11 | 12 | jobs: 13 | build: 14 | 15 | runs-on: ubuntu-latest 16 | 17 | steps: 18 | - uses: actions/checkout@v3 19 | - name: Build 20 | run: cargo build --verbose 21 | - name: Run tests 22 | run: cargo test --verbose 23 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | /target 2 | /data -------------------------------------------------------------------------------- /Cargo.lock: -------------------------------------------------------------------------------- 1 | # This file is automatically @generated by Cargo. 2 | # It is not intended for manual editing. 3 | version = 3 4 | 5 | [[package]] 6 | name = "cfg-if" 7 | version = "1.0.0" 8 | source = "registry+https://github.com/rust-lang/crates.io-index" 9 | checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" 10 | 11 | [[package]] 12 | name = "database-engine" 13 | version = "0.1.0" 14 | dependencies = [ 15 | "rand", 16 | ] 17 | 18 | [[package]] 19 | name = "getrandom" 20 | version = "0.1.16" 21 | source = "registry+https://github.com/rust-lang/crates.io-index" 22 | checksum = "8fc3cb4d91f53b50155bdcfd23f6a4c39ae1969c2ae85982b135750cccaf5fce" 23 | dependencies = [ 24 | "cfg-if", 25 | "libc", 26 | "wasi", 27 | ] 28 | 29 | [[package]] 30 | name = "libc" 31 | version = "0.2.146" 32 | source = "registry+https://github.com/rust-lang/crates.io-index" 33 | checksum = "f92be4933c13fd498862a9e02a3055f8a8d9c039ce33db97306fd5a6caa7f29b" 34 | 35 | [[package]] 36 | name = "ppv-lite86" 37 | version = "0.2.17" 38 | source = "registry+https://github.com/rust-lang/crates.io-index" 39 | checksum = "5b40af805b3121feab8a3c29f04d8ad262fa8e0561883e7653e024ae4479e6de" 40 | 41 | [[package]] 42 | name = "rand" 43 | version = "0.7.3" 44 | source = "registry+https://github.com/rust-lang/crates.io-index" 45 | checksum = "6a6b1679d49b24bbfe0c803429aa1874472f50d9b363131f0e89fc356b544d03" 46 | dependencies = [ 47 | "getrandom", 48 | "libc", 49 | "rand_chacha", 50 | "rand_core", 51 | "rand_hc", 52 | ] 53 | 54 | [[package]] 55 | name = "rand_chacha" 56 | version = "0.2.2" 57 | source = "registry+https://github.com/rust-lang/crates.io-index" 58 | checksum = "f4c8ed856279c9737206bf725bf36935d8666ead7aa69b52be55af369d193402" 59 | dependencies = [ 60 | "ppv-lite86", 61 | "rand_core", 62 | ] 63 | 64 | [[package]] 65 | name = "rand_core" 66 | version = "0.5.1" 67 | source = "registry+https://github.com/rust-lang/crates.io-index" 68 | checksum = "90bde5296fc891b0cef12a6d03ddccc162ce7b2aff54160af9338f8d40df6d19" 69 | dependencies = [ 70 | "getrandom", 71 | ] 72 | 73 | [[package]] 74 | name = "rand_hc" 75 | version = "0.2.0" 76 | source = "registry+https://github.com/rust-lang/crates.io-index" 77 | checksum = "ca3129af7b92a17112d59ad498c6f81eaf463253766b90396d39ea7a39d6613c" 78 | dependencies = [ 79 | "rand_core", 80 | ] 81 | 82 | [[package]] 83 | name = "wasi" 84 | version = "0.9.0+wasi-snapshot-preview1" 85 | source = "registry+https://github.com/rust-lang/crates.io-index" 86 | checksum = "cccddf32554fecc6acb585f82a32a72e28b48f8c4c1883ddfeeeaa96f7d8e519" 87 | -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "database-engine" 3 | version = "0.1.0" 4 | authors = ["Adam Comer "] 5 | edition = "2018" 6 | 7 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html 8 | 9 | [dependencies] 10 | rand = "0.7.3" 11 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # How to Build a Simple Database 2 | 3 | Databases are an integral part to software development and to better understand how they work, I made my own. In this project, I built a LSM-Tree Key-Value Store based on [RocksDB](https://github.com/facebook/rocksdb) in Rust. 4 | 5 | ## Documentation 6 | 7 | While building this database, I wrote easy to understand articles explaining how each component works and fits into the larger project. Knowledge of database storage engines is very scattered online, and I spent several months reading papers, documentation, and code to get a basic understanding. I made this guide for my beginner self who was often lost trying to piece together how a database worked. 8 | 9 | - [Build a Database Pt. 1: Motivation & Design](https://adambcomer.com/blog/simple-database/motivation-design/) 10 | - [Build a Database Pt. 2: MemTable](https://adambcomer.com/blog/simple-database/memtable/) 11 | - [Build a Database Pt. 3: Write Ahead Log(WAL)](https://adambcomer.com/blog/simple-database/wal/) 12 | - Build a Database Pt. 4: SSTable 13 | - Build a Database Pt. 5: Compaction 14 | - Build a Database Pt. 6: Putting it Together 15 | - Build a Database Pt. 7: Using the Database 16 | 17 | ## Tests 18 | 19 | Running tests is very simple with cargo. 20 | 21 | ```shell 22 | cargo test 23 | ``` 24 | -------------------------------------------------------------------------------- /rustfmt.toml: -------------------------------------------------------------------------------- 1 | tab_spaces = 2 -------------------------------------------------------------------------------- /src/database.rs: -------------------------------------------------------------------------------- 1 | use crate::mem_table::MemTable; 2 | use crate::wal::WAL; 3 | use std::path::PathBuf; 4 | use std::time::{SystemTime, UNIX_EPOCH}; 5 | 6 | #[derive(Debug)] 7 | pub struct DatabaseEntry { 8 | key: Vec, 9 | value: Vec, 10 | timestamp: u128, 11 | } 12 | 13 | impl DatabaseEntry { 14 | pub fn key(&self) -> &[u8] { 15 | &self.key 16 | } 17 | 18 | pub fn value(&self) -> &[u8] { 19 | &self.value 20 | } 21 | 22 | pub fn timestamp(&self) -> u128 { 23 | self.timestamp 24 | } 25 | } 26 | 27 | pub struct Database { 28 | dir: PathBuf, 29 | mem_table: MemTable, 30 | wal: WAL, 31 | } 32 | 33 | impl Database { 34 | pub fn new(dir: &str) -> Database { 35 | let dir = PathBuf::from(dir); 36 | 37 | let (wal, mem_table) = WAL::load_from_dir(&dir).unwrap(); 38 | 39 | Database { 40 | dir: dir, 41 | mem_table, 42 | wal, 43 | } 44 | } 45 | 46 | pub fn get(&self, key: &[u8]) -> Option { 47 | if let Some(mem_entry) = self.mem_table.get(key) { 48 | return Some(DatabaseEntry { 49 | key: mem_entry.key.clone(), 50 | value: mem_entry.value.as_ref().unwrap().clone(), 51 | timestamp: mem_entry.timestamp, 52 | }); 53 | } 54 | 55 | None 56 | } 57 | 58 | pub fn set(&mut self, key: &[u8], value: &[u8]) -> Result { 59 | let timestamp = SystemTime::now() 60 | .duration_since(UNIX_EPOCH) 61 | .unwrap() 62 | .as_micros(); 63 | 64 | let wal_res = self.wal.set(key, value, timestamp); 65 | if wal_res.is_err() { 66 | return Err(0); 67 | } 68 | if self.wal.flush().is_err() { 69 | return Err(0); 70 | } 71 | 72 | self.mem_table.set(key, value, timestamp); 73 | 74 | Ok(1) 75 | } 76 | 77 | pub fn delete(&mut self, key: &[u8]) -> Result { 78 | let timestamp = SystemTime::now() 79 | .duration_since(UNIX_EPOCH) 80 | .unwrap() 81 | .as_micros(); 82 | 83 | let wal_res = self.wal.delete(key, timestamp); 84 | if wal_res.is_err() { 85 | return Err(0); 86 | } 87 | if self.wal.flush().is_err() { 88 | return Err(0); 89 | } 90 | 91 | self.mem_table.delete(key, timestamp); 92 | 93 | Ok(1) 94 | } 95 | } 96 | -------------------------------------------------------------------------------- /src/lib.rs: -------------------------------------------------------------------------------- 1 | pub mod database; 2 | mod mem_table; 3 | mod utils; 4 | mod wal; 5 | mod wal_iterator; 6 | 7 | // mod table; 8 | // mod table_manager; 9 | // mod utils; 10 | -------------------------------------------------------------------------------- /src/mem_table.rs: -------------------------------------------------------------------------------- 1 | /// MemTable entry. 2 | pub struct MemTableEntry { 3 | pub key: Vec, 4 | pub value: Option>, 5 | pub timestamp: u128, 6 | pub deleted: bool, 7 | } 8 | 9 | /// MemTable holds a sorted list of the latest written records. 10 | /// 11 | /// Writes are duplicated to the WAL for recovery of the MemTable in the event of a restart. 12 | /// 13 | /// MemTables have a max capacity and when that is reached, we flush the MemTable 14 | /// to disk as a Table(SSTable). 15 | /// 16 | /// Entries are stored in a Vector instead of a HashMap to support Scans. 17 | pub struct MemTable { 18 | entries: Vec, 19 | size: usize, 20 | } 21 | 22 | impl MemTable { 23 | /// Creates a new empty MemTable 24 | pub fn new() -> MemTable { 25 | MemTable { 26 | entries: Vec::new(), 27 | size: 0, 28 | } 29 | } 30 | 31 | /// Sets a Key-Value pair in the MemTable. 32 | pub fn set(&mut self, key: &[u8], value: &[u8], timestamp: u128) { 33 | let entry = MemTableEntry { 34 | key: key.to_owned(), 35 | value: Some(value.to_owned()), 36 | timestamp, 37 | deleted: false, 38 | }; 39 | 40 | match self.get_index(key) { 41 | Ok(idx) => { 42 | // If a Value existed on the deleted record, then add the difference of the new and old Value to the MemTable's size. 43 | if let Some(v) = self.entries[idx].value.as_ref() { 44 | if value.len() < v.len() { 45 | self.size -= v.len() - value.len(); 46 | } else { 47 | self.size += value.len() - v.len(); 48 | } 49 | } else { 50 | self.size += value.len(); 51 | } 52 | self.entries[idx] = entry; 53 | } 54 | Err(idx) => { 55 | self.size += key.len() + value.len() + 16 + 1; // Increase the size of the MemTable by the Key size, Value size, Timestamp size (16 bytes), Tombstone size (1 byte). 56 | self.entries.insert(idx, entry) 57 | } 58 | } 59 | } 60 | 61 | /// Deletes a Key-Value pair in the MemTable. 62 | /// 63 | /// This is achieved using tombstones. 64 | pub fn delete(&mut self, key: &[u8], timestamp: u128) { 65 | let entry = MemTableEntry { 66 | key: key.to_owned(), 67 | value: None, 68 | timestamp: timestamp, 69 | deleted: true, 70 | }; 71 | match self.get_index(key) { 72 | Ok(idx) => { 73 | // If a Value existed on the deleted record, then subtract the size of the Value from the MemTable. 74 | if let Some(value) = self.entries[idx].value.as_ref() { 75 | self.size -= value.len(); 76 | } 77 | self.entries[idx] = entry; 78 | } 79 | Err(idx) => { 80 | self.size += key.len() + 16 + 1; // Increase the size of the MemTable by the Key size, Timestamp size (16 bytes), Tombstone size (1 byte). 81 | self.entries.insert(idx, entry); 82 | } 83 | } 84 | } 85 | 86 | /// Gets a Key-Value pair from the MemTable.alloc 87 | /// 88 | /// If no record with the same key exists in the MemTable, return None. 89 | pub fn get(&self, key: &[u8]) -> Option<&MemTableEntry> { 90 | if let Ok(idx) = self.get_index(key) { 91 | return Some(&self.entries[idx]); 92 | } 93 | None 94 | } 95 | 96 | /// Performs Binary Search to find a record in the MemTable. 97 | /// 98 | /// If the record is found `[Result::Ok]` is returned, with the index of record. If the record is not 99 | /// found then `[Result::Err]` is returned, with the index to insert the record at. 100 | fn get_index(&self, key: &[u8]) -> Result { 101 | self 102 | .entries 103 | .binary_search_by_key(&key, |e| e.key.as_slice()) 104 | } 105 | 106 | /// Gets the number of records in the MemTable. 107 | pub fn len(&self) -> usize { 108 | self.entries.len() 109 | } 110 | 111 | /// Gets all of the records from the MemTable. 112 | pub fn entries(&self) -> &[MemTableEntry] { 113 | &self.entries 114 | } 115 | 116 | /// Gets the total size of the records in the MemTable 117 | pub fn size(&self) -> usize { 118 | self.size 119 | } 120 | } 121 | 122 | #[cfg(test)] 123 | mod tests { 124 | use crate::mem_table::MemTable; 125 | 126 | #[test] 127 | fn test_mem_table_put_start() { 128 | let mut table = MemTable::new(); 129 | table.set(b"Lime", b"Lime Smoothie", 0); // 17 + 16 + 1 130 | table.set(b"Orange", b"Orange Smoothie", 10); // 21 + 16 + 1 131 | 132 | table.set(b"Apple", b"Apple Smoothie", 20); // 19 + 16 + 1 133 | 134 | assert_eq!(table.entries[0].key, b"Apple"); 135 | assert_eq!(table.entries[0].value.as_ref().unwrap(), b"Apple Smoothie"); 136 | assert_eq!(table.entries[0].timestamp, 20); 137 | assert_eq!(table.entries[0].deleted, false); 138 | assert_eq!(table.entries[1].key, b"Lime"); 139 | assert_eq!(table.entries[1].value.as_ref().unwrap(), b"Lime Smoothie"); 140 | assert_eq!(table.entries[1].timestamp, 0); 141 | assert_eq!(table.entries[1].deleted, false); 142 | assert_eq!(table.entries[2].key, b"Orange"); 143 | assert_eq!(table.entries[2].value.as_ref().unwrap(), b"Orange Smoothie"); 144 | assert_eq!(table.entries[2].timestamp, 10); 145 | assert_eq!(table.entries[2].deleted, false); 146 | 147 | assert_eq!(table.size, 108); 148 | } 149 | 150 | #[test] 151 | fn test_mem_table_put_middle() { 152 | let mut table = MemTable::new(); 153 | table.set(b"Apple", b"Apple Smoothie", 0); 154 | table.set(b"Orange", b"Orange Smoothie", 10); 155 | 156 | table.set(b"Lime", b"Lime Smoothie", 20); 157 | 158 | assert_eq!(table.entries[0].key, b"Apple"); 159 | assert_eq!(table.entries[0].value.as_ref().unwrap(), b"Apple Smoothie"); 160 | assert_eq!(table.entries[0].timestamp, 0); 161 | assert_eq!(table.entries[0].deleted, false); 162 | assert_eq!(table.entries[1].key, b"Lime"); 163 | assert_eq!(table.entries[1].value.as_ref().unwrap(), b"Lime Smoothie"); 164 | assert_eq!(table.entries[1].timestamp, 20); 165 | assert_eq!(table.entries[1].deleted, false); 166 | assert_eq!(table.entries[2].key, b"Orange"); 167 | assert_eq!(table.entries[2].value.as_ref().unwrap(), b"Orange Smoothie"); 168 | assert_eq!(table.entries[2].timestamp, 10); 169 | assert_eq!(table.entries[2].deleted, false); 170 | 171 | assert_eq!(table.size, 108); 172 | } 173 | 174 | #[test] 175 | fn test_mem_table_put_end() { 176 | let mut table = MemTable::new(); 177 | table.set(b"Apple", b"Apple Smoothie", 0); 178 | table.set(b"Lime", b"Lime Smoothie", 10); 179 | 180 | table.set(b"Orange", b"Orange Smoothie", 20); 181 | 182 | assert_eq!(table.entries[0].key, b"Apple"); 183 | assert_eq!(table.entries[0].value.as_ref().unwrap(), b"Apple Smoothie"); 184 | assert_eq!(table.entries[0].timestamp, 0); 185 | assert_eq!(table.entries[0].deleted, false); 186 | assert_eq!(table.entries[1].key, b"Lime"); 187 | assert_eq!(table.entries[1].value.as_ref().unwrap(), b"Lime Smoothie"); 188 | assert_eq!(table.entries[1].timestamp, 10); 189 | assert_eq!(table.entries[1].deleted, false); 190 | assert_eq!(table.entries[2].key, b"Orange"); 191 | assert_eq!(table.entries[2].value.as_ref().unwrap(), b"Orange Smoothie"); 192 | assert_eq!(table.entries[2].timestamp, 20); 193 | assert_eq!(table.entries[2].deleted, false); 194 | 195 | assert_eq!(table.size, 108); 196 | } 197 | 198 | #[test] 199 | fn test_mem_table_put_overwrite() { 200 | let mut table = MemTable::new(); 201 | table.set(b"Apple", b"Apple Smoothie", 0); 202 | table.set(b"Lime", b"Lime Smoothie", 10); 203 | table.set(b"Orange", b"Orange Smoothie", 20); 204 | 205 | table.set(b"Lime", b"A sour fruit", 30); 206 | 207 | assert_eq!(table.entries[0].key, b"Apple"); 208 | assert_eq!(table.entries[0].value.as_ref().unwrap(), b"Apple Smoothie"); 209 | assert_eq!(table.entries[0].timestamp, 0); 210 | assert_eq!(table.entries[0].deleted, false); 211 | assert_eq!(table.entries[1].key, b"Lime"); 212 | assert_eq!(table.entries[1].value.as_ref().unwrap(), b"A sour fruit"); 213 | assert_eq!(table.entries[1].timestamp, 30); 214 | assert_eq!(table.entries[1].deleted, false); 215 | assert_eq!(table.entries[2].key, b"Orange"); 216 | assert_eq!(table.entries[2].value.as_ref().unwrap(), b"Orange Smoothie"); 217 | assert_eq!(table.entries[2].timestamp, 20); 218 | assert_eq!(table.entries[2].deleted, false); 219 | 220 | assert_eq!(table.size, 107); 221 | } 222 | 223 | #[test] 224 | fn test_mem_table_get_exists() { 225 | let mut table = MemTable::new(); 226 | table.set(b"Apple", b"Apple Smoothie", 0); 227 | table.set(b"Lime", b"Lime Smoothie", 10); 228 | table.set(b"Orange", b"Orange Smoothie", 20); 229 | 230 | let entry = table.get(b"Orange").unwrap(); 231 | 232 | assert_eq!(entry.key, b"Orange"); 233 | assert_eq!(entry.value.as_ref().unwrap(), b"Orange Smoothie"); 234 | assert_eq!(entry.timestamp, 20); 235 | } 236 | 237 | #[test] 238 | fn test_mem_table_get_not_exists() { 239 | let mut table = MemTable::new(); 240 | table.set(b"Apple", b"Apple Smoothie", 0); 241 | table.set(b"Lime", b"Lime Smoothie", 0); 242 | table.set(b"Orange", b"Orange Smoothie", 0); 243 | 244 | let res = table.get(b"Potato"); 245 | assert_eq!(res.is_some(), false); 246 | } 247 | 248 | #[test] 249 | fn test_mem_table_delete_exists() { 250 | let mut table = MemTable::new(); 251 | table.set(b"Apple", b"Apple Smoothie", 0); 252 | 253 | table.delete(b"Apple", 10); 254 | 255 | let res = table.get(b"Apple").unwrap(); 256 | assert_eq!(res.key, b"Apple"); 257 | assert_eq!(res.value, None); 258 | assert_eq!(res.timestamp, 10); 259 | assert_eq!(res.deleted, true); 260 | 261 | assert_eq!(table.entries[0].key, b"Apple"); 262 | assert_eq!(table.entries[0].value, None); 263 | assert_eq!(table.entries[0].timestamp, 10); 264 | assert_eq!(table.entries[0].deleted, true); 265 | 266 | assert_eq!(table.size, 22); 267 | } 268 | 269 | #[test] 270 | fn test_mem_table_delete_empty() { 271 | let mut table = MemTable::new(); 272 | 273 | table.delete(b"Apple", 10); 274 | 275 | let res = table.get(b"Apple").unwrap(); 276 | assert_eq!(res.key, b"Apple"); 277 | assert_eq!(res.value, None); 278 | assert_eq!(res.timestamp, 10); 279 | assert_eq!(res.deleted, true); 280 | 281 | assert_eq!(table.entries[0].key, b"Apple"); 282 | assert_eq!(table.entries[0].value, None); 283 | assert_eq!(table.entries[0].timestamp, 10); 284 | assert_eq!(table.entries[0].deleted, true); 285 | 286 | assert_eq!(table.size, 22); 287 | } 288 | } 289 | -------------------------------------------------------------------------------- /src/utils.rs: -------------------------------------------------------------------------------- 1 | use std::fs::read_dir; 2 | use std::path::{Path, PathBuf}; 3 | 4 | /// Gets the set of files with an extension for a given directory. 5 | pub fn files_with_ext(dir: &Path, ext: &str) -> Vec { 6 | let mut files = Vec::new(); 7 | for file in read_dir(dir).unwrap() { 8 | let path = file.unwrap().path(); 9 | if path.extension().unwrap() == ext { 10 | files.push(path); 11 | } 12 | } 13 | 14 | files 15 | } 16 | -------------------------------------------------------------------------------- /src/wal.rs: -------------------------------------------------------------------------------- 1 | use crate::mem_table::MemTable; 2 | use crate::utils::files_with_ext; 3 | use crate::wal_iterator::WALEntry; 4 | use crate::wal_iterator::WALIterator; 5 | use std::fs::{remove_file, File, OpenOptions}; 6 | use std::io::prelude::*; 7 | use std::io::{self, BufWriter}; 8 | use std::path::{Path, PathBuf}; 9 | use std::time::{SystemTime, UNIX_EPOCH}; 10 | 11 | /// Write Ahead Log(WAL) 12 | /// 13 | /// An append-only file that holds the operations performed on the MemTable. 14 | /// The WAL is intended for recovery of the MemTable when the server is shutdown. 15 | pub struct WAL { 16 | path: PathBuf, 17 | file: BufWriter, 18 | } 19 | 20 | impl WAL { 21 | /// Creates a new WAL in a given directory. 22 | pub fn new(dir: &Path) -> io::Result { 23 | let timestamp = SystemTime::now() 24 | .duration_since(UNIX_EPOCH) 25 | .unwrap() 26 | .as_micros(); 27 | 28 | let path = Path::new(dir).join(timestamp.to_string() + ".wal"); 29 | let file = OpenOptions::new().append(true).create(true).open(&path)?; 30 | let file = BufWriter::new(file); 31 | 32 | Ok(WAL { path, file }) 33 | } 34 | 35 | /// Creates a WAL from an existing file path. 36 | pub fn from_path(path: &Path) -> io::Result { 37 | let file = OpenOptions::new().append(true).create(true).open(path)?; 38 | let file = BufWriter::new(file); 39 | 40 | Ok(WAL { 41 | path: path.to_owned(), 42 | file, 43 | }) 44 | } 45 | 46 | /// Loads the WAL(s) within a directory, returning a new WAL and the recovered MemTable. 47 | /// 48 | /// If multiple WALs exist in a directory, they are merged by file date. 49 | pub fn load_from_dir(dir: &Path) -> io::Result<(WAL, MemTable)> { 50 | let mut wal_files = files_with_ext(dir, "wal"); 51 | wal_files.sort(); 52 | 53 | let mut new_mem_table = MemTable::new(); 54 | let mut new_wal = WAL::new(dir)?; 55 | for wal_file in wal_files.iter() { 56 | if let Ok(wal) = WAL::from_path(wal_file) { 57 | for entry in wal.into_iter() { 58 | if entry.deleted { 59 | new_mem_table.delete(entry.key.as_slice(), entry.timestamp); 60 | new_wal.delete(entry.key.as_slice(), entry.timestamp)?; 61 | } else { 62 | new_mem_table.set( 63 | entry.key.as_slice(), 64 | entry.value.as_ref().unwrap().as_slice(), 65 | entry.timestamp, 66 | ); 67 | new_wal.set( 68 | entry.key.as_slice(), 69 | entry.value.unwrap().as_slice(), 70 | entry.timestamp, 71 | )?; 72 | } 73 | } 74 | } 75 | } 76 | new_wal.flush().unwrap(); 77 | wal_files.into_iter().for_each(|f| remove_file(f).unwrap()); 78 | 79 | Ok((new_wal, new_mem_table)) 80 | } 81 | 82 | /// Sets a Key-Value pair and the operation is appended to the WAL. 83 | pub fn set(&mut self, key: &[u8], value: &[u8], timestamp: u128) -> io::Result<()> { 84 | self.file.write_all(&key.len().to_le_bytes())?; 85 | self.file.write_all(&(false as u8).to_le_bytes())?; 86 | self.file.write_all(&value.len().to_le_bytes())?; 87 | self.file.write_all(key)?; 88 | self.file.write_all(value)?; 89 | self.file.write_all(×tamp.to_le_bytes())?; 90 | 91 | Ok(()) 92 | } 93 | 94 | /// Deletes a Key-Value pair and the operation is appended to the WAL. 95 | /// 96 | /// This is achieved using tombstones. 97 | pub fn delete(&mut self, key: &[u8], timestamp: u128) -> io::Result<()> { 98 | self.file.write_all(&key.len().to_le_bytes())?; 99 | self.file.write_all(&(true as u8).to_le_bytes())?; 100 | self.file.write_all(key)?; 101 | self.file.write_all(×tamp.to_le_bytes())?; 102 | 103 | Ok(()) 104 | } 105 | 106 | /// Flushes the WAL to disk. 107 | /// 108 | /// This is useful for applying bulk operations and flushing the final result to 109 | /// disk. Waiting to flush after the bulk operations have been performed will improve 110 | /// write performance substantially. 111 | pub fn flush(&mut self) -> io::Result<()> { 112 | self.file.flush() 113 | } 114 | } 115 | 116 | impl IntoIterator for WAL { 117 | type IntoIter = WALIterator; 118 | type Item = WALEntry; 119 | 120 | /// Converts a WAL into a `WALIterator` to iterate over the entries. 121 | fn into_iter(self) -> WALIterator { 122 | WALIterator::new(self.path).unwrap() 123 | } 124 | } 125 | 126 | #[cfg(test)] 127 | mod tests { 128 | use crate::wal::WAL; 129 | use rand::Rng; 130 | use std::fs::{create_dir, remove_dir_all}; 131 | use std::fs::{metadata, File, OpenOptions}; 132 | use std::io::prelude::*; 133 | use std::io::BufReader; 134 | use std::path::PathBuf; 135 | use std::time::{SystemTime, UNIX_EPOCH}; 136 | 137 | fn check_entry( 138 | reader: &mut BufReader, 139 | key: &[u8], 140 | value: Option<&[u8]>, 141 | timestamp: u128, 142 | deleted: bool, 143 | ) { 144 | let mut len_buffer = [0; 8]; 145 | reader.read_exact(&mut len_buffer).unwrap(); 146 | let file_key_len = usize::from_le_bytes(len_buffer); 147 | assert_eq!(file_key_len, key.len()); 148 | 149 | let mut bool_buffer = [0; 1]; 150 | reader.read_exact(&mut bool_buffer).unwrap(); 151 | let file_deleted = bool_buffer[0] != 0; 152 | assert_eq!(file_deleted, deleted); 153 | 154 | if deleted { 155 | let mut file_key = vec![0; file_key_len]; 156 | reader.read_exact(&mut file_key).unwrap(); 157 | assert_eq!(file_key, key); 158 | } else { 159 | reader.read_exact(&mut len_buffer).unwrap(); 160 | let file_value_len = usize::from_le_bytes(len_buffer); 161 | assert_eq!(file_value_len, value.unwrap().len()); 162 | let mut file_key = vec![0; file_key_len]; 163 | reader.read_exact(&mut file_key).unwrap(); 164 | assert_eq!(file_key, key); 165 | let mut file_value = vec![0; file_value_len]; 166 | reader.read_exact(&mut file_value).unwrap(); 167 | assert_eq!(file_value, value.unwrap()); 168 | } 169 | 170 | let mut timestamp_buffer = [0; 16]; 171 | reader.read_exact(&mut timestamp_buffer).unwrap(); 172 | let file_timestamp = u128::from_le_bytes(timestamp_buffer); 173 | assert_eq!(file_timestamp, timestamp); 174 | } 175 | 176 | #[test] 177 | fn test_write_one() { 178 | let mut rng = rand::thread_rng(); 179 | let dir = PathBuf::from(format!("./{}/", rng.gen::())); 180 | create_dir(&dir).unwrap(); 181 | 182 | let timestamp = SystemTime::now() 183 | .duration_since(UNIX_EPOCH) 184 | .unwrap() 185 | .as_micros(); 186 | 187 | let mut wal = WAL::new(&dir).unwrap(); 188 | wal.set(b"Lime", b"Lime Smoothie", timestamp).unwrap(); 189 | wal.flush().unwrap(); 190 | 191 | let file = OpenOptions::new().read(true).open(&wal.path).unwrap(); 192 | let mut reader = BufReader::new(file); 193 | 194 | check_entry( 195 | &mut reader, 196 | b"Lime", 197 | Some(b"Lime Smoothie"), 198 | timestamp, 199 | false, 200 | ); 201 | 202 | remove_dir_all(&dir).unwrap(); 203 | } 204 | 205 | #[test] 206 | fn test_write_many() { 207 | let mut rng = rand::thread_rng(); 208 | let dir = PathBuf::from(format!("./{}/", rng.gen::())); 209 | create_dir(&dir).unwrap(); 210 | 211 | let timestamp = SystemTime::now() 212 | .duration_since(UNIX_EPOCH) 213 | .unwrap() 214 | .as_micros(); 215 | 216 | let entries: Vec<(&[u8], Option<&[u8]>)> = vec![ 217 | (b"Apple", Some(b"Apple Smoothie")), 218 | (b"Lime", Some(b"Lime Smoothie")), 219 | (b"Orange", Some(b"Orange Smoothie")), 220 | ]; 221 | 222 | let mut wal = WAL::new(&dir).unwrap(); 223 | 224 | for e in entries.iter() { 225 | wal.set(e.0, e.1.unwrap(), timestamp).unwrap(); 226 | } 227 | wal.flush().unwrap(); 228 | 229 | let file = OpenOptions::new().read(true).open(&wal.path).unwrap(); 230 | let mut reader = BufReader::new(file); 231 | 232 | for e in entries.iter() { 233 | check_entry(&mut reader, e.0, e.1, timestamp, false); 234 | } 235 | 236 | remove_dir_all(&dir).unwrap(); 237 | } 238 | 239 | #[test] 240 | fn test_write_delete() { 241 | let mut rng = rand::thread_rng(); 242 | let dir = PathBuf::from(format!("./{}/", rng.gen::())); 243 | create_dir(&dir).unwrap(); 244 | 245 | let timestamp = SystemTime::now() 246 | .duration_since(UNIX_EPOCH) 247 | .unwrap() 248 | .as_micros(); 249 | 250 | let entries: Vec<(&[u8], Option<&[u8]>)> = vec![ 251 | (b"Apple", Some(b"Apple Smoothie")), 252 | (b"Lime", Some(b"Lime Smoothie")), 253 | (b"Orange", Some(b"Orange Smoothie")), 254 | ]; 255 | 256 | let mut wal = WAL::new(&dir).unwrap(); 257 | 258 | for e in entries.iter() { 259 | wal.set(e.0, e.1.unwrap(), timestamp).unwrap(); 260 | } 261 | for e in entries.iter() { 262 | wal.delete(e.0, timestamp).unwrap(); 263 | } 264 | 265 | wal.flush().unwrap(); 266 | 267 | let file = OpenOptions::new().read(true).open(&wal.path).unwrap(); 268 | let mut reader = BufReader::new(file); 269 | 270 | for e in entries.iter() { 271 | check_entry(&mut reader, e.0, e.1, timestamp, false); 272 | } 273 | for e in entries.iter() { 274 | check_entry(&mut reader, e.0, None, timestamp, true); 275 | } 276 | 277 | remove_dir_all(&dir).unwrap(); 278 | } 279 | 280 | #[test] 281 | fn test_read_wal_none() { 282 | let mut rng = rand::thread_rng(); 283 | let dir = PathBuf::from(format!("./{}/", rng.gen::())); 284 | create_dir(&dir).unwrap(); 285 | 286 | let (new_wal, new_mem_table) = WAL::load_from_dir(&dir).unwrap(); 287 | assert_eq!(new_mem_table.len(), 0); 288 | 289 | let m = metadata(new_wal.path).unwrap(); 290 | assert_eq!(m.len(), 0); 291 | 292 | remove_dir_all(&dir).unwrap(); 293 | } 294 | 295 | #[test] 296 | fn test_read_wal_one() { 297 | let mut rng = rand::thread_rng(); 298 | let dir = PathBuf::from(format!("./{}/", rng.gen::())); 299 | create_dir(&dir).unwrap(); 300 | 301 | let entries: Vec<(&[u8], Option<&[u8]>)> = vec![ 302 | (b"Apple", Some(b"Apple Smoothie")), 303 | (b"Lime", Some(b"Lime Smoothie")), 304 | (b"Orange", Some(b"Orange Smoothie")), 305 | ]; 306 | 307 | let mut wal = WAL::new(&dir).unwrap(); 308 | 309 | for (i, e) in entries.iter().enumerate() { 310 | wal.set(e.0, e.1.unwrap(), i as u128).unwrap(); 311 | } 312 | wal.flush().unwrap(); 313 | 314 | let (new_wal, new_mem_table) = WAL::load_from_dir(&dir).unwrap(); 315 | 316 | let file = OpenOptions::new().read(true).open(&new_wal.path).unwrap(); 317 | let mut reader = BufReader::new(file); 318 | 319 | for (i, e) in entries.iter().enumerate() { 320 | check_entry(&mut reader, e.0, e.1, i as u128, false); 321 | 322 | let mem_e = new_mem_table.get(e.0).unwrap(); 323 | assert_eq!(mem_e.key, e.0); 324 | assert_eq!(mem_e.value.as_ref().unwrap().as_slice(), e.1.unwrap()); 325 | assert_eq!(mem_e.timestamp, i as u128); 326 | } 327 | 328 | remove_dir_all(&dir).unwrap(); 329 | } 330 | 331 | #[test] 332 | fn test_read_wal_multiple() { 333 | let mut rng = rand::thread_rng(); 334 | let dir = PathBuf::from(format!("./{}/", rng.gen::())); 335 | create_dir(&dir).unwrap(); 336 | 337 | let entries_1: Vec<(&[u8], Option<&[u8]>)> = vec![ 338 | (b"Apple", Some(b"Apple Smoothie")), 339 | (b"Lime", Some(b"Lime Smoothie")), 340 | (b"Orange", Some(b"Orange Smoothie")), 341 | ]; 342 | let mut wal_1 = WAL::new(&dir).unwrap(); 343 | for (i, e) in entries_1.iter().enumerate() { 344 | wal_1.set(e.0, e.1.unwrap(), i as u128).unwrap(); 345 | } 346 | wal_1.flush().unwrap(); 347 | 348 | let entries_2: Vec<(&[u8], Option<&[u8]>)> = vec![ 349 | (b"Strawberry", Some(b"Strawberry Smoothie")), 350 | (b"Blueberry", Some(b"Blueberry Smoothie")), 351 | (b"Orange", Some(b"Orange Milkshake")), 352 | ]; 353 | let mut wal_2 = WAL::new(&dir).unwrap(); 354 | for (i, e) in entries_2.iter().enumerate() { 355 | wal_2.set(e.0, e.1.unwrap(), (i + 3) as u128).unwrap(); 356 | } 357 | wal_2.flush().unwrap(); 358 | 359 | let (new_wal, new_mem_table) = WAL::load_from_dir(&dir).unwrap(); 360 | 361 | let file = OpenOptions::new().read(true).open(&new_wal.path).unwrap(); 362 | let mut reader = BufReader::new(file); 363 | 364 | for (i, e) in entries_1.iter().enumerate() { 365 | check_entry(&mut reader, e.0, e.1, i as u128, false); 366 | 367 | let mem_e = new_mem_table.get(e.0).unwrap(); 368 | if i != 2 { 369 | assert_eq!(mem_e.key, e.0); 370 | assert_eq!(mem_e.value.as_ref().unwrap().as_slice(), e.1.unwrap()); 371 | assert_eq!(mem_e.timestamp, i as u128); 372 | } else { 373 | assert_eq!(mem_e.key, e.0); 374 | assert_ne!(mem_e.value.as_ref().unwrap().as_slice(), e.1.unwrap()); 375 | assert_ne!(mem_e.timestamp, i as u128); 376 | } 377 | } 378 | for (i, e) in entries_2.iter().enumerate() { 379 | check_entry(&mut reader, e.0, e.1, (i + 3) as u128, false); 380 | 381 | let mem_e = new_mem_table.get(e.0).unwrap(); 382 | assert_eq!(mem_e.key, e.0); 383 | assert_eq!(mem_e.value.as_ref().unwrap().as_slice(), e.1.unwrap()); 384 | assert_eq!(mem_e.timestamp, (i + 3) as u128); 385 | } 386 | 387 | remove_dir_all(&dir).unwrap(); 388 | } 389 | } 390 | -------------------------------------------------------------------------------- /src/wal_iterator.rs: -------------------------------------------------------------------------------- 1 | use std::fs::{File, OpenOptions}; 2 | use std::io::prelude::*; 3 | use std::io::{self, BufReader}; 4 | use std::path::PathBuf; 5 | 6 | pub struct WALEntry { 7 | pub key: Vec, 8 | pub value: Option>, 9 | pub timestamp: u128, 10 | pub deleted: bool, 11 | } 12 | 13 | /// WAL iterator to iterate over the items in a WAL file. 14 | pub struct WALIterator { 15 | reader: BufReader, 16 | } 17 | 18 | impl WALIterator { 19 | /// Creates a new WALIterator from a path to a WAL file. 20 | pub fn new(path: PathBuf) -> io::Result { 21 | let file = OpenOptions::new().read(true).open(path)?; 22 | let reader = BufReader::new(file); 23 | Ok(WALIterator { reader }) 24 | } 25 | } 26 | 27 | impl Iterator for WALIterator { 28 | type Item = WALEntry; 29 | 30 | /// Gets the next entry in the WAL file. 31 | fn next(&mut self) -> Option { 32 | let mut len_buffer = [0; 8]; 33 | if self.reader.read_exact(&mut len_buffer).is_err() { 34 | return None; 35 | } 36 | let key_len = usize::from_le_bytes(len_buffer); 37 | 38 | let mut bool_buffer = [0; 1]; 39 | if self.reader.read_exact(&mut bool_buffer).is_err() { 40 | return None; 41 | } 42 | let deleted = bool_buffer[0] != 0; 43 | 44 | let mut key = vec![0; key_len]; 45 | let mut value = None; 46 | if deleted { 47 | if self.reader.read_exact(&mut key).is_err() { 48 | return None; 49 | } 50 | } else { 51 | if self.reader.read_exact(&mut len_buffer).is_err() { 52 | return None; 53 | } 54 | let value_len = usize::from_le_bytes(len_buffer); 55 | if self.reader.read_exact(&mut key).is_err() { 56 | return None; 57 | } 58 | let mut value_buf = vec![0; value_len]; 59 | if self.reader.read_exact(&mut value_buf).is_err() { 60 | return None; 61 | } 62 | value = Some(value_buf); 63 | } 64 | 65 | let mut timestamp_buffer = [0; 16]; 66 | if self.reader.read_exact(&mut timestamp_buffer).is_err() { 67 | return None; 68 | } 69 | let timestamp = u128::from_le_bytes(timestamp_buffer); 70 | 71 | Some(WALEntry { 72 | key, 73 | value, 74 | timestamp, 75 | deleted, 76 | }) 77 | } 78 | } 79 | --------------------------------------------------------------------------------