├── .github ├── FUNDING.yml └── workflows │ ├── rust.yml │ ├── audit.yml │ ├── docs.yml │ ├── benches.yml │ └── coverage.yml ├── .cargo └── config ├── .gitignore ├── rustfmt.toml ├── xtask ├── Cargo.toml └── src │ └── main.rs ├── src ├── util.rs ├── config.rs ├── basinmap.rs ├── checkpointer.rs ├── lib.rs ├── commit_log.rs ├── transaction.rs ├── allocations.rs ├── wal.rs ├── store.rs ├── atlas.rs ├── tests.rs └── format.rs ├── deny.toml ├── Cargo.toml ├── benchmarks ├── Cargo.toml └── benches │ └── inserts.rs └── README.md /.github/FUNDING.yml: -------------------------------------------------------------------------------- 1 | github: [ecton] -------------------------------------------------------------------------------- /.cargo/config: -------------------------------------------------------------------------------- 1 | [alias] 2 | xtask = "run --package xtask --" -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | /target 2 | /Cargo.lock 3 | .test* 4 | perf.data* -------------------------------------------------------------------------------- /rustfmt.toml: -------------------------------------------------------------------------------- 1 | unstable_features = true 2 | use_field_init_shorthand = true 3 | imports_granularity = "Module" 4 | group_imports = "StdExternalCrate" 5 | format_code_in_doc_comments = true 6 | reorder_impl_items = true 7 | -------------------------------------------------------------------------------- /xtask/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "xtask" 3 | version = "0.0.0" 4 | edition = "2021" 5 | publish = false 6 | 7 | [dependencies] 8 | khonsu-tools = { git = "https://github.com/khonsulabs/khonsu-tools.git", branch = "main" } 9 | -------------------------------------------------------------------------------- /src/util.rs: -------------------------------------------------------------------------------- 1 | use crate::{Error, Result}; 2 | 3 | pub fn usize_to_u32(value: usize) -> Result { 4 | u32::try_from(value).map_err(Error::from) 5 | } 6 | 7 | pub fn u32_to_usize(value: u32) -> Result { 8 | usize::try_from(value).map_err(Error::from) 9 | } 10 | -------------------------------------------------------------------------------- /.github/workflows/rust.yml: -------------------------------------------------------------------------------- 1 | name: Tests 2 | 3 | on: [push] 4 | 5 | jobs: 6 | test: 7 | runs-on: ubuntu-latest 8 | timeout-minutes: 30 9 | steps: 10 | - uses: actions/checkout@v2 11 | 12 | - name: Install Rust 13 | uses: hecrj/setup-rust-action@v1 14 | 15 | - name: Run clippy 16 | run: | 17 | cargo clippy 18 | 19 | - name: Run unit tests 20 | run: | 21 | cargo test --all-features --all-targets 22 | -------------------------------------------------------------------------------- /xtask/src/main.rs: -------------------------------------------------------------------------------- 1 | use khonsu_tools::universal::clap::Parser; 2 | use khonsu_tools::universal::{anyhow, DefaultConfig}; 3 | use khonsu_tools::Commands; 4 | 5 | fn main() -> anyhow::Result<()> { 6 | Commands::parse().execute::() 7 | } 8 | 9 | enum Config {} 10 | 11 | impl khonsu_tools::Config for Config { 12 | type Publish = Self; 13 | type Universal = DefaultConfig; 14 | } 15 | 16 | impl khonsu_tools::publish::Config for Config { 17 | fn paths() -> Vec { 18 | vec![String::from(".")] 19 | } 20 
| }

--------------------------------------------------------------------------------
/deny.toml:
--------------------------------------------------------------------------------
targets = []

[advisories]
db-path = "~/.cargo/advisory-db"
db-urls = ["https://github.com/rustsec/advisory-db"]
vulnerability = "deny"
unmaintained = "warn"
yanked = "warn"
notice = "warn"
ignore = []

[licenses]
unlicensed = "deny"
allow = []
deny = []
copyleft = "deny"
allow-osi-fsf-free = "either"
default = "deny"
confidence-threshold = 0.8
exceptions = []

[licenses.private]
ignore = true

[bans]
multiple-versions = "warn"
wildcards = "allow"
highlight = "all"

--------------------------------------------------------------------------------
/.github/workflows/audit.yml:
--------------------------------------------------------------------------------
name: Audit

on: [push, pull_request]

jobs:
  audit:
    runs-on: ubuntu-latest

    steps:
      - name: Install Rust
        uses: hecrj/setup-rust-action@v1
      - name: Cache
        uses: actions/cache@v2
        with:
          path: |
            ~/.cargo/.crates.toml
            ~/.cargo/.crates2.json
            ~/.cargo/bin/cargo-deny
          key: cargo-deny

      - name: Install cargo-deny
        run: cargo -v install cargo-deny

      - name: Checkout
        uses: actions/checkout@v2

      - name: Audit
        run: |
          cargo xtask audit

--------------------------------------------------------------------------------
/.github/workflows/docs.yml:
--------------------------------------------------------------------------------
name: Docs

on: [push]

jobs:
  docs:
    runs-on: ubuntu-latest
    if: github.ref == 'refs/heads/main'
    steps:
      - name: Install Rust
        uses: hecrj/setup-rust-action@v1

      - uses: actions/checkout@v2
      - name: Generate Docs
        run: |
          cargo doc --no-deps --all-features

      - name: Deploy Docs
        uses: JamesIves/github-pages-deploy-action@releases/v4
        with:
          branch: gh-pages
          folder: target/doc/
          git-config-name: kl-botsu
          git-config-email: botsu@khonsulabs.com
          target-folder: /main/
          clean: true

--------------------------------------------------------------------------------
/Cargo.toml:
--------------------------------------------------------------------------------
[package]
name = "sediment"
version = "0.1.0"
license = "MIT OR Apache-2.0"
edition = "2021"
description = "An ACID-compliant blob storage layer."
repository = "https://github.com/khonsulabs/sediment"
keywords = ["blob-storage"]
categories = ["database"]
readme = "./README.md"

[dependencies]
okaywal = "0.2.0"
crc32c = "0.6.3"
tinyvec = "1.6.0"
thiserror = "1.0.38"
flume = "0.10.14"
watchable = "1.1.0"

# Comment out before checking into CI.
21 | [patch.crates-io] 22 | # okaywal = { path = "../okaywal2" } 23 | okaywal = { git = "https://github.com/khonsulabs/okaywal", branch = "main" } 24 | # watchable = { path = "../watchable" } 25 | 26 | [workspace] 27 | members = ["xtask", "benchmarks"] 28 | -------------------------------------------------------------------------------- /benchmarks/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "benchmarks" 3 | version = "0.1.0" 4 | edition = "2021" 5 | 6 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html 7 | 8 | [features] 9 | # default = ["marble", "rocksdb", "sqlite"] 10 | # default = ["fbarrier-fsync"] 11 | # iouring = ["sediment/iouring"] 12 | # fbarrier-fsync = ["sediment/fbarrier-fsync"] 13 | sqlite = ["rusqlite"] 14 | 15 | [dependencies] 16 | timings = { git = "https://github.com/khonsulabs/timings", branch = "main" } 17 | # timings = { path = "../../timings" } 18 | sediment = { path = "../" } 19 | marble = { version = "15.0.2", optional = true } 20 | rusqlite = { version = "0.28.0", optional = true } 21 | # rocksdb = { version = "0.19.0", optional = true } 22 | rand = "0.8" 23 | 24 | [[bench]] 25 | name = "inserts" 26 | harness = false 27 | -------------------------------------------------------------------------------- /.github/workflows/benches.yml: -------------------------------------------------------------------------------- 1 | name: Benchmarks 2 | 3 | on: [push] 4 | 5 | jobs: 6 | benchmark: 7 | services: 8 | postgres: 9 | image: postgres 10 | env: 11 | POSTGRES_DB: bench 12 | POSTGRES_USER: bencher 13 | POSTGRES_PASSWORD: password 14 | options: >- 15 | --health-cmd pg_isready 16 | --health-interval 10s 17 | --health-timeout 5s 18 | --health-retries 5 19 | ports: 20 | - 5432:5432 21 | 22 | runs-on: ubuntu-latest 23 | timeout-minutes: 60 24 | steps: 25 | - uses: actions/checkout@v2 26 | 27 | - name: Install Rust 28 | uses: hecrj/setup-rust-action@v1 29 | 30 | - name: Build benchmarks 31 | run: | 32 | cargo bench -p benchmarks --all-features --no-run 33 | 34 | - name: Run benchmarks 35 | run: | 36 | cargo bench -p benchmarks --all-features 37 | -------------------------------------------------------------------------------- /.github/workflows/coverage.yml: -------------------------------------------------------------------------------- 1 | name: Coverage 2 | 3 | on: [push] 4 | 5 | jobs: 6 | coverage: 7 | services: 8 | postgres: 9 | image: postgres 10 | env: 11 | POSTGRES_DB: bench 12 | POSTGRES_USER: bencher 13 | POSTGRES_PASSWORD: password 14 | options: >- 15 | --health-cmd pg_isready 16 | --health-interval 10s 17 | --health-timeout 5s 18 | --health-retries 5 19 | ports: 20 | - 5432:5432 21 | runs-on: ubuntu-latest 22 | timeout-minutes: 30 23 | steps: 24 | - uses: actions/checkout@v2 25 | 26 | - name: Install Rust 27 | uses: hecrj/setup-rust-action@v1 28 | 29 | - name: Run code coverage 30 | run: | 31 | cargo xtask generate-code-coverage-report --install-dependencies 32 | 33 | - name: Deploy Docs 34 | if: github.ref == 'refs/heads/main' 35 | uses: JamesIves/github-pages-deploy-action@releases/v4 36 | with: 37 | branch: gh-pages 38 | folder: coverage/ 39 | git-config-name: kl-botsu 40 | git-config-email: botsu@khonsulabs.com 41 | target-folder: /coverage/ 42 | clean: true 43 | -------------------------------------------------------------------------------- /src/config.rs: -------------------------------------------------------------------------------- 1 | use 
std::path; 2 | 3 | use okaywal::file_manager; 4 | use okaywal::file_manager::fs::StdFileManager; 5 | use okaywal::file_manager::memory::MemoryFileManager; 6 | 7 | use crate::{Database, Result}; 8 | 9 | #[derive(Clone)] 10 | pub struct Config { 11 | pub wal: okaywal::Configuration, 12 | } 13 | 14 | impl Config { 15 | pub fn for_directory(directory: Path) -> Self 16 | where 17 | Path: AsRef, 18 | { 19 | Self { 20 | wal: okaywal::Configuration::default_for(directory), 21 | } 22 | } 23 | } 24 | 25 | impl Config { 26 | pub fn in_memory() -> Self { 27 | Self { 28 | wal: okaywal::Configuration::default_with_manager("/", MemoryFileManager::default()), 29 | } 30 | } 31 | } 32 | impl Config 33 | where 34 | FileManager: file_manager::FileManager, 35 | { 36 | pub fn configure_wal< 37 | Configuration: FnOnce(okaywal::Configuration) -> okaywal::Configuration, 38 | >( 39 | mut self, 40 | configurator: Configuration, 41 | ) -> Self { 42 | self.wal = configurator(self.wal); 43 | self 44 | } 45 | 46 | pub fn recover(self) -> Result> { 47 | Database::recover_config(self) 48 | } 49 | } 50 | 51 | impl From> for Config { 52 | fn from(wal: okaywal::Configuration) -> Self { 53 | Self { wal } 54 | } 55 | } 56 | -------------------------------------------------------------------------------- /src/basinmap.rs: -------------------------------------------------------------------------------- 1 | use std::ops::{Index, IndexMut}; 2 | 3 | use crate::format::BasinId; 4 | 5 | #[derive(Debug)] 6 | pub struct BasinMap { 7 | basins: [Option; 8], 8 | } 9 | 10 | impl BasinMap { 11 | pub const fn new() -> Self { 12 | Self { 13 | basins: [None, None, None, None, None, None, None, None], 14 | } 15 | } 16 | 17 | pub fn get_or_insert_with(&mut self, index: BasinId, default: impl FnOnce() -> T) -> &mut T { 18 | if self[index].is_none() { 19 | self[index] = Some(default()); 20 | } 21 | 22 | self[index].as_mut().expect("always initialized above") 23 | } 24 | 25 | pub fn get_or_default(&mut self, index: BasinId) -> &mut T 26 | where 27 | T: Default, 28 | { 29 | self.get_or_insert_with(index, T::default) 30 | } 31 | } 32 | 33 | impl Index for BasinMap { 34 | type Output = Option; 35 | 36 | fn index(&self, index: BasinId) -> &Self::Output { 37 | &self.basins[usize::from(index.index())] 38 | } 39 | } 40 | 41 | impl IndexMut for BasinMap { 42 | fn index_mut(&mut self, index: BasinId) -> &mut Self::Output { 43 | &mut self.basins[usize::from(index.index())] 44 | } 45 | } 46 | 47 | impl<'a, T> IntoIterator for &'a BasinMap { 48 | type IntoIter = Iter<'a, T>; 49 | type Item = (BasinId, &'a T); 50 | 51 | fn into_iter(self) -> Self::IntoIter { 52 | Iter { 53 | map: self, 54 | id: Some(BasinId::MIN), 55 | } 56 | } 57 | } 58 | 59 | #[derive(Debug)] 60 | pub struct Iter<'a, T> { 61 | map: &'a BasinMap, 62 | id: Option, 63 | } 64 | 65 | impl<'a, T> Iterator for Iter<'a, T> { 66 | type Item = (BasinId, &'a T); 67 | 68 | fn next(&mut self) -> Option { 69 | while let Some(id) = self.id { 70 | let next_id = id.next(); 71 | let basin_contents = &self.map[id]; 72 | self.id = next_id; 73 | // If the basin had something, return it. Otherwise, look at the 74 | // next position. 
75 | if let Some(contents) = basin_contents { 76 | return Some((id, contents)); 77 | } 78 | } 79 | 80 | None 81 | } 82 | } 83 | -------------------------------------------------------------------------------- /src/checkpointer.rs: -------------------------------------------------------------------------------- 1 | use std::sync::{Arc, Weak}; 2 | use std::thread::JoinHandle; 3 | 4 | use okaywal::{file_manager, WriteAheadLog}; 5 | use watchable::{Watchable, Watcher}; 6 | 7 | use crate::format::TransactionId; 8 | use crate::{Data, Database, Error, Result}; 9 | 10 | #[derive(Debug)] 11 | pub struct Checkpointer { 12 | watchable: Watchable, 13 | handle_receiver: flume::Receiver>>, 14 | } 15 | 16 | impl Checkpointer { 17 | pub fn new(current_checkpointed_transaction: TransactionId) -> (Self, Spawner) { 18 | let watchable = Watchable::new(current_checkpointed_transaction); 19 | let watcher = watchable.watch(); 20 | let (handle_sender, handle_receiver) = flume::bounded(1); 21 | 22 | ( 23 | Self { 24 | watchable, 25 | handle_receiver, 26 | }, 27 | Spawner { 28 | watcher, 29 | handle_sender, 30 | }, 31 | ) 32 | } 33 | 34 | pub fn checkpoint_to(&self, tx_id: TransactionId) { 35 | let _ = self.watchable.update(tx_id); 36 | } 37 | 38 | pub fn shutdown(&self) -> Result<()> { 39 | self.watchable.shutdown(); 40 | let join_handle = self 41 | .handle_receiver 42 | .recv() 43 | .expect("handle should always be sent after spawning"); 44 | join_handle.join().map_err(|_| Error::ThreadJoin)? 45 | } 46 | } 47 | 48 | #[derive(Debug)] 49 | pub struct Spawner { 50 | watcher: Watcher, 51 | handle_sender: flume::Sender>>, 52 | } 53 | 54 | impl Spawner { 55 | pub(super) fn spawn( 56 | self, 57 | current_checkpointed_tx: TransactionId, 58 | data: &Arc>, 59 | wal: &WriteAheadLog, 60 | ) -> Result<()> 61 | where 62 | FileManager: file_manager::FileManager, 63 | { 64 | let data = Arc::downgrade(data); 65 | let wal = wal.clone(); 66 | let thread_handle = std::thread::Builder::new() 67 | .name(String::from("sediment-cp")) 68 | .spawn(move || { 69 | sediment_checkpoint_thread(current_checkpointed_tx, self.watcher, data, wal) 70 | }) 71 | .expect("failed to spawn thread"); 72 | self.handle_sender 73 | .send(thread_handle) 74 | .expect("this send should never fail"); 75 | Ok(()) 76 | } 77 | } 78 | 79 | fn sediment_checkpoint_thread( 80 | baseline_transaction: TransactionId, 81 | mut tx_receiver: Watcher, 82 | data: Weak>, 83 | wal: WriteAheadLog, 84 | ) -> Result<()> 85 | where 86 | FileManager: file_manager::FileManager, 87 | { 88 | let mut current_tx_id = baseline_transaction; 89 | while let Ok(transaction_to_checkpoint) = tx_receiver.next_value() { 90 | if transaction_to_checkpoint <= current_tx_id { 91 | continue; 92 | } 93 | 94 | if let Some(data) = data.upgrade() { 95 | let db = Database { 96 | data, 97 | wal: wal.clone(), 98 | }; 99 | 100 | // Find all commit log entries that are <= 101 | // transaction_to_checkpoint. 102 | let mut current_commit_log = db.commit_log_head()?; 103 | let mut archived_grains = Vec::new(); 104 | let mut commit_logs_to_archive = Vec::new(); 105 | while let Some(entry) = current_commit_log { 106 | if entry.transaction_id > current_tx_id 107 | && entry.transaction_id <= transaction_to_checkpoint 108 | { 109 | archived_grains.extend(entry.archived_grains.iter().copied()); 110 | commit_logs_to_archive.push(entry.grain_id); 111 | } else if entry.transaction_id <= current_tx_id { 112 | // We can't go any further back. 
113 | break; 114 | } 115 | 116 | current_commit_log = entry.next_entry(&db)?; 117 | } 118 | 119 | let mut tx = db.begin_transaction()?; 120 | for commit_log_id in commit_logs_to_archive { 121 | tx.archive(commit_log_id)?; 122 | } 123 | tx.free_grains(&archived_grains)?; 124 | tx.checkpointed_to(transaction_to_checkpoint)?; 125 | tx.commit()?; 126 | 127 | current_tx_id = transaction_to_checkpoint; 128 | } 129 | } 130 | 131 | Ok(()) 132 | } 133 | -------------------------------------------------------------------------------- /src/lib.rs: -------------------------------------------------------------------------------- 1 | #![forbid(unsafe_code)] 2 | 3 | use std::io::{self}; 4 | use std::num::TryFromIntError; 5 | use std::path::Path; 6 | use std::sync::{Arc, PoisonError}; 7 | 8 | use okaywal::file_manager::fs::StdFileManager; 9 | use okaywal::file_manager::memory::MemoryFileManager; 10 | use okaywal::file_manager::FSyncError; 11 | use okaywal::{file_manager, WriteAheadLog}; 12 | pub use transaction::Transaction; 13 | 14 | use crate::atlas::{Atlas, GrainReader}; 15 | use crate::checkpointer::Checkpointer; 16 | use crate::commit_log::{CommitLogEntry, CommitLogs}; 17 | use crate::config::Config; 18 | use crate::format::{GrainId, Stored, TransactionId}; 19 | use crate::store::Store; 20 | use crate::transaction::TransactionLock; 21 | 22 | mod allocations; 23 | mod atlas; 24 | mod basinmap; 25 | mod checkpointer; 26 | mod commit_log; 27 | pub mod config; 28 | pub mod format; 29 | mod store; 30 | #[cfg(test)] 31 | mod tests; 32 | mod transaction; 33 | mod util; 34 | mod wal; 35 | 36 | #[derive(Debug, Clone)] 37 | pub struct Database 38 | where 39 | FileManager: file_manager::FileManager, 40 | { 41 | data: Arc>, 42 | wal: WriteAheadLog, 43 | } 44 | 45 | impl Database { 46 | pub fn recover>(directory: AsRefPath) -> Result { 47 | Config::for_directory(directory).recover() 48 | } 49 | } 50 | 51 | impl Database { 52 | pub fn in_memory() -> Self { 53 | Config::in_memory() 54 | .recover() 55 | .expect("somehow failed to recover on default memory file manager") 56 | } 57 | } 58 | 59 | impl Database 60 | where 61 | FileManager: file_manager::FileManager, 62 | { 63 | fn recover_config(config: Config) -> Result { 64 | // Opening the store restores the database to the last fully committed 65 | // state. Each commit happens when the write ahead log is checkpointed. 66 | let store = Store::recover( 67 | config.wal.directory.as_ref(), 68 | config.wal.file_manager.clone(), 69 | )?; 70 | let atlas = Atlas::new(&store); 71 | let current_metadata = atlas.current_index_metadata()?; 72 | let (checkpointer, cp_spawner) = Checkpointer::new(current_metadata.checkpointed_to); 73 | let data = Arc::new(Data { 74 | store, 75 | atlas, 76 | tx_lock: TransactionLock::new(current_metadata), 77 | checkpointer, 78 | commit_logs: CommitLogs::default(), 79 | }); 80 | 81 | // Recover any transactions from the write ahead log that haven't been 82 | // checkpointed to the store already. 83 | let wal = config.wal.open(wal::WalManager::new(&data))?; 84 | 85 | // The wal recovery process may have recovered sediment checkpoints that 86 | // are in the WAL but not yet in permanent storage. Refresh the metadata. 
87 | let current_metadata = data.atlas.current_index_metadata()?; 88 | cp_spawner.spawn(current_metadata.checkpointed_to, &data, &wal)?; 89 | if current_metadata.checkpoint_target > current_metadata.checkpointed_to { 90 | data.checkpointer 91 | .checkpoint_to(current_metadata.checkpoint_target); 92 | } 93 | 94 | Ok(Self { data, wal }) 95 | } 96 | 97 | pub fn begin_transaction(&self) -> Result> { 98 | let tx_guard = self.data.tx_lock.lock(); 99 | let wal_entry = self.wal.begin_entry()?; 100 | 101 | Transaction::new(self, wal_entry, tx_guard) 102 | } 103 | 104 | pub fn read(&self, grain: GrainId) -> Result>> { 105 | self.data.atlas.find(grain, &self.wal) 106 | } 107 | 108 | pub fn read_commit_log_entry(&self, grain: GrainId) -> Result>> { 109 | self.data.commit_logs.get_or_lookup(grain, self) 110 | } 111 | 112 | pub fn shutdown(self) -> Result<()> { 113 | // Shut the checkpointer down first, since it may try to access the 114 | // write-ahead log. 115 | self.data.checkpointer.shutdown()?; 116 | // Shut down the write-ahead log, which may still end up having its own 117 | // checkpointing process finishing up. This may require the file syncer. 118 | self.wal.shutdown()?; 119 | // With everything else shut down, we can now shut down the file 120 | // manager. 121 | self.data.store.file_manager.shutdown()?; 122 | 123 | Ok(()) 124 | } 125 | 126 | pub fn checkpoint_target(&self) -> Result { 127 | Ok(self.data.atlas.current_index_metadata()?.checkpoint_target) 128 | } 129 | 130 | pub fn checkpointed_to(&self) -> Result { 131 | Ok(self.data.atlas.current_index_metadata()?.checkpointed_to) 132 | } 133 | 134 | pub fn embedded_header(&self) -> Result> { 135 | Ok(self 136 | .data 137 | .atlas 138 | .current_index_metadata()? 139 | .embedded_header_data) 140 | } 141 | 142 | pub fn commit_log_head(&self) -> Result>>> { 143 | if let Some(entry_id) = self.data.atlas.current_index_metadata()?.commit_log_head { 144 | if let Some(entry) = self.read_commit_log_entry(entry_id)? 
{ 145 | return Ok(Some(Stored { 146 | grain_id: entry_id, 147 | stored: entry, 148 | })); 149 | } 150 | } 151 | 152 | Ok(None) 153 | } 154 | } 155 | 156 | impl Eq for Database where FileManager: file_manager::FileManager {} 157 | 158 | impl PartialEq for Database 159 | where 160 | FileManager: file_manager::FileManager, 161 | { 162 | fn eq(&self, other: &Self) -> bool { 163 | Arc::ptr_eq(&self.data, &other.data) 164 | } 165 | } 166 | 167 | #[derive(Debug)] 168 | struct Data 169 | where 170 | FileManager: file_manager::FileManager, 171 | { 172 | store: Store, 173 | checkpointer: Checkpointer, 174 | atlas: Atlas, 175 | commit_logs: CommitLogs, 176 | tx_lock: TransactionLock, 177 | } 178 | 179 | #[derive(thiserror::Error, Debug)] 180 | pub enum Error { 181 | #[error("a GrainId was used that was not allocated")] 182 | GrainNotAllocated, 183 | #[error("a poisoned lock was encountered, the database must be closed and reopened")] 184 | LockPoisoned, 185 | #[error("a thread was not able to be joined")] 186 | ThreadJoin, 187 | #[error("crc32 checksum mismatch")] 188 | ChecksumFailed, 189 | #[error("the value is too large to be stored in Sediment")] 190 | GrainTooLarge, 191 | #[error("an invalid grain id was encountered")] 192 | InvalidGrainId, 193 | #[error("the transaction id is not valid for this database")] 194 | InvalidTransactionId, 195 | #[error("value too large for target")] 196 | ValueOutOfBounds, 197 | #[error("io error: {0}")] 198 | Io(#[from] io::Error), 199 | #[error("the service has shut down")] 200 | Shutdown, 201 | #[error("database verification failed: {0}")] 202 | VerificationFailed(String), 203 | } 204 | 205 | impl Error { 206 | fn verification_failed(reason: impl Into) -> Self { 207 | Self::VerificationFailed(reason.into()) 208 | } 209 | } 210 | 211 | pub type Result = std::result::Result; 212 | 213 | impl From for io::Error { 214 | fn from(err: Error) -> Self { 215 | match err { 216 | Error::Io(err) => err, 217 | other => io::Error::new(io::ErrorKind::Other, other), 218 | } 219 | } 220 | } 221 | 222 | impl From> for Error { 223 | fn from(_: PoisonError) -> Self { 224 | Self::LockPoisoned 225 | } 226 | } 227 | 228 | impl From for Error { 229 | fn from(_: TryFromIntError) -> Self { 230 | Self::ValueOutOfBounds 231 | } 232 | } 233 | 234 | impl From for Error { 235 | fn from(error: FSyncError) -> Self { 236 | match error { 237 | FSyncError::Shutdown => Self::Shutdown, 238 | FSyncError::ThreadJoin => Self::ThreadJoin, 239 | FSyncError::InternalInconstency => Self::LockPoisoned, 240 | FSyncError::Io(io) => Self::Io(io), 241 | } 242 | } 243 | } 244 | -------------------------------------------------------------------------------- /src/commit_log.rs: -------------------------------------------------------------------------------- 1 | use std::collections::{hash_map, HashMap}; 2 | use std::io::{Read, Write}; 3 | use std::sync::{Arc, Condvar, Mutex}; 4 | 5 | use okaywal::file_manager; 6 | 7 | use crate::format::{ByteUtil, GrainId, Stored, TransactionId}; 8 | use crate::util::{u32_to_usize, usize_to_u32}; 9 | use crate::{Database, Error, Result}; 10 | 11 | #[derive(Debug)] 12 | pub struct CommitLogEntry { 13 | pub transaction_id: TransactionId, 14 | pub next_entry: Option, 15 | pub new_grains: Vec, 16 | pub archived_grains: Vec, 17 | pub freed_grains: Vec, 18 | 19 | pub embedded_header_data: Option, 20 | pub checkpoint_target: TransactionId, 21 | pub checkpointed_to: TransactionId, 22 | } 23 | 24 | impl CommitLogEntry { 25 | pub fn new( 26 | transaction_id: TransactionId, 27 | next_entry: 
Option, 28 | embedded_header_data: Option, 29 | checkpoint_target: TransactionId, 30 | checkpointed_to: TransactionId, 31 | ) -> Self { 32 | Self { 33 | transaction_id, 34 | next_entry, 35 | new_grains: Vec::new(), 36 | archived_grains: Vec::new(), 37 | freed_grains: Vec::new(), 38 | embedded_header_data, 39 | checkpoint_target, 40 | checkpointed_to, 41 | } 42 | } 43 | 44 | pub fn serialize_to(&self, bytes: &mut Vec) -> Result<()> { 45 | let total_size = 8 // transaction_id 46 | + 8 // next_entry 47 | + 8 // embedded_header_data 48 | + 8 // checkpoint_target 49 | + 8 // checkpointed_to 50 | + 4 * 3 // u32 counts of all three grain types 51 | + (self.new_grains.len() * NewGrain::BYTES) 52 | + (self.archived_grains.len() + self.freed_grains.len()) * 8; 53 | bytes.clear(); 54 | bytes.reserve(total_size); 55 | 56 | bytes.write_all(&self.transaction_id.to_be_bytes())?; 57 | bytes.write_all(&self.next_entry.to_be_bytes())?; 58 | 59 | bytes.write_all(&self.embedded_header_data.to_be_bytes())?; 60 | bytes.write_all(&self.checkpoint_target.to_be_bytes())?; 61 | bytes.write_all(&self.checkpointed_to.to_be_bytes())?; 62 | 63 | bytes.write_all(&usize_to_u32(self.new_grains.len())?.to_be_bytes())?; 64 | bytes.write_all(&usize_to_u32(self.archived_grains.len())?.to_be_bytes())?; 65 | bytes.write_all(&usize_to_u32(self.freed_grains.len())?.to_be_bytes())?; 66 | 67 | for grain in &self.new_grains { 68 | bytes.write_all(&grain.id.to_bytes())?; 69 | bytes.write_all(&grain.crc32.to_be_bytes())?; 70 | } 71 | 72 | for grain in &self.archived_grains { 73 | bytes.write_all(&grain.to_bytes())?; 74 | } 75 | 76 | for grain in &self.freed_grains { 77 | bytes.write_all(&grain.to_bytes())?; 78 | } 79 | 80 | Ok(()) 81 | } 82 | 83 | pub fn read_from(mut reader: R) -> Result { 84 | let mut eight_bytes = [0; 8]; 85 | reader.read_exact(&mut eight_bytes)?; 86 | let transaction_id = TransactionId::from_be_bytes(eight_bytes); 87 | reader.read_exact(&mut eight_bytes)?; 88 | let next_entry = GrainId::from_bytes(&eight_bytes); 89 | reader.read_exact(&mut eight_bytes)?; 90 | let embedded_header_data = GrainId::from_bytes(&eight_bytes); 91 | reader.read_exact(&mut eight_bytes)?; 92 | let checkpoint_target = TransactionId::from_be_bytes(eight_bytes); 93 | reader.read_exact(&mut eight_bytes)?; 94 | let checkpointed_to = TransactionId::from_be_bytes(eight_bytes); 95 | 96 | let mut four_bytes = [0; 4]; 97 | reader.read_exact(&mut four_bytes)?; 98 | let new_grain_count = u32::from_be_bytes(four_bytes); 99 | reader.read_exact(&mut four_bytes)?; 100 | let archived_grain_count = u32::from_be_bytes(four_bytes); 101 | reader.read_exact(&mut four_bytes)?; 102 | let freed_grain_count = u32::from_be_bytes(four_bytes); 103 | 104 | let mut new_grains = Vec::with_capacity(u32_to_usize(new_grain_count)?); 105 | for _ in 0..new_grain_count { 106 | reader.read_exact(&mut eight_bytes)?; 107 | let id = GrainId::from_bytes(&eight_bytes).ok_or(Error::InvalidGrainId)?; 108 | reader.read_exact(&mut four_bytes)?; 109 | let crc32 = u32::from_be_bytes(four_bytes); 110 | new_grains.push(NewGrain { id, crc32 }); 111 | } 112 | 113 | let mut archived_grains = Vec::with_capacity(u32_to_usize(archived_grain_count)?); 114 | for _ in 0..archived_grain_count { 115 | reader.read_exact(&mut eight_bytes)?; 116 | let id = GrainId::from_bytes(&eight_bytes).ok_or(Error::InvalidGrainId)?; 117 | archived_grains.push(id); 118 | } 119 | 120 | let mut freed_grains = Vec::with_capacity(u32_to_usize(freed_grain_count)?); 121 | for _ in 0..freed_grain_count { 122 | 
reader.read_exact(&mut eight_bytes)?; 123 | let id = GrainId::from_bytes(&eight_bytes).ok_or(Error::InvalidGrainId)?; 124 | freed_grains.push(id); 125 | } 126 | 127 | Ok(Self { 128 | transaction_id, 129 | next_entry, 130 | new_grains, 131 | archived_grains, 132 | freed_grains, 133 | embedded_header_data, 134 | checkpoint_target, 135 | checkpointed_to, 136 | }) 137 | } 138 | 139 | pub fn next_entry( 140 | &self, 141 | database: &Database, 142 | ) -> Result>>> 143 | where 144 | FileManager: file_manager::FileManager, 145 | { 146 | if self.transaction_id > database.checkpointed_to()? { 147 | if let Some(entry_id) = self.next_entry { 148 | if let Some(entry) = database.read_commit_log_entry(entry_id)? { 149 | return Ok(Some(Stored { 150 | grain_id: entry_id, 151 | stored: entry, 152 | })); 153 | } 154 | } 155 | } 156 | 157 | Ok(None) 158 | } 159 | } 160 | 161 | #[derive(Debug)] 162 | pub struct NewGrain { 163 | pub id: GrainId, 164 | pub crc32: u32, 165 | } 166 | 167 | impl NewGrain { 168 | const BYTES: usize = 12; 169 | } 170 | 171 | #[derive(Debug, Default)] 172 | pub struct CommitLogs { 173 | // TODO this should be an LRU 174 | cached: Mutex>, 175 | sync: Condvar, 176 | } 177 | 178 | impl CommitLogs { 179 | pub fn cache(&self, grain_id: GrainId, entry: Arc) -> Result<()> { 180 | let mut data = self.cached.lock()?; 181 | data.insert(grain_id, CommitLogCacheEntry::Cached(Some(entry))); 182 | Ok(()) 183 | } 184 | 185 | pub fn get_or_lookup( 186 | &self, 187 | grain_id: GrainId, 188 | db: &Database, 189 | ) -> Result>> 190 | where 191 | FileManager: file_manager::FileManager, 192 | { 193 | let mut data = self.cached.lock()?; 194 | loop { 195 | match data.entry(grain_id) { 196 | hash_map::Entry::Occupied(entry) => match entry.get() { 197 | CommitLogCacheEntry::Cached(cached) => return Ok(cached.clone()), 198 | CommitLogCacheEntry::Caching => { 199 | // Another thread is trying to cache this entry already. 200 | data = self.sync.wait(data)?; 201 | } 202 | }, 203 | hash_map::Entry::Vacant(miss) => { 204 | miss.insert(CommitLogCacheEntry::Caching); 205 | drop(data); 206 | 207 | // We want to be careful to not cause another thread to 208 | // block indefinitely if we receive an error. 209 | let result = match Self::read_entry(grain_id, db) { 210 | Ok(entry) => { 211 | let entry = entry.map(Arc::new); 212 | data = self.cached.lock()?; 213 | data.insert(grain_id, CommitLogCacheEntry::Cached(entry.clone())); 214 | Ok(entry) 215 | } 216 | Err(err) => { 217 | // We had an error reading, clear our entry in the 218 | // cache before returning it. 219 | data = self.cached.lock()?; 220 | data.remove(&grain_id); 221 | Err(err) 222 | } 223 | }; 224 | 225 | drop(data); 226 | 227 | // This is wasteful to wake up all waiting threads, but we 228 | // don't have a good way to notify just a single one. 229 | self.sync.notify_all(); 230 | 231 | return result; 232 | } 233 | } 234 | } 235 | } 236 | 237 | fn read_entry( 238 | grain_id: GrainId, 239 | db: &Database, 240 | ) -> Result> 241 | where 242 | FileManager: file_manager::FileManager, 243 | { 244 | if let Some(reader) = db.read(grain_id)? 
{
            let data = reader.read_all_data()?;
            let entry = CommitLogEntry::read_from(&data[..])?;
            Ok(Some(entry))
        } else {
            Ok(None)
        }
    }
}

#[derive(Debug)]
enum CommitLogCacheEntry {
    Cached(Option<Arc<CommitLogEntry>>),
    Caching,
}

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Sediment

**This crate is nowhere near ready to be used in production projects.
Constructive feedback is welcome!**

![sediment forbids unsafe code](https://img.shields.io/badge/unsafe-forbid-success)
[![crate version](https://img.shields.io/crates/v/sediment.svg)](https://crates.io/crates/sediment)
[![Live Build Status](https://img.shields.io/github/actions/workflow/status/khonsulabs/sediment/rust.yml?branch=main)](https://github.com/khonsulabs/sediment/actions?query=workflow:Tests)
[![HTML Coverage Report for `main` branch](https://khonsulabs.github.io/sediment/coverage/badge.svg)](https://khonsulabs.github.io/sediment/coverage/)
[![Documentation](https://img.shields.io/badge/docs-main-informational)](https://khonsulabs.github.io/sediment/main/sediment)

This storage format is meant to provide a foundation for building ACID-compliant
databases.

- Uses a [write-ahead log][okaywal] for efficient, atomic, durable writes.
- Data chunks can be written and assigned a unique ID.
- Data chunks can be archived by their unique ID.
- The database can be checkpointed to free previously archived data for reuse.
- The lifecycle of data allows building full database replication from the
  commit log contained within the database, while ensuring data isn't
  overwritten before all clients have replicated it.

This database uses a folder to store its files. Additional files can be stored
within the folder without affecting Sediment.

## The Grain Lifecycle

The storage format of Sediment is organized around storing chunks of data in
slots known as *grains*. When data is written to Sediment, it is given a
`GrainId`. The data associated with a `GrainId` is immutable.

A `GrainId` can be used to read previously stored data. Once a grain is no
longer needed, it can be archived. Archiving a grain marks it as eligible to be
freed during a checkpointing operation.

When Sediment's database is checkpointed, all archived grains are freed. After
this operation, the previously allocated regions can be reused.

This design allows Sediment to be viewed as an append-only data format despite
not being implemented as such, and it also enables building a replication log
from the built-in commit log.
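To make the lifecycle concrete, here is a minimal, untested sketch of how it
maps onto the public API in `src/lib.rs` and `src/transaction.rs`. The
in-memory constructor keeps the example self-contained; an on-disk database
would be opened the same way via `Database::recover` with a directory path:

```rust
use sediment::Database;

fn main() -> sediment::Result<()> {
    // An in-memory database; `Database::recover("my-db")` would open (or
    // create) an on-disk database in a hypothetical "my-db" directory instead.
    let db = Database::in_memory();

    // Writing happens inside a transaction and yields a permanent GrainId.
    let mut tx = db.begin_transaction()?;
    let grain = tx.write(b"hello, sediment")?;
    tx.commit()?;

    // A grain's data is immutable and can be read back by its GrainId.
    let data = db
        .read(grain)?
        .expect("grain was just committed")
        .read_all_data()?;
    assert_eq!(&data[..], &b"hello, sediment"[..]);

    // Archiving marks the grain as freeable by a future checkpoint.
    let mut tx = db.begin_transaction()?;
    tx.archive(grain)?;
    tx.commit()?;

    db.shutdown()
}
```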
## Sediment Architecture

Sediment's internal implementation is separated into several main types:

- `Atlas`: The Atlas keeps track of the in-memory representation of the state
  of the Sediment database. The Atlas knows whether a particular grain is in
  the write-ahead log or isn't even allocated at all.
- `Store`: The Store manages updating the on-disk representation of the
  Sediment database.
- `WalManager`: The WalManager persists changes from the write-ahead log to the
  Store, and updates the Atlas when data has moved from the write-ahead log to
  the primary data store.
- `Checkpointer`: The Checkpointer is responsible for freeing archived grains
  and removing old commit log entries. Without the Checkpointer, Sediment would
  be an append-only database from which data could never be removed.

### Disk Format

On-disk, Sediment uses an `index` file and a collection of "Strata" -- sticking
with the geology theme. Each Stratum is organized by its Basin, which
determines the individual grain size within the Stratum.

The `index` contains two copies of the `IndexHeader`. When updating the
`index`, the inactive version should be overwritten. This ensures that if a
crash happens while updating the header, the currently active version remains
untouched.

Each Stratum contains two copies of a `StratumHeader`. Just like the `index`,
when updating a Stratum's header, the inactive version should be overwritten.
After the `StratumHeader`s, the remainder of the file is an array of grains.

A grain allocation can span multiple grains. Each allocation begins with an
8-byte value representing the `TransactionId` during which the grain was
allocated. Next comes a 4-byte encoding of the allocation's exact byte length.
The grain's data follows the length field. Finally, a CRC32 checksum of the
data is written directly after it. This means that each grain allocation has
16 bytes of overhead.

### Opening a database (Recovery)

To open a database from an unknown state, the first step is opening the
`index`. The `transaction_id`s of the two `IndexHeader`s are compared, and
recovery initially attempts to read from the most recent version.

To verify that the `IndexHeader` is valid, we must inspect all data written. If
any checksums fail or inconsistencies are detected, the older `IndexHeader`
should be used instead.

To validate the last commit, the `commit_log_head` is loaded, and the on-disk
updates for each grain operation it describes are validated. To ensure a newly
written grain's data is correct, the grain header's `TransactionId` must match
the `IndexHeader` and the data's CRC must be validated. The expected CRC is not
only written on disk but is also contained in the `CommitLogEntry`.

Once the `Store` has restored its state from disk, the `Atlas` can be
initialized from the disk state. With the `Store` and the `Atlas` created, the
`WriteAheadLog` can be opened.

While recovering the `WriteAheadLog`, the `WalManager` will update the `Atlas`
with changes that were previously written to the WAL but haven't been
checkpointed into primary storage yet.

After the `WriteAheadLog` has finished recovery, the Sediment database is fully
validated.
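Opening is normally done through `Database::recover`, but `src/config.rs` also
exposes the underlying WAL configuration before recovery runs. A minimal,
untested sketch, assuming the default standard-filesystem file manager; the
`open_tuned` name and `my-db` directory are hypothetical:

```rust
use sediment::config::Config;

fn open_tuned() -> sediment::Result<()> {
    // Equivalent to `Database::recover("my-db")`, except that the builder
    // exposes the okaywal::Configuration for adjustment before recovery runs.
    let db = Config::for_directory("my-db")
        .configure_wal(|wal| {
            // Adjust okaywal settings here; this sketch returns it unchanged.
            wal
        })
        .recover()?;
    db.shutdown()
}
```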
### Transaction Flow (Writing Data)

Sediment uses OkayWAL for its `WriteAheadLog` implementation. OkayWAL only
allows a single thread to write an Entry at any given time, but it will
transparently batch multiple threads when their writes can be `fsync`ed
simultaneously. To leverage this, we need to ensure that changes written in
transactions are available to future transactions but are not published to
readers outside of a transaction until after the `WriteAheadLog` has confirmed
the commit.

This state is managed by the `TransactionLock`. It hands out a single
`TransactionGuard` at a time, which allows a thread to begin a new entry in the
`WriteAheadLog`. It also provides access to the transactional version of the
`IndexMetadata`.

Before committing a transaction's Entry to the `WriteAheadLog`,
`TransactionGuard::stage` is called, which updates the transactional
`IndexMetadata` and stages the information about the current commit in a
location that can be published after the `WriteAheadLog` confirms the entry is
written. The thread is handed back a `TransactionFinalizer`, and another thread
is now able to acquire a `TransactionGuard`.

The thread then commits the `WriteAheadLog` entry. Because another thread can
acquire a `TransactionGuard` in the meantime, the `WriteAheadLog` is able to
queue up and batch multiple operations being made by different threads.

When the `WriteAheadLog` entry is committed, the `TransactionFinalizer` can be
finalized, which publishes all staged transactions up to and including the one
that was written by this thread.

Publishing the staged changes involves notifying the `Atlas` of the
`LogPosition`s at which new `GrainId`s are available, or marking `GrainId`s as
freed.

### `WriteAheadLog` Checkpointing

Periodically, the `WriteAheadLog` will invoke `WalManager::checkpoint_to()`,
part of the `LogManager` trait. During this stage, the WAL needs the
`WalManager` to move all data from the entries being checkpointed into the
primary data store.

The `LogManager` iterates over the entries being checkpointed. When a new grain
is written, its data, including the grain's header and CRC, is written at the
appropriate location in the appropriate Stratum. All grain changes are
accumulated during this scanning operation.

Once all entries have been processed, the `StratumHeader`s need to be updated
to reflect the new grain states. Once all updates to a given Stratum have been
processed, a new version of its `StratumHeader` is written to the file.

Once all Strata have been updated, the `IndexHeader` is updated with the latest
checkpointed information. The new `IndexHeader` is then written to the `index`.

Finally, all changed Strata, the `index`, and, if new files were created, the
directory are `fsync`ed.

After all files are fully synchronized to disk, the `Atlas` is notified of the
grains that were checkpointed, allowing it to serve read requests for those
grains from primary storage instead of the WAL.

### Sediment Checkpointing (Removing Data)

Sediment stores a `CommitLogEntry` for each transaction, which describes the
changes performed to each grain. A `CommitLogEntry` contains enough information
to enable replicating a Sediment database. These `CommitLogEntry`s are
important to the recovery process, but they can take up a lot of disk space if
they aren't removed.
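Because `CommitLogEntry`s form a linked list reachable from
`Database::commit_log_head`, a replication consumer can walk them the same way
the `Checkpointer` does in `src/checkpointer.rs`. A minimal, untested sketch;
the `dump_commit_log` name is hypothetical:

```rust
use okaywal::file_manager;
use sediment::Database;

// Walks the commit log from the newest entry backwards, summarizing each
// transaction's changes. `commit_log_head` returns a `Stored` wrapper that
// pairs an entry with the `GrainId` it is stored at.
fn dump_commit_log<M: file_manager::FileManager>(
    db: &Database<M>,
) -> sediment::Result<()> {
    let mut entry = db.commit_log_head()?;
    while let Some(stored) = entry {
        println!(
            "tx {:?} at {:?}: {} new, {} archived, {} freed",
            stored.transaction_id,
            stored.grain_id,
            stored.new_grains.len(),
            stored.archived_grains.len(),
            stored.freed_grains.len(),
        );
        // next_entry returns None once it reaches entries that have already
        // been checkpointed out of the database.
        entry = stored.next_entry(db)?;
    }
    Ok(())
}
```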
The process of cleaning up the Sediment database is called checkpointing, and
it is driven by the `Checkpointer`. The `IndexMetadata` has two fields related
to checkpointing: `checkpointed_to` and `checkpoint_target`.

Sediment is built around atomic operations. Because data cannot be fully freed
until it has been checkpointed into primary storage, we first commit an update
to the `checkpoint_target` value. Once the update to `checkpoint_target` has
been persisted by the `WalManager` during a WAL checkpoint, the `Checkpointer`
is notified of the updated target.

The `Checkpointer` scans the database for all `CommitLogEntry`s that have
`TransactionId`s between `checkpointed_to` and the updated `checkpoint_target`.
Any `archived_grains` in a matching `CommitLogEntry` are collected into a list
of grains to free, and each matching `CommitLogEntry`'s own `GrainId` is
collected for archiving.

Once all matching `CommitLogEntry`s have been found, a new transaction is
begun. The transaction writes the collected grain archive and free commands.
Additionally, the transaction updates the `checkpointed_to` value to the
`TransactionId` just checkpointed.

Once the transaction is committed, the `Checkpointer` waits for a new
`checkpoint_target`.

[okaywal]: https://github.com/khonsulabs/okaywal

--------------------------------------------------------------------------------
/src/transaction.rs:
--------------------------------------------------------------------------------
use std::collections::VecDeque;
use std::sync::{Arc, Condvar, Mutex};

use crc32c::crc32c;
use okaywal::{file_manager, EntryWriter, LogPosition};

use crate::atlas::IndexMetadata;
use crate::commit_log::{CommitLogEntry, NewGrain};
use crate::format::{GrainId, TransactionId};
use crate::util::usize_to_u32;
use crate::wal::WalChunk;
use crate::{Database, Error, Result};

#[derive(Debug)]
pub struct Transaction<'db, FileManager>
where
    FileManager: file_manager::FileManager,
{
    database: &'db Database<FileManager>,
    entry: Option<EntryWriter<'db, FileManager>>,
    guard: Option<TransactionGuard>,
    state: Option<CommittingTransaction>,
}

impl<'db, FileManager> Transaction<'db, FileManager>
where
    FileManager: file_manager::FileManager,
{
    pub(super) fn new(
        database: &'db Database<FileManager>,
        entry: EntryWriter<'db, FileManager>,
        guard: TransactionGuard,
    ) -> Result<Self> {
        let metadata = guard.current_index_metadata();
        Ok(Self {
            database,
            state: Some(CommittingTransaction {
                metadata,
                written_grains: Vec::new(),
                log_entry: CommitLogEntry::new(
                    TransactionId::from(entry.id()),
                    metadata.commit_log_head,
                    metadata.embedded_header_data,
                    metadata.checkpoint_target,
                    metadata.checkpointed_to,
                ),
            }),

            entry: Some(entry),
            guard: Some(guard),
        })
    }

    pub fn write(&mut self, data: &[u8]) -> Result<GrainId> {
        let data_length = usize_to_u32(data.len())?;
        let grain_id = self.database.data.atlas.reserve(data_length)?;

        let entry = self.entry.as_mut().expect("entry missing");
        let mut chunk = entry.begin_chunk(WalChunk::new_grain_length(data_length))?;
        WalChunk::write_new_grain(grain_id, data, &mut chunk)?;
        let record = chunk.finish()?;

        let state = self.state.as_mut().expect("state missing");
        state.written_grains.push((grain_id, record.position));
        state.log_entry.new_grains.push(NewGrain {
            id: grain_id,
            crc32: crc32c(data),
        });
        Ok(grain_id)
    }

    pub fn archive(&mut self, grain: GrainId) -> Result<()> {
        self.database.data.atlas.check_grain_validity(grain)?;

        let entry =
self.entry.as_mut().expect("entry missing"); 76 | let mut chunk = entry.begin_chunk(WalChunk::COMMAND_LENGTH)?; 77 | WalChunk::write_archive_grain(grain, &mut chunk)?; 78 | chunk.finish()?; 79 | 80 | let state = self.state.as_mut().expect("state missing"); 81 | state.log_entry.archived_grains.push(grain); 82 | 83 | Ok(()) 84 | } 85 | 86 | pub(crate) fn free_grains(&mut self, grains: &[GrainId]) -> Result<()> { 87 | let entry = self.entry.as_mut().expect("entry missing"); 88 | for grain in grains { 89 | let mut chunk = entry.begin_chunk(WalChunk::COMMAND_LENGTH)?; 90 | WalChunk::write_free_grain(*grain, &mut chunk)?; 91 | chunk.finish()?; 92 | } 93 | 94 | let state = self.state.as_mut().expect("state missing"); 95 | state.log_entry.freed_grains.extend(grains.iter().copied()); 96 | 97 | Ok(()) 98 | } 99 | 100 | #[allow(clippy::drop_ref)] 101 | pub fn set_embedded_header(&mut self, new_header: Option) -> Result<()> { 102 | let entry = self.entry.as_mut().expect("entry missing"); 103 | let mut chunk = entry.begin_chunk(WalChunk::COMMAND_LENGTH)?; 104 | WalChunk::write_embedded_header_update(new_header, &mut chunk)?; 105 | chunk.finish()?; 106 | 107 | let mut state = self.state.as_mut().expect("state missing"); 108 | if let Some(old_header) = state.log_entry.embedded_header_data { 109 | drop(state); 110 | self.archive(old_header)?; 111 | state = self.state.as_mut().expect("state missing"); 112 | } 113 | 114 | state.metadata.embedded_header_data = new_header; 115 | state.log_entry.embedded_header_data = new_header; 116 | 117 | Ok(()) 118 | } 119 | 120 | pub fn checkpoint_to(&mut self, tx_id: TransactionId) -> Result<()> { 121 | let entry = self.entry.as_mut().expect("entry missing"); 122 | let mut state = self.state.as_mut().expect("state missing"); 123 | if tx_id <= state.log_entry.checkpoint_target { 124 | // already the checkpoint target 125 | return Ok(()); 126 | } else if tx_id >= entry.id() { 127 | return Err(Error::InvalidTransactionId); 128 | } 129 | 130 | let mut chunk = entry.begin_chunk(WalChunk::COMMAND_LENGTH)?; 131 | WalChunk::write_checkpoint_to(tx_id, &mut chunk)?; 132 | chunk.finish()?; 133 | 134 | state.log_entry.checkpoint_target = tx_id; 135 | 136 | Ok(()) 137 | } 138 | 139 | pub(crate) fn checkpointed_to(&mut self, tx_id: TransactionId) -> Result<()> { 140 | let entry = self.entry.as_mut().expect("entry missing"); 141 | let mut state = self.state.as_mut().expect("state missing"); 142 | if tx_id <= state.log_entry.checkpointed_to { 143 | // already the checkpoint target 144 | return Ok(()); 145 | } else if tx_id >= entry.id() { 146 | return Err(Error::InvalidTransactionId); 147 | } 148 | 149 | let mut chunk = entry.begin_chunk(WalChunk::COMMAND_LENGTH)?; 150 | WalChunk::write_checkpointed_to(tx_id, &mut chunk)?; 151 | chunk.finish()?; 152 | 153 | state.log_entry.checkpointed_to = tx_id; 154 | 155 | Ok(()) 156 | } 157 | 158 | #[allow(clippy::drop_ref)] 159 | pub fn commit(mut self) -> Result { 160 | let state = self.state.as_mut().expect("state missing"); 161 | // Write the commit log entry 162 | state.log_entry.freed_grains.sort_unstable(); 163 | let mut log_entry_bytes = Vec::new(); 164 | state.log_entry.serialize_to(&mut log_entry_bytes)?; 165 | drop(state); 166 | let new_commit_log_head = self.write(&log_entry_bytes)?; 167 | 168 | let mut state = self.state.take().expect("state missing"); 169 | state.metadata.commit_log_head = Some(new_commit_log_head); 170 | // Because we end up caching the log_entry, we need it to match what's 171 | // on disk. 
What we just stored did not contain the newly written commit 172 | // log head grain. We need to remove it from the entry. 173 | state.log_entry.new_grains.pop(); 174 | 175 | // Write the transaction tail 176 | let mut entry = self.entry.take().expect("entry missing"); 177 | let mut chunk = entry.begin_chunk(WalChunk::COMMAND_LENGTH)?; 178 | WalChunk::write_transaction_tail(new_commit_log_head, &mut chunk)?; 179 | chunk.finish()?; 180 | 181 | let guard = self.guard.take().expect("tx guard missing"); 182 | 183 | let transaction_id = state.log_entry.transaction_id; 184 | let finalizer = guard.stage(state, self.database); 185 | 186 | entry.commit()?; 187 | 188 | finalizer.finalize()?; 189 | 190 | Ok(transaction_id) 191 | } 192 | 193 | pub fn rollback(mut self) -> Result<()> { 194 | self.rollback_transaction() 195 | } 196 | 197 | fn rollback_transaction(&mut self) -> Result<()> { 198 | let mut state = self.state.take().expect("state missing"); 199 | let entry = self.entry.take().expect("entry missing"); 200 | 201 | let result = entry.rollback(); 202 | 203 | self.database 204 | .data 205 | .atlas 206 | .rollback_grains(state.written_grains.drain(..).map(|(g, _)| g))?; 207 | 208 | result?; 209 | 210 | Ok(()) 211 | } 212 | } 213 | 214 | impl<'db, FileManager> Drop for Transaction<'db, FileManager> 215 | where 216 | FileManager: file_manager::FileManager, 217 | { 218 | fn drop(&mut self) { 219 | if self.entry.is_some() { 220 | self.rollback_transaction() 221 | .expect("error rolling back transaction"); 222 | } 223 | } 224 | } 225 | 226 | #[derive(Debug, Clone)] 227 | pub struct TransactionLock { 228 | data: Arc, 229 | } 230 | 231 | impl TransactionLock { 232 | pub fn new(initial_metadata: IndexMetadata) -> Self { 233 | Self { 234 | data: Arc::new(TransactionLockData { 235 | tx_lock: Mutex::new(TransactionState::new(initial_metadata)), 236 | tx_sync: Condvar::new(), 237 | }), 238 | } 239 | } 240 | 241 | pub(super) fn lock(&self) -> TransactionGuard { 242 | let mut state = self.data.tx_lock.lock().expect("can't panick"); 243 | 244 | // Wait for the locked status to be relinquished 245 | while state.in_transaction { 246 | state = self.data.tx_sync.wait(state).expect("can't panick"); 247 | } 248 | 249 | // Acquire the locked status 250 | state.in_transaction = true; 251 | 252 | // Return the guard 253 | TransactionGuard { lock: self.clone() } 254 | } 255 | } 256 | 257 | #[derive(Debug)] 258 | struct TransactionLockData { 259 | tx_lock: Mutex, 260 | tx_sync: Condvar, 261 | } 262 | 263 | /// Ensures only one thread has access to begin a transaction at any given time. 264 | /// 265 | /// This guard ensures that no two threads try to update some of the in-memory 266 | /// state at the same time. The Write-Ahead Log always ensures only one thread 267 | /// can write to it already, but we need extra guarantees because we don't want 268 | /// to publish some state until after the WAL has confirmed its commit. 
269 | #[derive(Debug)] 270 | pub(super) struct TransactionGuard { 271 | lock: TransactionLock, 272 | } 273 | 274 | impl TransactionGuard { 275 | pub fn current_index_metadata(&self) -> IndexMetadata { 276 | let state = self.lock.data.tx_lock.lock().expect("cannot panic"); 277 | state.metadata 278 | } 279 | 280 | pub(super) fn stage( 281 | self, 282 | tx: CommittingTransaction, 283 | db: &'_ Database, 284 | ) -> TransactionFinalizer<'_, FileManager> 285 | where 286 | FileManager: file_manager::FileManager, 287 | { 288 | let id = tx.log_entry.transaction_id; 289 | let mut state = self.lock.data.tx_lock.lock().expect("cannot panic"); 290 | state.metadata = tx.metadata; 291 | state.committing_transactions.push_back(tx); 292 | 293 | TransactionFinalizer { 294 | db, 295 | lock: self.lock.clone(), 296 | id, 297 | } 298 | } 299 | } 300 | 301 | impl Drop for TransactionGuard { 302 | fn drop(&mut self) { 303 | // Reset the locked status 304 | let mut state = self.lock.data.tx_lock.lock().expect("can't panick"); 305 | state.in_transaction = false; 306 | drop(state); 307 | 308 | // Notify the next waiter. 309 | self.lock.data.tx_sync.notify_one(); 310 | } 311 | } 312 | 313 | #[derive(Debug)] 314 | struct TransactionState { 315 | in_transaction: bool, 316 | metadata: IndexMetadata, 317 | committing_transactions: VecDeque, 318 | } 319 | 320 | impl TransactionState { 321 | pub fn new(initial_metadata: IndexMetadata) -> Self { 322 | Self { 323 | in_transaction: false, 324 | metadata: initial_metadata, 325 | committing_transactions: VecDeque::new(), 326 | } 327 | } 328 | } 329 | 330 | #[derive(Debug)] 331 | pub(super) struct CommittingTransaction { 332 | metadata: IndexMetadata, 333 | written_grains: Vec<(GrainId, LogPosition)>, 334 | log_entry: CommitLogEntry, 335 | } 336 | 337 | #[derive(Debug)] 338 | pub(super) struct TransactionFinalizer<'a, FileManager> 339 | where 340 | FileManager: file_manager::FileManager, 341 | { 342 | db: &'a Database, 343 | lock: TransactionLock, 344 | id: TransactionId, 345 | } 346 | 347 | impl<'a, FileManager> TransactionFinalizer<'a, FileManager> 348 | where 349 | FileManager: file_manager::FileManager, 350 | { 351 | pub fn finalize(self) -> Result<()> { 352 | let mut state = self.lock.data.tx_lock.lock().expect("can't panic"); 353 | 354 | while state 355 | .committing_transactions 356 | .front() 357 | .map_or(false, |tx| tx.log_entry.transaction_id <= self.id) 358 | { 359 | let mut tx_to_commit = state 360 | .committing_transactions 361 | .pop_front() 362 | .expect("just checked"); 363 | self.db.data.atlas.note_transaction_committed( 364 | tx_to_commit.metadata, 365 | tx_to_commit.written_grains.drain(..), 366 | &tx_to_commit.log_entry.freed_grains, 367 | false, 368 | )?; 369 | self.db.data.commit_logs.cache( 370 | tx_to_commit 371 | .metadata 372 | .commit_log_head 373 | .expect("commit log must be present"), 374 | Arc::new(tx_to_commit.log_entry), 375 | )?; 376 | } 377 | 378 | Ok(()) 379 | } 380 | } 381 | -------------------------------------------------------------------------------- /src/allocations.rs: -------------------------------------------------------------------------------- 1 | use crate::format::{ 2 | GrainAllocationInfo, GrainAllocationStatus, GrainIndex, LocalGrainId, StratumHeader, 3 | }; 4 | 5 | #[derive(Debug)] 6 | pub struct FreeLocations { 7 | free_locations: Vec, 8 | grains_free: u16, 9 | } 10 | 11 | impl Default for FreeLocations { 12 | fn default() -> Self { 13 | Self { 14 | free_locations: vec![LocationInfo { 15 | offset: 
GrainIndex::new(0).expect("0 is valid"), 16 | length: 16_372, 17 | }], 18 | grains_free: 16_372, 19 | } 20 | } 21 | } 22 | 23 | impl FreeLocations { 24 | pub fn from_stratum(stratum: &StratumHeader) -> Self { 25 | let mut free_locations = Vec::new(); 26 | 27 | let mut index = 0; 28 | while index < 16_372 { 29 | let index_status = stratum.grain_info(index); 30 | let count = index_status.count(); 31 | let free = matches!( 32 | index_status.status().expect("invalid header"), 33 | GrainAllocationStatus::Free 34 | ); 35 | 36 | if free { 37 | // See how many free grains in a row we have. 38 | let free_until = if let Some(next_allocated_index) = stratum.grains[index + 1..] 39 | .iter() 40 | .enumerate() 41 | .find_map(|(index, info)| { 42 | matches!( 43 | GrainAllocationInfo(*info).status().expect("invalid header"), 44 | GrainAllocationStatus::Allocated | GrainAllocationStatus::Archived 45 | ) 46 | .then_some(index) 47 | }) { 48 | next_allocated_index + index + 1 49 | } else { 50 | 16_372 51 | }; 52 | 53 | insert_location_info( 54 | &mut free_locations, 55 | LocationInfo { 56 | offset: GrainIndex::new(index as u16).expect("valid index"), 57 | length: (free_until - index) as u16, 58 | }, 59 | ); 60 | 61 | index = free_until; 62 | } else { 63 | index += usize::from(count); 64 | } 65 | } 66 | 67 | let grains_free = free_locations.iter().map(|loc| loc.length).sum::(); 68 | Self { 69 | free_locations, 70 | grains_free, 71 | } 72 | } 73 | 74 | pub fn allocate(&mut self, number_of_grains: u8) -> Option { 75 | let number_of_grains_u16 = u16::from(number_of_grains); 76 | if number_of_grains_u16 <= self.grains_free { 77 | for (index, location) in self.free_locations.iter().enumerate() { 78 | if location.length >= number_of_grains_u16 { 79 | self.grains_free -= number_of_grains_u16; 80 | // This position can be split. If the new length will cause the 81 | // location to shift position, we'll remove and reinsert it. 
82 | let new_length = location.length - number_of_grains_u16; 83 | let grain_index = location.offset; 84 | 85 | if new_length == 0 { 86 | self.free_locations.remove(index); 87 | } else { 88 | self.free_locations[index].offset += number_of_grains; 89 | self.free_locations[index].length = new_length; 90 | 91 | self.check_if_location_needs_to_move(index); 92 | } 93 | 94 | return Some( 95 | LocalGrainId::from_parts(grain_index, number_of_grains) 96 | .expect("invalid grain count"), 97 | ); 98 | } 99 | } 100 | } 101 | 102 | None 103 | } 104 | 105 | fn check_if_location_needs_to_move(&mut self, index: usize) { 106 | let info = &self.free_locations[index]; 107 | if index > 0 && self.free_locations[index - 1].length > info.length { 108 | // Needs to move below 109 | if index == 1 || (index > 1 && self.free_locations[index - 2].length < info.length) { 110 | // A simple swap of index and index - 1 is enough 111 | self.free_locations.swap(index - 1, index); 112 | } else { 113 | // We need to move this row more than a single location 114 | let info = self.free_locations.remove(index); 115 | insert_location_info(&mut self.free_locations, info); 116 | } 117 | } else if index + 1 < self.free_locations.len() 118 | && self.free_locations[index + 1].length < info.length 119 | { 120 | // Needs to move above 121 | if index + 2 == self.free_locations.len() 122 | || (index + 2 < self.free_locations.len() 123 | && self.free_locations[index + 2].length > info.length) 124 | { 125 | // A simple swap of index and index + 1 is enough 126 | self.free_locations.swap(index, index + 1); 127 | } else { 128 | // We need to move this row more than a single location 129 | let info = self.free_locations.remove(index); 130 | insert_location_info(&mut self.free_locations, info); 131 | } 132 | } 133 | } 134 | 135 | #[must_use] 136 | pub fn allocate_grain(&mut self, grain_id: LocalGrainId) -> bool { 137 | let start = grain_id.grain_index(); 138 | let count = u16::from(grain_id.grain_count()); 139 | let end = start.as_u16() + count; 140 | 141 | for (index, info) in self.free_locations.iter_mut().enumerate() { 142 | let location_end = info.offset.as_u16() + info.length; 143 | if location_end >= start.as_u16() && info.offset.as_u16() <= end { 144 | if info.offset == start { 145 | // Allocate from the start 146 | info.offset = GrainIndex::new(end).expect("valid index"); 147 | info.length -= count; 148 | if info.length > 0 { 149 | self.check_if_location_needs_to_move(index); 150 | } else { 151 | // The location is now empty. 152 | self.free_locations.remove(index); 153 | } 154 | } else if location_end == end { 155 | // Allocate from the end 156 | info.length -= count; 157 | 158 | // We can assume a non-zero length because otherwise 159 | // info.offset would have been equal to start. 160 | self.check_if_location_needs_to_move(index); 161 | } else { 162 | // Split this into two 163 | let remaining_start_grains = start.as_u16() - info.offset.as_u16(); 164 | let remaining_end_grains = location_end - end; 165 | 166 | info.length = remaining_start_grains; 167 | self.check_if_location_needs_to_move(index); 168 | 169 | // Add the new region for the tail end of the split. 
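// insert_location_info keeps `free_locations` sorted by length, so
// allocate's front-to-back scan remains a best-fit search after this
// split.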
170 | insert_location_info( 171 | &mut self.free_locations, 172 | LocationInfo { 173 | offset: GrainIndex::new(end).expect("valid index"), 174 | length: remaining_end_grains, 175 | }, 176 | ); 177 | } 178 | self.grains_free -= count; 179 | return true; 180 | } 181 | } 182 | 183 | false 184 | } 185 | 186 | pub fn free_grain(&mut self, grain_id: LocalGrainId) { 187 | let start = grain_id.grain_index(); 188 | let count = u16::from(grain_id.grain_count()); 189 | let end = start.as_u16() + count; 190 | 191 | self.grains_free += count; 192 | 193 | // First, attempt to find a grain whose end matches the start or whose 194 | // start matches the end. 195 | for (index, info) in self.free_locations.iter_mut().enumerate() { 196 | let location_end = info.offset.as_u16() + info.length; 197 | // Check for any overlap of the two regions 198 | if info.offset.as_u16() <= end && location_end >= start.as_u16() { 199 | // If we don't match the end or start, this is an invalid 200 | // operation. 201 | if info.offset.as_u16() == end { 202 | // Prepend the free grains 203 | info.offset = start; 204 | info.length += count; 205 | self.scan_for_merge_with_start(index, start.as_u16()); 206 | } else if location_end == start.as_u16() { 207 | // Append the free grains 208 | info.length += count; 209 | self.scan_for_merge_with_end(index, end); 210 | } else { 211 | unreachable!("invalid free operation: grain was already partially free") 212 | } 213 | 214 | return; 215 | } 216 | } 217 | 218 | // If we couldn't find an existing location to latch onto, we need to 219 | // insert it. 220 | insert_location_info( 221 | &mut self.free_locations, 222 | LocationInfo { 223 | offset: start, 224 | length: count, 225 | }, 226 | ) 227 | } 228 | 229 | fn scan_for_merge_with_start(&mut self, possible_merge_index: usize, start: u16) { 230 | for index in possible_merge_index + 1..self.free_locations.len() { 231 | let info = &self.free_locations[index]; 232 | 233 | if info.offset.as_u16() + info.length == start { 234 | let new_start = info.offset; 235 | let count = info.length; 236 | self.free_locations.remove(index); 237 | self.free_locations[possible_merge_index].offset = new_start; 238 | self.free_locations[possible_merge_index].length += count; 239 | return; 240 | } 241 | } 242 | } 243 | 244 | fn scan_for_merge_with_end(&mut self, possible_merge_index: usize, end: u16) { 245 | for index in possible_merge_index + 1..self.free_locations.len() { 246 | let info = &self.free_locations[index]; 247 | 248 | if info.offset.as_u16() == end { 249 | let count = info.length; 250 | self.free_locations.remove(index); 251 | self.free_locations[possible_merge_index].length += count; 252 | return; 253 | } 254 | } 255 | } 256 | 257 | pub const fn is_full(&self) -> bool { 258 | self.grains_free == 0 259 | } 260 | } 261 | 262 | fn insert_location_info(free_locations: &mut Vec<LocationInfo>, info: LocationInfo) { 263 | let insert_at = free_locations 264 | .binary_search_by(|loc| loc.length.cmp(&info.length)) 265 | .unwrap_or_else(|insert_at| insert_at); 266 | free_locations.insert(insert_at, info); 267 | } 268 | 269 | #[derive(Debug)] 270 | struct LocationInfo { 271 | offset: GrainIndex, 272 | length: u16, 273 | } 274 | 275 | #[test] 276 | fn basics() { 277 | let mut allocator = FreeLocations::default(); 278 | let first = allocator.allocate(16).expect("failed to allocate"); 279 | println!("Allocated {first}: {allocator:?}"); 280 | assert_eq!(allocator.free_locations.len(), 1); 281 | let second = allocator.allocate(16).expect("failed to allocate"); 282 | println!("Allocated 
{second}: {allocator:?}"); 283 | assert_eq!(allocator.free_locations.len(), 1); 284 | allocator.free_grain(first); 285 | println!("Freed {first}: {allocator:?}"); 286 | assert_eq!(allocator.free_locations.len(), 2); 287 | let reused_a = allocator.allocate(8).expect("failed to allocate"); 288 | println!("Allocated {reused_a}: {allocator:?}"); 289 | assert_eq!(allocator.free_locations.len(), 2); 290 | let reused_b = allocator.allocate(8).expect("failed to allocate"); 291 | println!("Allocated {reused_b}: {allocator:?}"); 292 | assert_eq!(allocator.free_locations.len(), 1); 293 | assert_eq!(allocator.grains_free, 16_372 - 32); 294 | 295 | // Free the grains in order such that the second free ends up joining the 296 | // two existing nodes into one. 297 | allocator.free_grain(reused_b); 298 | println!("Freed {reused_b}: {allocator:?}"); 299 | assert_eq!(allocator.free_locations.len(), 2); 300 | allocator.free_grain(second); 301 | println!("Freed {second}: {allocator:?}"); 302 | assert_eq!(allocator.free_locations.len(), 1); 303 | allocator.free_grain(reused_a); 304 | println!("Freed {reused_a}: {allocator:?}"); 305 | assert_eq!(allocator.grains_free, 16_372); 306 | 307 | // Re-allocate using these grain ids 308 | assert!(allocator.allocate_grain(second)); 309 | println!("Allocated {second}: {allocator:?}"); 310 | assert_eq!(allocator.free_locations.len(), 2); 311 | assert!(allocator.allocate_grain(reused_a)); 312 | println!("Allocated {reused_a}: {allocator:?}"); 313 | assert_eq!(allocator.free_locations.len(), 2); 314 | assert!(allocator.allocate_grain(reused_b)); 315 | println!("Allocated {reused_b}: {allocator:?}"); 316 | assert_eq!(allocator.free_locations.len(), 1); 317 | assert_eq!(allocator.grains_free, 16_372 - 32); 318 | 319 | // Free in a different order this time. 320 | allocator.free_grain(reused_a); 321 | println!("Freed {reused_a}: {allocator:?}"); 322 | assert_eq!(allocator.free_locations.len(), 2); 323 | allocator.free_grain(second); 324 | println!("Freed {second}: {allocator:?}"); 325 | assert_eq!(allocator.free_locations.len(), 2); 326 | allocator.free_grain(reused_b); 327 | println!("Freed {reused_b}: {allocator:?}"); 328 | assert_eq!(allocator.free_locations.len(), 1); 329 | assert_eq!(allocator.grains_free, 16_372); 330 | } 331 | -------------------------------------------------------------------------------- /benchmarks/benches/inserts.rs: -------------------------------------------------------------------------------- 1 | use std::fmt::Debug; 2 | use std::num::NonZeroUsize; 3 | use std::ops::Range; 4 | use std::path::Path; 5 | use std::sync::Arc; 6 | 7 | use rand::prelude::StdRng; 8 | use rand::{Rng, SeedableRng}; 9 | use sediment::format::TransactionId; 10 | use sediment::Database; 11 | use timings::{Benchmark, BenchmarkImplementation, Label, LabeledTimings, Timings}; 12 | 13 | const ITERS: u128 = 100; 14 | const INSERTS_PER_BATCH: usize = 20; 15 | 16 | fn main() { 17 | #[cfg(any(target_os = "macos", target_os = "ios"))] 18 | { 19 | if cfg!(feature = "sqlite") && !cfg!(feature = "fbarrier-fsync") { 20 | eprintln!("SQLite bundled in macOS uses F_BARRIERFSYNC instead of F_FULLFSYNC, which means it does not provide ACID guarantees. Enable feature `fbarrier-fsync` to configure Sediment to use the same synchronization primitive. See for more information."); 21 | } 22 | 23 | if cfg!(feature = "rocksdb") { 24 | if cfg!(feature = "fbarrier-fsync") { 25 | eprintln!("RocksDB prior to 7.3.1 only utilizes fdatasync. As of writing this, RocksDB does not support F_BARRIERFSYNC. 
The current version used by the rocksdb crate is 7.1.2."); 26 | eprintln!("rocksdb crate's built version: "); 27 | eprintln!("ACID on Apple: "); 28 | } else { 29 | eprintln!("RocksDB does not use F_FULLFSYNC until version 7.3.1. The current version used by the rocksdb crate is 7.1.2."); 30 | eprintln!("rocksdb crate's built version: "); 31 | } 32 | } 33 | } 34 | 35 | let measurements = Timings::default(); 36 | 37 | let source = vec![0; 4096]; 38 | let mut ranges = Vec::new(); 39 | let mut rng = StdRng::from_seed([0; 32]); 40 | for _ in 0..ITERS { 41 | let mut batch = Vec::with_capacity(rng.gen_range(1..INSERTS_PER_BATCH)); 42 | for _ in 0..batch.capacity() { 43 | let start = rng.gen_range(0..source.len()); 44 | let end = rng.gen_range(start..source.len()); 45 | batch.push(start..end); 46 | } 47 | ranges.push(batch); 48 | } 49 | 50 | let threads = std::thread::available_parallelism() 51 | .map(NonZeroUsize::get) 52 | .unwrap_or(4) 53 | .max(4); 54 | 55 | let mut benchmark = Benchmark::for_config(Arc::new(ThreadedInsertsData { source, ranges })) 56 | .with_each_number_of_threads([threads * 4, threads * 2, threads, 1]); 57 | 58 | #[cfg(feature = "sqlite")] 59 | { 60 | benchmark = benchmark.with::(); 61 | } 62 | // #[cfg(feature = "rocksdb")] 63 | // { 64 | // benchmark = benchmark.with::(); 65 | // } 66 | 67 | benchmark = benchmark.with::(); 68 | 69 | benchmark.run(&measurements).unwrap(); 70 | // return; 71 | 72 | measure_sediment(&measurements); 73 | #[cfg(feature = "marble")] 74 | marble::measure(&measurements); 75 | #[cfg(feature = "sqlite")] 76 | measure_sqlite(&measurements); 77 | // #[cfg(feature = "rocksdb")] 78 | // self::rocksdb::measure(&measurements); 79 | 80 | let stats = measurements.wait_for_stats(); 81 | timings::print_table_summaries(&stats).unwrap(); 82 | } 83 | 84 | fn measure_sediment(measurements: &Timings) { 85 | let path = Path::new(".bench-suite.sediment"); 86 | if path.exists() { 87 | std::fs::remove_dir_all(path).unwrap(); 88 | } 89 | 90 | let sediment = Database::recover(path).unwrap(); 91 | let mut checkpoint_to = TransactionId::default(); 92 | for i in 0_u128..ITERS { 93 | let measurement = measurements.begin("sediment", String::from("insert 16b")); 94 | let mut session = sediment.begin_transaction().unwrap(); 95 | session.write(&i.to_le_bytes()).unwrap(); 96 | session.checkpoint_to(checkpoint_to).unwrap(); 97 | checkpoint_to = session.commit().unwrap(); 98 | measurement.finish(); 99 | } 100 | 101 | sediment.shutdown().unwrap(); 102 | std::fs::remove_dir_all(path).unwrap(); 103 | } 104 | 105 | #[cfg(feature = "sqlite")] 106 | fn measure_sqlite(measurements: &Timings) { 107 | let path = Path::new("./bench-suite.sqlite"); 108 | if path.exists() { 109 | std::fs::remove_file(path).unwrap(); 110 | } 111 | let mut sqlite = initialize_sqlite(path); 112 | 113 | for i in 0_u128..ITERS { 114 | let measurement = measurements.begin("sqlite", String::from("insert 16b")); 115 | let tx = sqlite.transaction().unwrap(); 116 | tx.execute("insert into blobs (value) values ($1)", [&i.to_le_bytes()]) 117 | .unwrap(); 118 | tx.commit().unwrap(); 119 | measurement.finish(); 120 | } 121 | drop(sqlite); 122 | 123 | std::fs::remove_file(path).unwrap(); 124 | } 125 | 126 | #[cfg(feature = "sqlite")] 127 | fn initialize_sqlite(path: &Path) -> rusqlite::Connection { 128 | let sqlite = rusqlite::Connection::open(path).unwrap(); 129 | sqlite 130 | .busy_timeout(std::time::Duration::from_secs(3600)) 131 | .unwrap(); 132 | 133 | #[cfg(any(target_os = "macos", target_os = "ios"))] 134 | { 135 | // 
On macOS with built-in SQLite versions, despite the name and the SQLite 136 | // documentation, this pragma makes SQLite use `fcntl(_, F_BARRIER_FSYNC, 137 | // _)`. There's not a good practical way to make rusqlite's access of SQLite 138 | // on macOS to use `F_FULLFSYNC`, which skews benchmarks heavily in favor of 139 | // SQLite when not enabling this feature. 140 | // 141 | // Enabling this feature reduces the durability guarantees, which breaks 142 | // ACID compliance. Unless performance is critical on macOS or you know that 143 | // ACID compliance is not important for your application, this feature 144 | // should be left disabled. 145 | // 146 | // 147 | // 148 | sqlite.pragma_update(None, "fullfsync", "on").unwrap(); 149 | } 150 | 151 | sqlite 152 | .execute("create table if not exists blobs (value BLOB)", []) 153 | .unwrap(); 154 | sqlite 155 | } 156 | 157 | #[cfg(feature = "marble")] 158 | mod marble { 159 | use super::*; 160 | 161 | pub fn measure(measurements: &Timings) { 162 | let path = Path::new("./bench-suite.marble"); 163 | if path.exists() { 164 | std::fs::remove_dir_all(path).unwrap(); 165 | } 166 | let marble = ::marble::open(path).unwrap(); 167 | 168 | for i in 0_u128..ITERS { 169 | let measurement = measurements.begin("marble", String::from("insert 16b")); 170 | marble 171 | .write_batch([(i as u64 + 1, Some(i.to_le_bytes()))]) 172 | .unwrap(); 173 | marble.maintenance().unwrap(); 174 | measurement.finish(); 175 | } 176 | 177 | drop(marble); 178 | std::fs::remove_dir_all(path).unwrap(); 179 | } 180 | } 181 | 182 | pub struct ThreadedInsertsData { 183 | source: Vec, 184 | ranges: Vec>>, 185 | } 186 | 187 | impl Debug for ThreadedInsertsData { 188 | fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { 189 | let batches = self.ranges.len(); 190 | let total_inserts = self.ranges.iter().map(|batch| batch.len()).sum::(); 191 | f.debug_struct("ThreadedInsertsData") 192 | .field("batches", &batches) 193 | .field("total_inserts", &total_inserts) 194 | .finish_non_exhaustive() 195 | } 196 | } 197 | 198 | #[derive(Debug, Clone)] 199 | pub struct SedimentThreadedInserts { 200 | db: Arc, 201 | number_of_threads: usize, 202 | data: Arc, 203 | } 204 | 205 | impl BenchmarkImplementation, ()> for SedimentThreadedInserts { 206 | type SharedConfig = Self; 207 | 208 | fn initialize_shared_config( 209 | number_of_threads: usize, 210 | config: &Arc, 211 | ) -> Result { 212 | Ok(Self { 213 | db: Arc::new(Database::recover(".threaded-inserts.sediment").unwrap()), 214 | number_of_threads, 215 | data: config.clone(), 216 | }) 217 | } 218 | 219 | fn reset(shutting_down: bool) -> Result<(), ()> { 220 | if shutting_down { 221 | let path = Path::new(".threaded-inserts.sediment"); 222 | if path.exists() { 223 | println!("Cleaning up."); 224 | std::fs::remove_dir_all(path).unwrap(); 225 | } 226 | } 227 | Ok(()) 228 | } 229 | 230 | fn initialize(_number_of_threads: usize, config: Self) -> Result { 231 | Ok(config) 232 | } 233 | 234 | fn measure(&mut self, measurements: &LabeledTimings) -> Result<(), ()> { 235 | let mut checkpoint_to = TransactionId::default(); 236 | for batch in &self.data.ranges { 237 | let measurement = 238 | measurements.begin(format!("{}-threads-inserts", self.number_of_threads)); 239 | let mut session = self.db.begin_transaction().unwrap(); 240 | for range in batch { 241 | session.write(&self.data.source[range.clone()]).unwrap(); 242 | } 243 | session.checkpoint_to(checkpoint_to).unwrap(); 244 | checkpoint_to = session.commit().unwrap(); 245 | 
//session.commit().unwrap(); 246 | measurement.finish(); 247 | } 248 | 249 | // dbg!(self.db.statistics()); 250 | 251 | Ok(()) 252 | } 253 | 254 | fn label(_number_of_threads: usize, _config: &Arc) -> Label { 255 | Label::from("sediment") 256 | } 257 | } 258 | 259 | impl Drop for SedimentThreadedInserts { 260 | fn drop(&mut self) { 261 | if Arc::strong_count(&self.db) == 1 { 262 | // This is the last instance of this database, shut it down before 263 | // the benchmark invokes reset. Otherwise, the checkpointer or the 264 | // wal could encounter an error after the files are deleted out from 265 | // underneath it. 266 | self.db.as_ref().clone().shutdown().unwrap(); 267 | } 268 | } 269 | } 270 | 271 | #[cfg(feature = "sqlite")] 272 | #[derive(Debug)] 273 | pub struct SqliteThreadedInserts { 274 | number_of_threads: usize, 275 | data: Arc, 276 | } 277 | 278 | #[cfg(feature = "sqlite")] 279 | impl BenchmarkImplementation, ()> for SqliteThreadedInserts { 280 | type SharedConfig = Arc; 281 | 282 | fn initialize_shared_config( 283 | _number_of_threads: usize, 284 | config: &Arc, 285 | ) -> Result { 286 | Ok(config.clone()) 287 | } 288 | 289 | fn initialize(number_of_threads: usize, config: Arc) -> Result { 290 | Ok(Self { 291 | number_of_threads, 292 | data: config, 293 | }) 294 | } 295 | 296 | fn measure(&mut self, measurements: &LabeledTimings) -> Result<(), ()> { 297 | let path = Path::new(".threaded-inserts.sqlite3"); 298 | let mut db = initialize_sqlite(path); 299 | 300 | for batch in &self.data.ranges { 301 | let measurement = 302 | measurements.begin(format!("{}-threads-inserts", self.number_of_threads)); 303 | let tx = db.transaction().unwrap(); 304 | for range in batch { 305 | tx.execute( 306 | "insert into blobs (value) values ($1)", 307 | [&self.data.source[range.clone()]], 308 | ) 309 | .unwrap(); 310 | } 311 | tx.commit().unwrap(); 312 | measurement.finish(); 313 | } 314 | 315 | Ok(()) 316 | } 317 | 318 | fn reset(_shutting_down: bool) -> Result<(), ()> { 319 | let path = Path::new(".threaded-inserts.sqlite3"); 320 | if path.exists() { 321 | std::fs::remove_file(path).unwrap(); 322 | } 323 | Ok(()) 324 | } 325 | 326 | fn label(_number_of_threads: usize, _config: &Arc) -> Label { 327 | Label::from("sqlite") 328 | } 329 | } 330 | 331 | // #[cfg(feature = "rocksdb")] 332 | // mod rocksdb { 333 | // use std::path::Path; 334 | // use std::sync::atomic::{AtomicU64, Ordering}; 335 | // use std::sync::Arc; 336 | 337 | // use rocksdb::{DBWithThreadMode, MultiThreaded, WriteBatch, WriteOptions, DB}; 338 | // use timings::{BenchmarkImplementation, LabeledTimings, Timings}; 339 | 340 | // use super::ITERS; 341 | // use crate::ThreadedInsertsData; 342 | 343 | // pub fn measure(measurements: &Timings) { 344 | // let path = Path::new("./bench-suite.rocksdb"); 345 | // if path.exists() { 346 | // std::fs::remove_dir_all(path).unwrap(); 347 | // } 348 | // let db = DB::open_default(path).unwrap(); 349 | // let mut write_opts = WriteOptions::new(); 350 | // write_opts.set_sync(true); 351 | 352 | // for i in 0_u128..ITERS { 353 | // let measurement = measurements.begin("rocksdb", String::from("insert 16b")); 354 | 355 | // db.put_opt(i.to_be_bytes(), i.to_le_bytes(), &write_opts) 356 | // .unwrap(); 357 | // measurement.finish(); 358 | // } 359 | 360 | // drop(db); 361 | // std::fs::remove_dir_all(path).unwrap(); 362 | // } 363 | 364 | // pub struct ThreadedInserts { 365 | // number_of_threads: usize, 366 | // config: ThreadedInsertsConfig, 367 | // } 368 | 369 | // #[derive(Clone)] 370 | // pub 
struct ThreadedInsertsConfig { 371 | // db: Arc>, 372 | // unique_id_counter: Arc, 373 | // data: Arc, 374 | // } 375 | 376 | // impl BenchmarkImplementation, ()> for ThreadedInserts { 377 | // type SharedConfig = ThreadedInsertsConfig; 378 | 379 | // fn initialize_shared_config( 380 | // _number_of_threads: usize, 381 | // config: &Arc, 382 | // ) -> Result { 383 | // let path = Path::new("./.threaded-inserts.rocksdb"); 384 | // let db = DBWithThreadMode::::open_default(path).unwrap(); 385 | // Ok(ThreadedInsertsConfig { 386 | // db: Arc::new(db), 387 | // unique_id_counter: Arc::default(), 388 | // data: config.clone(), 389 | // }) 390 | // } 391 | 392 | // fn initialize(number_of_threads: usize, config: ThreadedInsertsConfig) -> Result { 393 | // Ok(Self { 394 | // number_of_threads, 395 | // config, 396 | // }) 397 | // } 398 | 399 | // #[allow(clippy::unnecessary_to_owned)] // TODO submit PR against rocksdb to allow ?Sized 400 | // fn measure(&mut self, measurements: &LabeledTimings) -> Result<(), ()> { 401 | // let mut write_opts = WriteOptions::new(); 402 | // write_opts.set_sync(true); 403 | // for batch in &self.config.data.ranges { 404 | // let measurement = 405 | // measurements.begin(format!("{}-threads-inserts", self.number_of_threads)); 406 | // let mut write_batch = WriteBatch::default(); 407 | 408 | // for range in batch { 409 | // let unique_id = self.config.unique_id_counter.fetch_add(1, Ordering::SeqCst); 410 | // write_batch.put( 411 | // unique_id.to_be_bytes(), 412 | // &self.config.data.source[range.clone()].to_vec(), 413 | // ); 414 | // } 415 | // self.config.db.write_opt(write_batch, &write_opts).unwrap(); 416 | // measurement.finish(); 417 | // } 418 | 419 | // Ok(()) 420 | // } 421 | 422 | // fn reset(shutting_down: bool) -> Result<(), ()> { 423 | // let path = Path::new("./.threaded-inserts.rocksdb"); 424 | // if path.exists() { 425 | // std::fs::remove_dir_all(path).unwrap(); 426 | // } 427 | // if !shutting_down { 428 | // std::fs::create_dir(path).unwrap(); 429 | // } 430 | // Ok(()) 431 | // } 432 | 433 | // fn label(_number_of_threads: usize, _config: &Arc) -> timings::Label { 434 | // "rocksdb".into() 435 | // } 436 | // } 437 | // } 438 | -------------------------------------------------------------------------------- /src/wal.rs: -------------------------------------------------------------------------------- 1 | use std::io::{self, BufWriter, Read, Seek, SeekFrom, Write}; 2 | use std::sync::{Arc, Weak}; 3 | 4 | use okaywal::file_manager::{self, File as _}; 5 | use okaywal::{EntryId, LogManager, ReadChunkResult, WriteAheadLog}; 6 | 7 | use crate::format::{ByteUtil, GrainAllocationInfo, GrainAllocationStatus, GrainId, TransactionId}; 8 | use crate::store::BasinState; 9 | use crate::util::{u32_to_usize, usize_to_u32}; 10 | use crate::{Data as DatabaseData, Database, Error, Result}; 11 | 12 | #[derive(Debug)] 13 | pub struct WalManager 14 | where 15 | FileManager: file_manager::FileManager, 16 | { 17 | db: Weak>, 18 | scratch: Vec, 19 | } 20 | 21 | impl WalManager 22 | where 23 | FileManager: file_manager::FileManager, 24 | { 25 | pub(super) fn new(database: &Arc>) -> Self { 26 | Self { 27 | db: Arc::downgrade(database), 28 | scratch: Vec::new(), 29 | } 30 | } 31 | } 32 | 33 | impl LogManager for WalManager 34 | where 35 | FileManager: file_manager::FileManager, 36 | { 37 | fn recover(&mut self, entry: &mut okaywal::Entry<'_, FileManager::File>) -> io::Result<()> { 38 | if let Some(database) = self.db.upgrade() { 39 | let mut written_grains = Vec::new(); 
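// Recovery replays every chunk in this WAL entry below, rebuilding the
// in-memory index metadata plus the written- and freed-grain lists
// before handing them to the atlas in one batch.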
40 | let mut freed_grains = Vec::new(); 41 | let mut index_metadata = database.atlas.current_index_metadata()?; 42 | loop { 43 | match entry.read_chunk()? { 44 | ReadChunkResult::Chunk(mut chunk) => { 45 | let position = chunk.log_position(); 46 | self.scratch 47 | .resize(u32_to_usize(chunk.bytes_remaining())?, 0); 48 | chunk.read_exact(&mut self.scratch)?; 49 | 50 | let chunk = WalChunk::read(&self.scratch)?; 51 | match chunk { 52 | WalChunk::NewGrainInWal { id, .. } => { 53 | written_grains.push((id, position)) 54 | } 55 | WalChunk::FinishTransaction { commit_log_entry } => { 56 | index_metadata.commit_log_head = Some(commit_log_entry); 57 | } 58 | WalChunk::UpdatedEmbeddedHeader(header) => { 59 | index_metadata.embedded_header_data = header; 60 | } 61 | WalChunk::CheckpointTo(tx_id) => { 62 | index_metadata.checkpoint_target = tx_id; 63 | } 64 | WalChunk::CheckpointedTo(tx_id) => { 65 | index_metadata.checkpointed_to = tx_id; 66 | } 67 | WalChunk::FreeGrain(id) => { 68 | freed_grains.push(id); 69 | } 70 | // Archiving a grain doesn't have any effect on the in-memory state 71 | WalChunk::ArchiveGrain(_) => {} 72 | } 73 | } 74 | ReadChunkResult::EndOfEntry => break, 75 | ReadChunkResult::AbortedEntry => return Ok(()), 76 | } 77 | } 78 | 79 | freed_grains.sort_unstable(); 80 | 81 | database.atlas.note_transaction_committed( 82 | index_metadata, 83 | written_grains, 84 | &freed_grains, 85 | true, 86 | )?; 87 | } 88 | 89 | Ok(()) 90 | } 91 | 92 | fn checkpoint_to( 93 | &mut self, 94 | last_checkpointed_id: okaywal::EntryId, 95 | checkpointed_entries: &mut okaywal::SegmentReader, 96 | wal: &WriteAheadLog, 97 | ) -> std::io::Result<()> { 98 | if let Some(database) = self.db.upgrade() { 99 | let database = Database { 100 | data: database, 101 | wal: wal.clone(), 102 | }; 103 | let fsyncs = database.data.store.file_manager.new_fsync_batch()?; 104 | let latest_tx_id = TransactionId::from(last_checkpointed_id); 105 | let mut store = database.data.store.lock()?; 106 | let mut needs_directory_sync = store.needs_directory_sync; 107 | let mut all_changed_grains = Vec::new(); 108 | let mut latest_commit_log_entry = store.index.active.commit_log_head; 109 | let mut latest_embedded_header_data = store.index.active.embedded_header_data; 110 | let mut latest_checkpoint_target = store.index.active.checkpoint_target; 111 | let mut latest_checkpointed_to = store.index.active.checkpointed_to; 112 | // We allocate the transaction grains vec once and reuse the vec to 113 | // avoid reallocating. 114 | let mut transaction_grains = Vec::new(); 115 | 'entry_loop: while let Some(mut entry) = checkpointed_entries.read_entry()? { 116 | let checkpointed_tx = TransactionId::from(entry.id()); 117 | // Because an entry could be aborted, we need to make sure we don't 118 | // modify our DiskState until after we've read every chunk. We will 119 | // write new grain data directly to the segments, but the headers 120 | // won't be updated until after the loop as well. 121 | transaction_grains.clear(); 122 | while let Some(mut chunk) = match entry.read_chunk()? { 123 | ReadChunkResult::Chunk(chunk) => Some(chunk), 124 | ReadChunkResult::EndOfEntry => None, 125 | ReadChunkResult::AbortedEntry => continue 'entry_loop, 126 | } { 127 | self.scratch.clear(); 128 | chunk.read_to_end(&mut self.scratch)?; 129 | if !chunk.check_crc()? { 130 | return Err(Error::ChecksumFailed.into()); 131 | } 132 | 133 | match WalChunk::read(&self.scratch)? 
{ 134 | WalChunk::NewGrainInWal { id, data } => { 135 | let basin = store.basins.get_or_insert_with(id.basin_id(), || { 136 | BasinState::default_for(id.basin_id()) 137 | }); 138 | let stratum = basin.get_or_allocate_stratum( 139 | id.stratum_id(), 140 | &database.data.store.directory, 141 | ); 142 | let mut file = BufWriter::new(stratum.get_or_open_file( 143 | &database.data.store.file_manager, 144 | &mut needs_directory_sync, 145 | )?); 146 | 147 | // Write the grain data to disk. 148 | let file_position = id.file_position(); 149 | file.seek(SeekFrom::Start(file_position))?; 150 | file.write_all(&checkpointed_tx.to_be_bytes())?; 151 | file.write_all(&usize_to_u32(data.len())?.to_be_bytes())?; 152 | file.write_all(data)?; 153 | let crc32 = crc32c::crc32c(data); 154 | file.write_all(&crc32.to_be_bytes())?; 155 | file.flush()?; 156 | 157 | transaction_grains.push((id, GrainAllocationStatus::Allocated)); 158 | } 159 | WalChunk::ArchiveGrain(id) => { 160 | transaction_grains.push((id, GrainAllocationStatus::Archived)); 161 | } 162 | WalChunk::FreeGrain(id) => { 163 | transaction_grains.push((id, GrainAllocationStatus::Free)); 164 | } 165 | WalChunk::FinishTransaction { commit_log_entry } => { 166 | latest_commit_log_entry = Some(commit_log_entry); 167 | } 168 | WalChunk::UpdatedEmbeddedHeader(header) => { 169 | latest_embedded_header_data = header; 170 | } 171 | WalChunk::CheckpointTo(tx_id) => { 172 | latest_checkpoint_target = tx_id; 173 | } 174 | WalChunk::CheckpointedTo(tx_id) => { 175 | latest_checkpointed_to = tx_id; 176 | } 177 | } 178 | } 179 | 180 | all_changed_grains.append(&mut transaction_grains); 181 | } 182 | 183 | all_changed_grains.sort_unstable(); 184 | 185 | let mut index = 0; 186 | while let Some((first_id, _)) = all_changed_grains.get(index).cloned() { 187 | let basin = store.basins.get_or_insert_with(first_id.basin_id(), || { 188 | BasinState::default_for(first_id.basin_id()) 189 | }); 190 | let stratum = basin 191 | .get_or_allocate_stratum(first_id.stratum_id(), &database.data.store.directory); 192 | 193 | // Update the stratum header for the disk state. 194 | loop { 195 | // This is a messy match statement, but the goal is to only 196 | // re-lookup basin and stratum when we jump to a new 197 | // stratum. 198 | match all_changed_grains.get(index).copied() { 199 | Some((id, status)) 200 | if id.basin_id() == first_id.basin_id() 201 | && id.stratum_id() == first_id.stratum_id() => 202 | { 203 | let local_index = usize::from(id.local_grain_index().as_u16()); 204 | if status == GrainAllocationStatus::Free { 205 | // Free grains are just 0s. 
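// (A zeroed GrainAllocationInfo byte decodes as
// GrainAllocationStatus::Free, which is why filling the range with
// zeroes is sufficient to free it.)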
206 | stratum.header.active.grains 207 | [local_index..local_index + usize::from(id.grain_count())] 208 | .fill(0); 209 | } else { 210 | for index in 0..id.grain_count() { 211 | let status = if status == GrainAllocationStatus::Allocated { 212 | GrainAllocationInfo::allocated(id.grain_count() - index) 213 | } else { 214 | GrainAllocationInfo::archived(id.grain_count() - index) 215 | }; 216 | stratum.header.active.grains 217 | [local_index + usize::from(index)] = status.0; 218 | } 219 | } 220 | } 221 | _ => break, 222 | } 223 | index += 1; 224 | } 225 | stratum.write_header(latest_tx_id, &fsyncs)?; 226 | } 227 | 228 | store.index.active.commit_log_head = latest_commit_log_entry; 229 | store.index.active.embedded_header_data = latest_embedded_header_data; 230 | store.index.active.checkpoint_target = latest_checkpoint_target; 231 | store.index.active.checkpointed_to = latest_checkpointed_to; 232 | 233 | store.write_header(latest_tx_id, &fsyncs)?; 234 | 235 | if needs_directory_sync { 236 | store.needs_directory_sync = false; 237 | fsyncs.queue_fsync_all(store.directory.try_clone()?)?; 238 | } 239 | 240 | fsyncs.wait_all()?; 241 | 242 | database 243 | .data 244 | .atlas 245 | .note_grains_checkpointed(&all_changed_grains)?; 246 | 247 | database 248 | .data 249 | .checkpointer 250 | .checkpoint_to(latest_checkpoint_target); 251 | 252 | Ok(()) 253 | } else { 254 | // TODO OkayWAL should have a way to be told "shut down" from this 255 | // callback. 256 | Err(io::Error::from(Error::Shutdown)) 257 | } 258 | } 259 | } 260 | 261 | #[derive(Debug)] 262 | pub enum WalChunk<'a> { 263 | NewGrainInWal { id: GrainId, data: &'a [u8] }, 264 | ArchiveGrain(GrainId), 265 | FreeGrain(GrainId), 266 | UpdatedEmbeddedHeader(Option), 267 | CheckpointTo(TransactionId), 268 | CheckpointedTo(TransactionId), 269 | FinishTransaction { commit_log_entry: GrainId }, 270 | } 271 | 272 | impl<'a> WalChunk<'a> { 273 | pub const COMMAND_LENGTH: u32 = 9; 274 | pub const COMMAND_LENGTH_USIZE: usize = Self::COMMAND_LENGTH as usize; 275 | 276 | pub fn read(buffer: &'a [u8]) -> Result { 277 | if buffer.len() < Self::COMMAND_LENGTH_USIZE { 278 | return Err(Error::ValueOutOfBounds); 279 | } 280 | let kind = buffer[0]; 281 | match kind { 282 | 0 => Ok(Self::NewGrainInWal { 283 | id: GrainId::from_bytes(&buffer[1..9]).ok_or(Error::InvalidGrainId)?, 284 | data: &buffer[9..], 285 | }), 286 | 1 => Ok(Self::ArchiveGrain( 287 | GrainId::from_bytes(&buffer[1..9]).ok_or(Error::InvalidGrainId)?, 288 | )), 289 | 2 => Ok(Self::FreeGrain( 290 | GrainId::from_bytes(&buffer[1..9]).ok_or(Error::InvalidGrainId)?, 291 | )), 292 | 3 => Ok(Self::UpdatedEmbeddedHeader(GrainId::from_bytes( 293 | &buffer[1..9], 294 | ))), 295 | 4 => Ok(Self::CheckpointTo(TransactionId::from(EntryId( 296 | u64::from_be_bytes(buffer[1..9].try_into().expect("u64 is 8 bytes")), 297 | )))), 298 | 5 => Ok(Self::CheckpointedTo(TransactionId::from(EntryId( 299 | u64::from_be_bytes(buffer[1..9].try_into().expect("u64 is 8 bytes")), 300 | )))), 301 | 255 => Ok(Self::FinishTransaction { 302 | commit_log_entry: GrainId::from_bytes(&buffer[1..9]) 303 | .ok_or(Error::InvalidGrainId)?, 304 | }), 305 | _ => Err(Error::verification_failed("invalid wal chunk")), 306 | } 307 | } 308 | 309 | pub fn write_new_grain(grain_id: GrainId, data: &[u8], writer: &mut W) -> Result<()> { 310 | writer.write_all(&[0])?; 311 | writer.write_all(&grain_id.to_be_bytes())?; 312 | writer.write_all(data)?; 313 | Ok(()) 314 | } 315 | 316 | pub fn write_archive_grain(grain_id: GrainId, writer: &mut W) -> 
Result<()> { 317 | writer.write_all(&[1])?; 318 | writer.write_all(&grain_id.to_be_bytes())?; 319 | Ok(()) 320 | } 321 | 322 | pub fn write_free_grain(grain_id: GrainId, writer: &mut W) -> Result<()> { 323 | writer.write_all(&[2])?; 324 | writer.write_all(&grain_id.to_be_bytes())?; 325 | Ok(()) 326 | } 327 | 328 | pub fn write_embedded_header_update( 329 | new_embedded_header: Option, 330 | writer: &mut W, 331 | ) -> Result<()> { 332 | writer.write_all(&[3])?; 333 | writer.write_all(&new_embedded_header.unwrap_or(GrainId::NONE).to_be_bytes())?; 334 | Ok(()) 335 | } 336 | 337 | pub fn write_checkpoint_to( 338 | checkpoint_to: TransactionId, 339 | writer: &mut W, 340 | ) -> Result<()> { 341 | writer.write_all(&[4])?; 342 | writer.write_all(&checkpoint_to.to_be_bytes())?; 343 | Ok(()) 344 | } 345 | 346 | pub fn write_checkpointed_to( 347 | checkpointed_to: TransactionId, 348 | writer: &mut W, 349 | ) -> Result<()> { 350 | writer.write_all(&[5])?; 351 | writer.write_all(&checkpointed_to.to_be_bytes())?; 352 | Ok(()) 353 | } 354 | 355 | pub const fn new_grain_length(data_length: u32) -> u32 { 356 | data_length + 9 357 | } 358 | 359 | pub fn write_transaction_tail( 360 | commit_log_entry_id: GrainId, 361 | writer: &mut W, 362 | ) -> Result<()> { 363 | writer.write_all(&[255])?; 364 | writer.write_all(&commit_log_entry_id.to_be_bytes())?; 365 | Ok(()) 366 | } 367 | } 368 | 369 | #[test] 370 | fn wal_chunk_error_tests() { 371 | // The valid WalChunks are all tested by virtue of testing sediment. The 372 | // errors, however, are nearly impossible to simulate due to the wal being 373 | // completely abstracted away. 374 | let Error::VerificationFailed(_) = WalChunk::read(&[254,0,0,0,0,0,0,0,0]).unwrap_err() else { unreachable!() }; 375 | let Error::ValueOutOfBounds = WalChunk::read(&[254]).unwrap_err() else { unreachable!() }; 376 | } 377 | -------------------------------------------------------------------------------- /src/store.rs: -------------------------------------------------------------------------------- 1 | use std::collections::BTreeMap; 2 | use std::ffi::OsStr; 3 | use std::io::{self, BufReader, Read, Seek}; 4 | use std::path::{Path, PathBuf}; 5 | use std::str::FromStr; 6 | use std::sync::{Arc, Mutex, MutexGuard}; 7 | 8 | use crc32c::crc32c; 9 | use okaywal::file_manager::{self, FSyncBatch, FileManager, OpenOptions, PathId}; 10 | 11 | use crate::basinmap::BasinMap; 12 | use crate::commit_log::CommitLogEntry; 13 | use crate::format::{ 14 | BasinAndStratum, BasinId, Duplicable, FileHeader, GrainId, IndexHeader, StratumHeader, 15 | StratumId, TransactionId, 16 | }; 17 | use crate::util::u32_to_usize; 18 | use crate::{Error, Result}; 19 | 20 | #[derive(Debug)] 21 | pub struct Store 22 | where 23 | FileManager: file_manager::FileManager, 24 | { 25 | pub directory: Arc, 26 | disk_state: Mutex>, 27 | pub file_manager: FileManager, 28 | } 29 | 30 | impl Store 31 | where 32 | FileManager: file_manager::FileManager, 33 | { 34 | pub fn recover(path: &Path, file_manager: FileManager) -> Result { 35 | let disk_state = DiskState::recover(path, &file_manager)?; 36 | Ok(Self { 37 | directory: Arc::new(path.to_path_buf()), 38 | disk_state: Mutex::new(disk_state), 39 | file_manager, 40 | }) 41 | } 42 | 43 | pub fn lock(&self) -> Result>> { 44 | Ok(self.disk_state.lock()?) 
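// A poisoned mutex surfaces as a crate `Error` through the `?` above
// rather than panicking the caller.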
45 | } 46 | } 47 | 48 | #[derive(Debug)] 49 | pub struct DiskState 50 | where 51 | File: file_manager::File, 52 | { 53 | pub needs_directory_sync: bool, 54 | pub directory: File, 55 | pub index: Duplicated, 56 | pub index_writer: File, 57 | pub basins: BasinMap>, 58 | } 59 | 60 | impl DiskState 61 | where 62 | File: file_manager::File, 63 | { 64 | pub fn recover(path: &Path, manager: &File::Manager) -> Result { 65 | let path = PathId::from(path); 66 | if !manager.exists(&path) { 67 | manager.create_dir_all(&path)?; 68 | } 69 | 70 | let directory = manager.open(&path, OpenOptions::new().read(true))?; 71 | 72 | let index_path = PathId::from(path.join("index")); 73 | 74 | let mut scratch = Vec::new(); 75 | let mut discovered_strata = discover_strata(&path, manager, &mut scratch)?; 76 | 77 | if index_path.exists() { 78 | let mut index_writer = 79 | manager.open(&index_path, OpenOptions::new().read(true).write(true))?; 80 | 81 | let file_header = 82 | FileHeader::::read_from(&mut index_writer, &mut scratch)?; 83 | 84 | let (mut first_is_active, mut active, older) = match file_header { 85 | FileHeader::Both(first, second) => { 86 | if first.transaction_id > second.transaction_id { 87 | (true, first, Some(second)) 88 | } else { 89 | (false, second, Some(first)) 90 | } 91 | } 92 | FileHeader::First(first) => (true, first, None), 93 | FileHeader::Second(second) => (false, second, None), 94 | }; 95 | 96 | let mut strata_to_clean = None; 97 | let commit_log = match (BasinMap::verify(&active, &mut discovered_strata), older) { 98 | (Ok(commit_log), _) => commit_log, 99 | (Err(_), Some(older)) => { 100 | let commit_log = BasinMap::verify(&older, &mut discovered_strata)?; 101 | 102 | active = older; 103 | first_is_active = !first_is_active; 104 | 105 | let mut invalid_strata = Vec::new(); 106 | for (id, stratum) in &discovered_strata { 107 | if stratum.should_exist(&active) 108 | && stratum.needs_cleanup(commit_log.as_ref()) 109 | { 110 | invalid_strata.push(*id); 111 | } 112 | } 113 | strata_to_clean = Some(invalid_strata); 114 | commit_log 115 | } 116 | (Err(err), None) => return Err(err), 117 | }; 118 | 119 | let mut basins = 120 | BasinMap::load_from(&active, commit_log.as_ref(), discovered_strata, &path)?; 121 | 122 | let mut index = Duplicated { 123 | active, 124 | first_is_active, 125 | }; 126 | 127 | if let Some(strata_to_clean) = strata_to_clean { 128 | for id in strata_to_clean { 129 | let basin = basins[id.basin()].as_mut().expect("just loaded"); 130 | let stratum = &mut basin.stratum[id.stratum().as_usize()]; 131 | stratum 132 | .header 133 | .write_to(stratum.file.as_mut().expect("just loaded"))?; 134 | } 135 | 136 | index.write_to(&mut index_writer)?; 137 | index_writer.sync_data()?; 138 | } 139 | 140 | Ok(Self { 141 | needs_directory_sync: false, 142 | directory, 143 | index, 144 | index_writer, 145 | basins, 146 | }) 147 | } else { 148 | let mut index_writer = manager.open( 149 | &index_path, 150 | OpenOptions::new().read(true).write(true).create(true), 151 | )?; 152 | 153 | let mut empty_header = IndexHeader::default(); 154 | empty_header.write_to(&mut index_writer)?; 155 | empty_header.write_to(&mut index_writer)?; 156 | 157 | // Ensure the file is fully persisted to disk. 
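// Both the file's contents (sync_all) and its new directory entry
// (directory.sync_all) must be durable before this empty index can be
// trusted by a future recovery.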
158 | index_writer.sync_all()?; 159 | directory.sync_all()?; 160 | 161 | if discovered_strata.is_empty() { 162 | Ok(Self { 163 | needs_directory_sync: false, 164 | directory, 165 | index: Duplicated { 166 | active: empty_header, 167 | first_is_active: false, 168 | }, 169 | index_writer, 170 | basins: BasinMap::new(), 171 | }) 172 | } else { 173 | Err(Error::verification_failed("existing strata found without an index file. If this is intentional, clean the directory being used for the database.")) 174 | } 175 | } 176 | } 177 | 178 | pub fn write_header( 179 | &mut self, 180 | transaction_id: TransactionId, 181 | sync: &FSyncBatch, 182 | ) -> Result<()> { 183 | self.index.active.transaction_id = transaction_id; 184 | for (basin, count) in (&self.basins) 185 | .into_iter() 186 | .zip(&mut self.index.active.basin_strata_count) 187 | { 188 | *count = basin.1.stratum.len() as u64; 189 | } 190 | self.index.write_to(&mut self.index_writer)?; 191 | 192 | sync.queue_fsync_data(self.index_writer.try_clone()?)?; 193 | 194 | Ok(()) 195 | } 196 | } 197 | 198 | #[derive(Debug, Default)] 199 | pub struct Duplicated { 200 | pub active: T, 201 | pub first_is_active: bool, 202 | } 203 | 204 | impl Duplicated 205 | where 206 | T: Duplicable, 207 | { 208 | pub fn write_to(&mut self, file: &mut File) -> Result<()> 209 | where 210 | File: file_manager::File, 211 | { 212 | let offset = if self.first_is_active { T::BYTES } else { 0 }; 213 | 214 | file.seek(io::SeekFrom::Start(offset))?; 215 | self.active.write_to(file)?; 216 | self.first_is_active = !self.first_is_active; 217 | 218 | Ok(()) 219 | } 220 | } 221 | 222 | #[derive(Debug)] 223 | pub struct BasinState 224 | where 225 | File: file_manager::File, 226 | { 227 | pub id: BasinId, 228 | pub stratum: Vec>, 229 | } 230 | 231 | impl BasinState 232 | where 233 | File: file_manager::File, 234 | { 235 | pub fn default_for(id: BasinId) -> Self { 236 | Self { 237 | id, 238 | stratum: Vec::new(), 239 | } 240 | } 241 | 242 | pub fn get_or_allocate_stratum( 243 | &mut self, 244 | id: StratumId, 245 | directory: &Path, 246 | ) -> &mut StratumState { 247 | while id.as_usize() >= self.stratum.len() { 248 | let new_id = 249 | StratumId::new(u64::try_from(self.stratum.len()).expect("too large of a database")) 250 | .expect("invalid id"); 251 | self.stratum.push(StratumState::default_for(PathId::from( 252 | directory.join(format!("{}{}", self.id, new_id)), 253 | ))) 254 | } 255 | 256 | &mut self.stratum[id.as_usize()] 257 | } 258 | } 259 | 260 | fn discover_strata( 261 | path: &PathId, 262 | manager: &File::Manager, 263 | scratch: &mut Vec, 264 | ) -> Result>> 265 | where 266 | File: file_manager::File, 267 | { 268 | let mut discovered = BTreeMap::new(); 269 | 270 | for entry in manager.list(path)? 
{ 271 | if let Some(name) = entry.file_name().and_then(OsStr::to_str) { 272 | if let Ok(basin_and_stratum) = BasinAndStratum::from_str(name) { 273 | discovered.insert( 274 | basin_and_stratum, 275 | UnverifiedStratum::read_from(entry, manager, basin_and_stratum, scratch)?, 276 | ); 277 | } 278 | } 279 | } 280 | 281 | Ok(discovered) 282 | } 283 | 284 | #[derive(Debug)] 285 | pub struct StratumState 286 | where 287 | File: file_manager::File, 288 | { 289 | pub path: PathId, 290 | pub header: Duplicated, 291 | pub file: Option, 292 | } 293 | 294 | impl StratumState 295 | where 296 | File: file_manager::File, 297 | { 298 | fn default_for(path: PathId) -> Self { 299 | Self { 300 | path, 301 | header: Duplicated::default(), 302 | file: None, 303 | } 304 | } 305 | 306 | pub fn get_or_open_file( 307 | &mut self, 308 | manager: &File::Manager, 309 | needs_directory_sync: &mut bool, 310 | ) -> Result<&mut File> { 311 | if self.file.is_none() { 312 | // If this file doesn't exist, we need to do a directory sync to 313 | // ensure the file is persisted. 314 | *needs_directory_sync |= !self.path.exists(); 315 | 316 | let file = manager.open( 317 | &self.path, 318 | OpenOptions::new().read(true).write(true).create(true), 319 | )?; 320 | 321 | self.file = Some(file); 322 | } 323 | 324 | Ok(self.file.as_mut().expect("file always allocated above")) 325 | } 326 | 327 | pub fn write_header( 328 | &mut self, 329 | new_transaction_id: TransactionId, 330 | sync_batch: &FSyncBatch, 331 | ) -> io::Result<()> { 332 | let file = self 333 | .file 334 | .as_mut() 335 | .expect("shouldn't ever write a file header if no data was written"); 336 | 337 | self.header.active.transaction_id = new_transaction_id; 338 | self.header.write_to(file)?; 339 | 340 | let file_to_sync = file.try_clone()?; 341 | sync_batch.queue_fsync_data(file_to_sync)?; 342 | 343 | Ok(()) 344 | } 345 | } 346 | 347 | pub struct UnverifiedStratum { 348 | pub path: PathId, 349 | pub id: BasinAndStratum, 350 | pub header: FileHeader, 351 | pub file: File, 352 | } 353 | 354 | impl UnverifiedStratum 355 | where 356 | File: file_manager::File, 357 | { 358 | pub fn read_from( 359 | path: PathId, 360 | manager: &File::Manager, 361 | id: BasinAndStratum, 362 | scratch: &mut Vec, 363 | ) -> Result { 364 | let mut file = manager.open(&path, OpenOptions::new().read(true).write(true))?; 365 | let header = FileHeader::read_from(&mut file, scratch)?; 366 | Ok(Self { 367 | path, 368 | id, 369 | header, 370 | file, 371 | }) 372 | } 373 | 374 | pub fn validate(&self, commit_log: &CommitLogEntry) -> Result<()> { 375 | let (first, second) = self.validate_headers(Some(commit_log)); 376 | 377 | if first.is_some() || second.is_some() { 378 | Ok(()) 379 | } else { 380 | Err(Error::verification_failed("neither header is valid")) 381 | } 382 | } 383 | 384 | pub fn should_exist(&self, index: &IndexHeader) -> bool { 385 | self.id.stratum().as_u64() < index.basin_strata_count[usize::from(self.id.basin().index())] 386 | } 387 | 388 | pub fn needs_cleanup(&self, commit_log: Option<&CommitLogEntry>) -> bool { 389 | !matches!(self.validate_headers(commit_log), (Some(_), Some(_))) 390 | } 391 | 392 | fn validate_headers( 393 | &self, 394 | commit_log: Option<&CommitLogEntry>, 395 | ) -> (Option<&StratumHeader>, Option<&StratumHeader>) { 396 | fn is_valid( 397 | header: &StratumHeader, 398 | commit_transaction: TransactionId, 399 | commit_log: Option<&CommitLogEntry>, 400 | ) -> bool { 401 | header.transaction_id < commit_transaction 402 | || (header.transaction_id == 
commit_transaction 403 | && commit_log 404 | .map_or(true, |commit_log| header.reflects_changes_from(commit_log))) 405 | } 406 | let commit_transaction = commit_log.map_or_else(TransactionId::default, |commit_log| { 407 | commit_log.transaction_id 408 | }); 409 | let (first, second) = self.header.as_options(); 410 | let first = first 411 | .and_then(|first| is_valid(first, commit_transaction, commit_log).then_some(first)); 412 | let second = second 413 | .and_then(|second| is_valid(second, commit_transaction, commit_log).then_some(second)); 414 | 415 | (first, second) 416 | } 417 | } 418 | 419 | impl BasinMap> 420 | where 421 | File: file_manager::File, 422 | { 423 | pub fn verify( 424 | index: &IndexHeader, 425 | discovered_strata: &mut BTreeMap>, 426 | ) -> Result> { 427 | let mut scratch = Vec::new(); 428 | if let Some(commit_log_head) = index.commit_log_head { 429 | if let Some(stratum) = discovered_strata.get_mut(&commit_log_head.basin_and_stratum()) { 430 | let mut reader = BufReader::new(&mut stratum.file); 431 | verify_read_grain( 432 | commit_log_head, 433 | &mut reader, 434 | index.transaction_id, 435 | None, 436 | &mut scratch, 437 | )?; 438 | let commit_log_entry = CommitLogEntry::read_from(&scratch[..])?; 439 | for new_grain in &commit_log_entry.new_grains { 440 | verify_read_grain( 441 | new_grain.id, 442 | &mut reader, 443 | index.transaction_id, 444 | Some(new_grain.crc32), 445 | &mut scratch, 446 | )?; 447 | } 448 | 449 | for stratum in discovered_strata.values() { 450 | if stratum.should_exist(index) { 451 | stratum.validate(&commit_log_entry)?; 452 | } 453 | } 454 | return Ok(Some(commit_log_entry)); 455 | } else { 456 | return Err(Error::verification_failed("commit log stratum not found")); 457 | } 458 | } 459 | 460 | Ok(None) 461 | } 462 | 463 | pub fn load_from( 464 | index: &IndexHeader, 465 | commit_log: Option<&CommitLogEntry>, 466 | discovered_strata: BTreeMap>, 467 | directory: &PathId, 468 | ) -> Result { 469 | let mut basins = Self::new(); 470 | for stratum in discovered_strata.into_values() { 471 | if !stratum.should_exist(index) { 472 | std::fs::remove_file(directory.join(stratum.id.to_string()))?; 473 | continue; 474 | } 475 | 476 | let header = match stratum.validate_headers(commit_log) { 477 | (Some(first), Some(second)) => { 478 | if first.transaction_id >= second.transaction_id { 479 | Duplicated { 480 | active: stratum.header.into_first(), 481 | first_is_active: true, 482 | } 483 | } else { 484 | Duplicated { 485 | active: stratum.header.into_second(), 486 | first_is_active: false, 487 | } 488 | } 489 | } 490 | (Some(_), _) => Duplicated { 491 | active: stratum.header.into_first(), 492 | first_is_active: true, 493 | }, 494 | (_, Some(_)) => Duplicated { 495 | active: stratum.header.into_second(), 496 | first_is_active: false, 497 | }, 498 | (None, None) => { 499 | unreachable!("error is handled in validation phase") 500 | } 501 | }; 502 | 503 | let basin = basins.get_or_insert_with(stratum.id.basin(), || { 504 | BasinState::default_for(stratum.id.basin()) 505 | }); 506 | if stratum.id.stratum().as_usize() != basin.stratum.len() { 507 | return Err(Error::verification_failed("strata are non-sequential")); 508 | } 509 | 510 | basin.stratum.push(StratumState { 511 | path: stratum.path, 512 | header, 513 | file: Some(stratum.file), 514 | }); 515 | } 516 | Ok(basins) 517 | } 518 | } 519 | 520 | fn verify_read_grain( 521 | grain: GrainId, 522 | file: &mut BufReader<&mut File>, 523 | transaction_id: TransactionId, 524 | expected_crc: Option, 525 | buffer: &mut 
Vec, 526 | ) -> Result<()> 527 | where 528 | File: file_manager::File, 529 | { 530 | file.seek(io::SeekFrom::Start(grain.file_position()))?; 531 | let mut eight_bytes = [0; 8]; 532 | file.read_exact(&mut eight_bytes)?; 533 | let grain_transaction_id = TransactionId::from_be_bytes(eight_bytes); 534 | if grain_transaction_id != transaction_id { 535 | return Err(Error::verification_failed( 536 | "new grain was written in a different transaction", 537 | )); 538 | } 539 | 540 | let mut four_bytes = [0; 4]; 541 | file.read_exact(&mut four_bytes)?; 542 | let length = u32::from_be_bytes(four_bytes); 543 | 544 | let length = u32_to_usize(length)?; 545 | buffer.resize(length, 0); 546 | file.read_exact(buffer)?; 547 | 548 | let computed_crc = crc32c(buffer); 549 | 550 | file.read_exact(&mut four_bytes)?; 551 | let stored_crc = u32::from_be_bytes(four_bytes); 552 | 553 | if computed_crc == stored_crc && expected_crc.map_or(true, |expected| expected == stored_crc) { 554 | Ok(()) 555 | } else { 556 | Err(Error::ChecksumFailed) 557 | } 558 | } 559 | -------------------------------------------------------------------------------- /src/atlas.rs: -------------------------------------------------------------------------------- 1 | use std::collections::{HashMap, HashSet, VecDeque}; 2 | use std::io::{BufReader, Read, Seek}; 3 | use std::path::PathBuf; 4 | use std::sync::{Arc, Mutex}; 5 | 6 | use okaywal::file_manager::{OpenOptions, PathId}; 7 | use okaywal::{file_manager, ChunkReader, LogPosition, WriteAheadLog}; 8 | use tinyvec::ArrayVec; 9 | 10 | use crate::allocations::FreeLocations; 11 | use crate::basinmap::BasinMap; 12 | use crate::format::{ 13 | BasinAndStratum, BasinId, GrainAllocationStatus, GrainId, GrainIndex, StratumHeader, StratumId, 14 | TransactionId, 15 | }; 16 | use crate::store::{BasinState, Store}; 17 | use crate::util::{u32_to_usize, usize_to_u32}; 18 | use crate::{Error, Result}; 19 | 20 | #[derive(Debug)] 21 | pub struct Atlas { 22 | data: Mutex>, 23 | } 24 | 25 | impl Atlas 26 | where 27 | FileManager: file_manager::FileManager, 28 | { 29 | pub fn new(store: &Store) -> Self { 30 | let disk_state = store.lock().expect("unable to lock store"); 31 | 32 | let mut basins = BasinMap::new(); 33 | 34 | for (basin_id, basin) in &disk_state.basins { 35 | basins[basin_id] = Some(Basin::from(basin)); 36 | } 37 | 38 | Self { 39 | data: Mutex::new(Data { 40 | directory: store.directory.clone(), 41 | index: IndexMetadata { 42 | commit_log_head: disk_state.index.active.commit_log_head, 43 | embedded_header_data: disk_state.index.active.embedded_header_data, 44 | checkpoint_target: disk_state.index.active.checkpoint_target, 45 | checkpointed_to: disk_state.index.active.checkpointed_to, 46 | }, 47 | basins, 48 | uncheckpointed_grains: HashMap::new(), 49 | file_manager: store.file_manager.clone(), 50 | }), 51 | } 52 | } 53 | 54 | pub fn current_index_metadata(&self) -> Result { 55 | let data = self.data.lock()?; 56 | Ok(data.index) 57 | } 58 | 59 | pub fn find<'wal>( 60 | &self, 61 | grain: GrainId, 62 | wal: &'wal WriteAheadLog, 63 | ) -> Result>> { 64 | let data = self.data.lock()?; 65 | match data.uncheckpointed_grains.get(&grain) { 66 | Some(UncheckpointedGrain::PendingCommit) => Ok(None), 67 | Some(UncheckpointedGrain::InWal(location)) => { 68 | let location = *location; 69 | let mut chunk_reader = wal.read_at(location)?; 70 | // We hold onto the data lock until after we read from the wal 71 | // to ensure a checkpoint doesn't happen before we start the 72 | // read operation. 
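// read_at has already located the chunk within the WAL, so the lock
// can be released before the caller consumes the bytes.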
73 | drop(data); 74 | 75 | // Skip over the WalChunk info. 76 | chunk_reader.read_exact(&mut [0; 9])?; 77 | 78 | Ok(Some(GrainReader::InWal(chunk_reader))) 79 | } 80 | None => { 81 | if data.check_grain_validity(grain).is_err() { 82 | return Ok(None); 83 | } 84 | 85 | let file_path = data.basins[grain.basin_id()] 86 | .as_ref() 87 | .expect("grain validated") 88 | .strata 89 | .get(grain.stratum_id().as_usize()) 90 | .expect("grain validated") 91 | .path 92 | .clone(); 93 | 94 | // Remove the lock before we do any file operations. 95 | let file_manager = data.file_manager.clone(); 96 | drop(data); 97 | 98 | let mut file = file_manager.open(&file_path, OpenOptions::new().read(true))?; 99 | // The grain data starts with the transaction id, followed 100 | // by the byte length. 101 | file.seek(std::io::SeekFrom::Start(grain.file_position() + 8))?; 102 | let mut file = BufReader::new(file); 103 | let mut length = [0; 4]; 104 | file.read_exact(&mut length)?; 105 | let length = u32::from_be_bytes(length); 106 | 107 | return Ok(Some(GrainReader::InStratum(StratumGrainReader { 108 | file, 109 | length, 110 | bytes_remaining: length, 111 | }))); 112 | } 113 | } 114 | } 115 | 116 | pub fn reserve(&self, length: u32) -> Result { 117 | // First, determine what basins have been allocated, and within those, 118 | // which ones are the best fit (least amount of wasted space). For 119 | // example, storing a 80 byte value as 2 64 byte grains vs 3 32 byte 120 | // grains would waste 48 bytes in one case and waste 0 bytes in the 121 | // other. 122 | let length_with_grain_info = length.checked_add(16).ok_or(Error::GrainTooLarge)?; 123 | let mut data = self.data.lock()?; 124 | // Accessing fields through MutexGuard's DerefMut causes issues with the 125 | // borrow checker extending the lifetime of the borrow across both 126 | // basins and uncheckpointed_grains. So, we perform the DerefMut to get 127 | // the Data pointer first, allowing the borrow checker to see that the 128 | // mutable accesses are unique. 129 | let data = &mut *data; 130 | let mut eligible_basins = ArrayVec::<[(BasinId, u32, bool, u32); 8]>::new(); 131 | for basin in 0..=7 { 132 | let basin_id = BasinId::new(basin).expect("valid basin id"); 133 | let grain_size = basin_id.grain_stripe_bytes(); 134 | let number_of_grains_needed = 135 | if let Some(padded_length) = length_with_grain_info.checked_add(grain_size - 1) { 136 | padded_length / grain_size 137 | } else { 138 | todo!("handle large grains") 139 | }; 140 | let extra_bytes = number_of_grains_needed * grain_size - length; 141 | 142 | if number_of_grains_needed <= 63 { 143 | eligible_basins.push(( 144 | basin_id, 145 | number_of_grains_needed, 146 | data.basins[basin_id].is_some(), 147 | extra_bytes, 148 | )); 149 | } 150 | } 151 | 152 | eligible_basins.sort_by(|a, b| a.3.cmp(&b.3)); 153 | 154 | // Now we have a list of basins to consider. 
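// eligible_basins is sorted by wasted bytes, so this scan tries the
// tightest-fitting grain sizes first. Only basins that already have
// strata are considered here; a new stratum is created below only if
// none of them can fit the allocation.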
155 |         for (basin_id, number_of_grains_needed, _, _) in eligible_basins
156 |             .iter()
157 |             .filter(|(_, _, is_allocated, _)| *is_allocated)
158 |         {
159 |             let basin = data.basins[*basin_id]
160 |                 .as_mut()
161 |                 .expect("filter should prevent none");
162 | 
163 |             let mut free_strata = basin.free_strata.iter_mut();
164 |             while let Some(stratum_id) = free_strata.next() {
165 |                 let stratum = basin
166 |                     .strata
167 |                     .get_mut(stratum_id.as_usize())
168 |                     .expect("strata should be allocated");
169 |                 if let Ok(grain_id) = allocate_grain_within_stratum(
170 |                     stratum,
171 |                     &mut data.uncheckpointed_grains,
172 |                     *basin_id,
173 |                     stratum_id,
174 |                     *number_of_grains_needed as u8,
175 |                 ) {
176 |                     return Ok(grain_id);
177 |                 } else if stratum.allocations.is_full() {
178 |                     free_strata.remove_current();
179 |                 }
180 |             }
181 |         }
182 | 
183 |         // We couldn't find an existing stratum that was able to fit the
184 |         // allocation. Create a new one.
185 |         let (basin_id, number_of_grains_needed, is_allocated, _) = eligible_basins
186 |             .first()
187 |             .expect("at least one basin should fit");
188 |         if !*is_allocated {
189 |             data.basins[*basin_id] = Some(Basin::default());
190 |         }
191 |         let basin = data.basins[*basin_id].as_mut().expect("just allocated");
192 |         let new_id = StratumId::new(basin.strata.len() as u64).expect("valid stratum id");
193 |         basin
194 |             .strata
195 |             .push(Stratum::default_for(PathId::from(data.directory.join(
196 |                 BasinAndStratum::from_parts(*basin_id, new_id).to_string(),
197 |             ))));
198 |         basin.free_strata.push(new_id);
199 |         Ok(allocate_grain_within_stratum(
200 |             basin.strata.last_mut().expect("just pushed"),
201 |             &mut data.uncheckpointed_grains,
202 |             *basin_id,
203 |             new_id,
204 |             *number_of_grains_needed as u8,
205 |         )
206 |         .expect("empty stratum should have room"))
207 |     }
208 | 
209 |     pub fn note_transaction_committed(
210 |         &self,
211 |         new_metadata: IndexMetadata,
212 |         written_grains: impl IntoIterator<Item = (GrainId, LogPosition)>,
213 |         mut freed_grains: &[GrainId],
214 |         is_from_wal: bool,
215 |     ) -> Result<()> {
216 |         let mut data = self.data.lock()?;
217 |         let data = &mut *data; // This local deref helps avoid lifetime issues with borrows
218 |         data.index = new_metadata;
219 |         if is_from_wal {
220 |             for (grain, log_position) in written_grains {
221 |                 data.uncheckpointed_grains
222 |                     .insert(grain, UncheckpointedGrain::InWal(log_position));
223 |                 let basin = data.basins.get_or_default(grain.basin_id());
224 | 
225 |                 // We may be committing a grain to a new stratum.
226 |                 while grain.stratum_id().as_usize() >= basin.strata.len() {
227 |                     let new_id =
228 |                         StratumId::new(basin.strata.len().try_into()?).expect("valid stratum id");
229 |                     basin.strata.push(Stratum::default_for(PathId::from(
230 |                         data.directory.join(
231 |                             BasinAndStratum::from_parts(grain.basin_id(), new_id).to_string(),
232 |                         ),
233 |                     )));
234 |                 }
235 | 
236 |                 let stratum = &mut basin.strata[grain.stratum_id().as_usize()];
237 |                 assert!(stratum.allocations.allocate_grain(grain.local_grain_id()));
238 |                 stratum.known_grains.insert(grain.local_grain_index());
239 |             }
240 |         } else {
241 |             for (grain, log_position) in written_grains {
242 |                 if let Some(uncheckpointed) = data.uncheckpointed_grains.get_mut(&grain) {
243 |                     *uncheckpointed = UncheckpointedGrain::InWal(log_position);
244 |                 }
245 |             }
246 |         }
247 | 
248 |         // We assume that freed_grains is sorted. To avoid continuing to re-look
249 |         // up the basin and stratum for grains that are from the same stratum,
250 |         // we use two loops -- one to get the stratum and one to do the actual
251 |         // free operations. Only the inner loop advances the iterator.
252 |         while let Some(next_grain) = freed_grains.first().copied() {
253 |             let basin = data.basins.get_or_default(next_grain.basin_id());
254 |             let stratum = &mut basin.strata[next_grain.stratum_id().as_usize()];
255 | 
256 |             while let Some(grain) = freed_grains
257 |                 .first()
258 |                 .filter(|g| g.basin_and_stratum() == next_grain.basin_and_stratum())
259 |                 .copied()
260 |             {
261 |                 freed_grains = &freed_grains[1..];
262 | 
263 |                 stratum.allocations.free_grain(grain.local_grain_id());
264 |                 stratum.known_grains.remove(&grain.local_grain_index());
265 |             }
266 |         }
267 | 
268 |         Ok(())
269 |     }
270 | 
271 |     pub fn note_grains_checkpointed<'a>(
272 |         &self,
273 |         checkpointed_grains: impl IntoIterator<Item = (&'a GrainId, &'a GrainAllocationStatus)>,
274 |     ) -> Result<()> {
275 |         let mut data = self.data.lock()?;
276 |         for (grain, status) in checkpointed_grains {
277 |             match status {
278 |                 GrainAllocationStatus::Allocated => {
279 |                     // The grain can now be found in the Stratum, so we can stop
280 |                     // returning readers to the WAL.
281 |                     data.uncheckpointed_grains.remove(grain);
282 |                 }
283 |                 GrainAllocationStatus::Archived => {
284 |                     // Archiving has no effect on the Atlas.
285 |                 }
286 |                 GrainAllocationStatus::Free => {
287 |                     // The grains are already removed during the WAL phase.
288 |                 }
289 |             }
290 |         }
291 |         Ok(())
292 |     }
293 | 
294 |     pub fn rollback_grains(&self, written_grains: impl IntoIterator<Item = GrainId>) -> Result<()> {
295 |         let mut data = self.data.lock()?;
296 |         for grain in written_grains {
297 |             data.uncheckpointed_grains.remove(&grain);
298 |             let basin = data.basins[grain.basin_id()]
299 |                 .as_mut()
300 |                 .expect("basin missing");
301 |             let stratum = basin
302 |                 .strata
303 |                 .get_mut(grain.stratum_id().as_usize())
304 |                 .expect("stratum missing");
305 | 
306 |             stratum.allocations.free_grain(grain.local_grain_id());
307 |             stratum.known_grains.remove(&grain.local_grain_index());
308 |         }
309 |         Ok(())
310 |     }
311 | 
312 |     pub fn check_grain_validity(&self, grain: GrainId) -> Result<()> {
313 |         let data = self.data.lock()?;
314 |         data.check_grain_validity(grain)
315 |     }
316 | }
317 | 
318 | #[derive(Debug)]
319 | struct Data<FileManager> {
320 |     directory: Arc<PathBuf>,
321 |     index: IndexMetadata,
322 |     basins: BasinMap<Basin>,
323 |     uncheckpointed_grains: HashMap<GrainId, UncheckpointedGrain>,
324 |     file_manager: FileManager,
325 | }
326 | 
327 | impl<FileManager> Data<FileManager> {
328 |     pub fn check_grain_validity(&self, grain: GrainId) -> Result<()> {
329 |         let basin = self.basins[grain.basin_id()]
330 |             .as_ref()
331 |             .ok_or(Error::GrainNotAllocated)?;
332 | 
333 |         let stratum = basin
334 |             .strata
335 |             .get(grain.stratum_id().as_usize())
336 |             .ok_or(Error::GrainNotAllocated)?;
337 |         if stratum.known_grains.contains(&grain.local_grain_index()) {
338 |             Ok(())
339 |         } else {
340 |             Err(Error::GrainNotAllocated)
341 |         }
342 |     }
343 | }
344 | 
345 | fn allocate_grain_within_stratum(
346 |     stratum: &mut Stratum,
347 |     uncheckpointed_grains: &mut HashMap<GrainId, UncheckpointedGrain>,
348 | 
349 |     basin_id: BasinId,
350 |     stratum_id: StratumId,
351 |     number_of_grains_needed: u8,
352 | ) -> Result<GrainId, ()> {
353 |     if let Some(index) = stratum.allocations.allocate(number_of_grains_needed) {
354 |         let id = GrainId::new(basin_id, stratum_id, index);
355 |         uncheckpointed_grains.insert(id, UncheckpointedGrain::PendingCommit);
356 |         stratum.known_grains.insert(id.local_grain_index());
357 |         Ok(id)
358 |     } else {
359 |         Err(())
360 |     }
361 | }
362 | 
363 | #[derive(Debug, Default)]
364 | struct Basin {
365 |     strata: Vec<Stratum>,
366 |     free_strata: StratumIdRing,
367 | }
368 | 
369 | impl<'a, File> From<&'a BasinState<File>> for Basin
370 | where
371 |     File: file_manager::File,
372 | {
373 |     fn from(state: &'a BasinState<File>) -> Self {
374 |         let mut strata = Vec::new();
375 |         let mut free_strata = StratumIdRing::default();
376 |         for stratum in &state.stratum {
377 |             let stratum = Stratum::from_stratum(stratum.path.clone(), &stratum.header.active);
378 | 
379 |             if !stratum.allocations.is_full() {
380 |                 free_strata.push(StratumId::new(strata.len() as u64).expect("valid stratum id"));
381 |             }
382 | 
383 |             strata.push(stratum);
384 |         }
385 | 
386 |         Self {
387 |             strata,
388 |             free_strata,
389 |         }
390 |     }
391 | }
392 | 
393 | #[derive(Debug)]
394 | struct Stratum {
395 |     path: PathId,
396 |     allocations: FreeLocations,
397 |     known_grains: HashSet<GrainIndex>,
398 | }
399 | 
400 | impl Stratum {
401 |     fn from_stratum(path: PathId, stratum: &StratumHeader) -> Self {
402 |         let allocations = FreeLocations::from_stratum(stratum);
403 | 
404 |         let mut known_grains = HashSet::new();
405 |         let mut index = 0;
406 |         while index < 16_372 {
407 |             let index_status = stratum.grain_info(index);
408 |             let count = index_status.count();
409 |             let allocated = !matches!(
410 |                 index_status.status().expect("invalid header"),
411 |                 GrainAllocationStatus::Free
412 |             );
413 | 
414 |             if allocated {
415 |                 known_grains.insert(
416 |                     GrainIndex::new(index.try_into().expect("only valid indexes are used"))
417 |                         .expect("only valid grains are used"),
418 |                 );
419 |                 index += usize::from(count);
420 |             } else {
421 |                 index += 1;
422 |             }
423 |         }
424 | 
425 |         Self {
426 |             path,
427 |             allocations,
428 |             known_grains,
429 |         }
430 |     }
431 | 
432 |     fn default_for(path: PathId) -> Self {
433 |         Self {
434 |             path,
435 |             allocations: FreeLocations::default(),
436 |             known_grains: HashSet::default(),
437 |         }
438 |     }
439 | }
440 | 
441 | #[derive(Debug)]
442 | pub enum GrainReader<'a, FileManager>
443 | where
444 |     FileManager: file_manager::FileManager,
445 | {
446 |     InWal(ChunkReader<'a, FileManager>),
447 |     InStratum(StratumGrainReader<FileManager::File>),
448 | }
449 | 
450 | impl<'a, FileManager> GrainReader<'a, FileManager>
451 | where
452 |     FileManager: file_manager::FileManager,
453 | {
454 |     pub const fn bytes_remaining(&self) -> u32 {
455 |         match self {
456 |             GrainReader::InWal(reader) => reader.bytes_remaining(),
457 |             GrainReader::InStratum(reader) => reader.bytes_remaining,
458 |         }
459 |     }
460 | 
461 |     pub const fn length(&self) -> u32 {
462 |         match self {
463 |             GrainReader::InWal(reader) => reader.chunk_length() - 9,
464 |             GrainReader::InStratum(reader) => reader.length,
465 |         }
466 |     }
467 | 
468 |     pub fn read_all_data(mut self) -> Result<Vec<u8>> {
469 |         let mut data = Vec::new();
470 |         self.read_to_end(&mut data)?;
471 | 
472 |         // TODO offer a way to do a crc check?
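        // One possible shape for such a check (a hypothetical sketch, not
        // part of the current API): after draining an `InStratum` reader,
        // the trailing 4-byte CRC could be read and compared against the
        // data just returned, mirroring the grain verification routine
        // elsewhere in this crate:
        //
        //     let computed = crc32c(&data);
        //     let mut stored = [0; 4];
        //     reader.file.read_exact(&mut stored)?;
        //     if computed != u32::from_be_bytes(stored) {
        //         return Err(Error::ChecksumFailed);
        //     }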
473 | 
474 |         Ok(data)
475 |     }
476 | }
477 | 
478 | impl<'a, FileManager> Read for GrainReader<'a, FileManager>
479 | where
480 |     FileManager: file_manager::FileManager,
481 | {
482 |     fn read(&mut self, buf: &mut [u8]) -> std::io::Result<usize> {
483 |         match self {
484 |             GrainReader::InWal(reader) => reader.read(buf),
485 |             GrainReader::InStratum(reader) => {
486 |                 let bytes_remaining = u32_to_usize(reader.bytes_remaining)?;
487 |                 let bytes_to_read = buf.len().min(bytes_remaining);
488 |                 let bytes_read = reader.file.read(&mut buf[..bytes_to_read])?;
489 |                 reader.bytes_remaining -= usize_to_u32(bytes_read)?;
490 |                 Ok(bytes_read)
491 |             }
492 |         }
493 |     }
494 | }
495 | 
496 | #[derive(Debug)]
497 | pub struct StratumGrainReader<File>
498 | where
499 |     File: file_manager::File,
500 | {
501 |     file: BufReader<File>,
502 |     length: u32,
503 |     bytes_remaining: u32,
504 | }
505 | 
506 | #[derive(Debug)]
507 | enum UncheckpointedGrain {
508 |     PendingCommit,
509 |     InWal(LogPosition),
510 | }
511 | 
512 | #[derive(Debug, Default)]
513 | struct StratumIdRing(VecDeque<StratumId>);
514 | 
515 | impl StratumIdRing {
516 |     pub fn push(&mut self, id: StratumId) {
517 |         self.0.push_back(id);
518 |     }
519 | 
520 |     pub fn iter_mut(&mut self) -> StratumIdIter<'_> {
521 |         StratumIdIter {
522 |             ring: self,
523 |             iterated: 0,
524 |         }
525 |     }
526 | }
527 | 
528 | struct StratumIdIter<'a> {
529 |     ring: &'a mut StratumIdRing,
530 |     iterated: usize,
531 | }
532 | 
533 | impl<'a> Iterator for StratumIdIter<'a> {
534 |     type Item = StratumId;
535 | 
536 |     fn next(&mut self) -> Option<Self::Item> {
537 |         if self.iterated == self.ring.0.len() {
538 |             None
539 |         } else {
540 |             // Cycle the ring, moving the front to the end. We keep track of how
541 |             // many times we've iterated to ensure we don't return the same id
542 |             // twice.
543 |             self.iterated += 1;
544 |             self.ring.0.rotate_left(1);
545 |             self.ring.0.front().copied()
546 |         }
547 |     }
548 | }
549 | 
550 | impl<'a> StratumIdIter<'a> {
551 |     /// Removes the current id from the ring.
552 |     ///
553 |     /// # Panics
554 |     ///
555 |     /// Panics if `Iterator::next()` wasn't called at least once before calling
556 |     /// this function.
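    ///
    /// A hypothetical usage sketch (`stratum_is_full` is illustrative, not
    /// part of this module), mirroring how `reserve` drains full strata:
    ///
    /// ```ignore
    /// let mut iter = ring.iter_mut();
    /// while let Some(id) = iter.next() {
    ///     if stratum_is_full(id) {
    ///         iter.remove_current();
    ///     }
    /// }
    /// ```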
557 |     pub fn remove_current(&mut self) {
558 |         assert!(self.iterated > 0);
559 |         self.ring.0.pop_front();
560 |         self.iterated -= 1;
561 |     }
562 | }
563 | 
564 | #[derive(Debug, Clone, Copy, Default)]
565 | pub struct IndexMetadata {
566 |     pub embedded_header_data: Option<GrainId>,
567 |     pub commit_log_head: Option<GrainId>,
568 |     pub checkpoint_target: TransactionId,
569 |     pub checkpointed_to: TransactionId,
570 | }
571 | 
--------------------------------------------------------------------------------
/src/tests.rs:
--------------------------------------------------------------------------------
1 | use std::fs::{self, OpenOptions};
2 | use std::io::{Read, Seek, Write};
3 | use std::path::Path;
4 | 
5 | use okaywal::file_manager::{self};
6 | 
7 | use crate::config::Config;
8 | use crate::format::{
9 |     BasinAndStratum, Duplicable, IndexHeader, StratumHeader, StratumId, TransactionId,
10 | };
11 | use crate::{Database, Error};
12 | 
13 | fn basic<FileManager>(config: Config<FileManager>)
14 | where
15 |     FileManager: file_manager::FileManager,
16 | {
17 |     if config.wal.file_manager.exists(&config.wal.directory) {
18 |         config
19 |             .wal
20 |             .file_manager
21 |             .remove_dir_all(&config.wal.directory)
22 |             .unwrap();
23 |     }
24 | 
25 |     let db = config.clone().recover().unwrap();
26 |     assert!(db.embedded_header().unwrap().is_none());
27 |     let mut tx = db.begin_transaction().unwrap();
28 |     let grain = tx.write(b"hello, world").unwrap();
29 |     println!("Wrote {grain:?}");
30 |     tx.set_embedded_header(Some(grain)).unwrap();
31 |     assert!(db.read(grain).unwrap().is_none());
32 |     let tx_id = tx.commit().unwrap();
33 |     assert_eq!(db.embedded_header().unwrap(), Some(grain));
34 | 
35 |     let verify = |db: &Database<FileManager>| {
36 |         let mut reader = db.read(grain).unwrap().expect("grain not found");
37 |         assert_eq!(reader.length(), 12);
38 |         assert_eq!(reader.bytes_remaining(), reader.length());
39 |         let mut read_contents = [0; 12];
40 |         reader.read_exact(&mut read_contents[..6]).unwrap();
41 |         assert_eq!(reader.bytes_remaining(), 6);
42 |         reader.read_exact(&mut read_contents[6..]).unwrap();
43 |         assert_eq!(reader.bytes_remaining(), 0);
44 |         assert_eq!(&read_contents[..], b"hello, world");
45 | 
46 |         assert_eq!(reader.read(&mut read_contents).unwrap(), 0);
47 | 
48 |         let commit = db.commit_log_head().unwrap().expect("commit log missing");
49 |         assert_eq!(commit.transaction_id, tx_id);
50 |         assert_eq!(commit.new_grains.len(), 1);
51 |         assert_eq!(commit.new_grains[0].id, grain);
52 |         assert_eq!(commit.embedded_header_data, Some(grain));
53 |         assert!(commit.freed_grains.is_empty());
54 |         assert!(commit.archived_grains.is_empty());
55 |         assert!(commit.next_entry(db).unwrap().is_none());
56 |     };
57 | 
58 |     verify(&db);
59 | 
60 |     // Close the database and reopen it. Since this has a default WAL
61 |     // configuration, this transaction will be recovered from the WAL, unlike a
62 |     // lot of the other unit tests.
63 |     db.shutdown().unwrap();
64 |     let db = config.clone().recover().unwrap();
65 | 
66 |     verify(&db);
67 |     db.shutdown().unwrap();
68 | 
69 |     config
70 |         .wal
71 |         .file_manager
72 |         .remove_dir_all(&config.wal.directory)
73 |         .unwrap();
74 | }
75 | 
76 | #[test]
77 | fn basic_std() {
78 |     basic(Config::for_directory("test"));
79 | }
80 | 
81 | #[test]
82 | fn basic_memory() {
83 |     basic(Config::in_memory());
84 | }
85 | 
86 | #[test]
87 | fn wal_checkpoint() {
88 |     let path = Path::new(".test-checkpoint");
89 |     if path.exists() {
90 |         fs::remove_dir_all(path).unwrap();
91 |     }
92 | 
93 |     // Configure the WAL to checkpoint after 10 bytes -- "hello, world" is 12.
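    // (Since the commit crosses that threshold, the WAL should checkpoint
    // the grain into its stratum; the reopened database below should then
    // read it from the stratum file instead of the WAL.)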
94 |     let db = Config::for_directory(path)
95 |         .configure_wal(|wal| wal.checkpoint_after_bytes(10))
96 |         .recover()
97 |         .unwrap();
98 |     let mut tx = db.begin_transaction().unwrap();
99 |     let grain = tx.write(b"hello, world").unwrap();
100 |     assert!(db.read(grain).unwrap().is_none());
101 |     tx.commit().unwrap();
102 |     db.shutdown().unwrap();
103 | 
104 |     let db = Config::for_directory(path)
105 |         .configure_wal(|wal| wal.checkpoint_after_bytes(10))
106 |         .recover()
107 |         .unwrap();
108 |     let contents = db
109 |         .read(grain)
110 |         .unwrap()
111 |         .expect("grain not found")
112 |         .read_all_data()
113 |         .unwrap();
114 |     assert_eq!(contents, b"hello, world");
115 | 
116 |     db.shutdown().unwrap();
117 | 
118 |     fs::remove_dir_all(path).unwrap();
119 | }
120 | 
121 | #[test]
122 | fn wal_checkpoint_loop() {
123 |     let path = Path::new(".test-checkpoint-loop");
124 |     if path.exists() {
125 |         fs::remove_dir_all(path).unwrap();
126 |     }
127 | 
128 |     // Configure the WAL to checkpoint after only 10 bytes so checkpoints happen frequently.
129 |     let mut grains_written = Vec::new();
130 |     for i in 0_usize..10 {
131 |         println!("{i}");
132 |         let db = Config::for_directory(path)
133 |             .configure_wal(|wal| wal.checkpoint_after_bytes(10))
134 |             .recover()
135 |             .unwrap();
136 |         let mut tx = db.begin_transaction().unwrap();
137 |         let grain = dbg!(tx.write(&i.to_be_bytes()).unwrap());
138 |         assert!(db.read(grain).unwrap().is_none());
139 |         grains_written.push(grain);
140 |         tx.commit().unwrap();
141 | 
142 |         for (index, grain) in grains_written.iter().enumerate() {
143 |             dbg!(grain);
144 |             let contents = db
145 |                 .read(*grain)
146 |                 .unwrap()
147 |                 .expect("grain not found")
148 |                 .read_all_data()
149 |                 .unwrap();
150 |             assert_eq!(contents, &index.to_be_bytes());
151 |         }
152 | 
153 |         db.shutdown().unwrap();
154 |     }
155 | 
156 |     let db = Config::for_directory(path)
157 |         .configure_wal(|wal| wal.checkpoint_after_bytes(10))
158 |         .recover()
159 |         .unwrap();
160 |     for (index, grain) in grains_written.iter().enumerate() {
161 |         let contents = db
162 |             .read(*grain)
163 |             .unwrap()
164 |             .expect("grain not found")
165 |             .read_all_data()
166 |             .unwrap();
167 |         assert_eq!(contents, &index.to_be_bytes());
168 |     }
169 | 
170 |     // Verify the commit log is correct. The commit log head will contain the
171 |     // addition of the most recent grain, and we should be able to iterate
172 |     // backwards and find each grain in each entry.
173 |     let mut grains_to_read = grains_written.iter().rev();
174 |     let mut current_commit_log_entry = db.commit_log_head().unwrap();
175 |     while let Some(commit_log_entry) = current_commit_log_entry {
176 |         let expected_grain = grains_to_read.next().expect("too many commit log entries");
177 |         assert_eq!(&commit_log_entry.new_grains[0].id, expected_grain);
178 |         current_commit_log_entry = commit_log_entry.next_entry(&db).unwrap();
179 |     }
180 | 
181 |     db.shutdown().unwrap();
182 | 
183 |     fs::remove_dir_all(path).unwrap();
184 | }
185 | 
186 | #[test]
187 | fn sediment_checkpoint_loop() {
188 |     let path = Path::new(".test-sediment-checkpoint-loop");
189 |     if path.exists() {
190 |         fs::remove_dir_all(path).unwrap();
191 |     }
192 | 
193 |     // Configure the WAL to checkpoint after only 10 bytes so checkpoints happen frequently.
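    // Each iteration below archives the previous grain and asks the
    // database to checkpoint through the previous transaction, so older
    // grain data should eventually be freed and become unreadable.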
194 |     let mut grains_written = Vec::new();
195 |     let mut headers_written = Vec::new();
196 |     let mut tx_id = TransactionId::default();
197 |     for i in 0_usize..10 {
198 |         let db = Config::for_directory(path)
199 |             .configure_wal(|wal| wal.checkpoint_after_bytes(10))
200 |             .recover()
201 |             .unwrap();
202 |         let mut tx = db.begin_transaction().unwrap();
203 |         let new_grain = tx.write(&i.to_be_bytes()).unwrap();
204 |         if let Some(last_grain) = grains_written.last() {
205 |             tx.archive(*last_grain).unwrap();
206 |         }
207 |         grains_written.push(new_grain);
208 |         // The old headers are automatically archived.
209 |         let new_header = tx.write(&i.to_be_bytes()).unwrap();
210 |         tx.set_embedded_header(Some(new_header)).unwrap();
211 |         headers_written.push(new_header);
212 | 
213 |         tx.checkpoint_to(tx_id).unwrap();
214 |         tx_id = tx.commit().unwrap();
215 | 
216 |         db.shutdown().unwrap();
217 |     }
218 | 
219 |     let db = Config::for_directory(path)
220 |         .configure_wal(|wal| wal.checkpoint_after_bytes(10))
221 |         .recover()
222 |         .unwrap();
223 | 
224 |     // Because we close and reopen the database so often, we may not actually
225 |     // have finished the sediment checkpoint yet. This thread sleep gives it
226 |     // time to complete if it was run upon recovery.
227 |     std::thread::sleep(std::time::Duration::from_millis(100));
228 | 
229 |     // Because we archived every grain except the last one, only the most recent grains (the last two, at most) should still be readable.
230 |     for (index, (grain, header)) in grains_written.iter().zip(&headers_written).enumerate() {
231 |         let result = db.read(*grain).unwrap();
232 |         let header_result = db.read(*header).unwrap();
233 |         if index >= grains_written.len() - 2 {
234 |             let contents = result.expect("grain not found").read_all_data().unwrap();
235 |             assert_eq!(contents, &index.to_be_bytes());
236 |             let contents = header_result
237 |                 .expect("grain not found")
238 |                 .read_all_data()
239 |                 .unwrap();
240 |             assert_eq!(contents, &index.to_be_bytes());
241 |         } else if let Some(grain) = result.or(header_result) {
242 |             // Because grain IDs can be reused, we may have "lucked" out and
243 |             // stumbled upon another written grain. If we get an error reading
244 |             // the data or the contents aren't what we expect, this is a passed
245 |             // check.
246 |             if let Ok(contents) = grain.read_all_data() {
247 |                 assert_ne!(contents, &index.to_be_bytes());
248 |             }
249 |         } else {
250 |             // None means the grain couldn't be read.
251 |         }
252 |     }
253 | 
254 |     db.shutdown().unwrap();
255 | 
256 |     fs::remove_dir_all(path).unwrap();
257 | }
258 | 
259 | #[test]
260 | fn rollback() {
261 |     let path = Path::new("rollback");
262 |     if path.exists() {
263 |         fs::remove_dir_all(path).unwrap();
264 |     }
265 | 
266 |     let db = Database::recover(path).unwrap();
267 |     let mut tx = db.begin_transaction().unwrap();
268 |     let grain = tx.write(b"hello, world").unwrap();
269 |     println!("Wrote {grain:?}");
270 |     tx.set_embedded_header(Some(grain)).unwrap();
271 |     assert!(db.read(grain).unwrap().is_none());
272 |     drop(tx);
273 | 
274 |     // Ensure we still didn't get it published.
275 |     assert!(db.read(grain).unwrap().is_none());
276 | 
277 |     // Trying again, we should get the same grain id back.
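    // (Dropping the transaction rolled the reservation back, freeing the
    // grain in the in-memory allocator, so an identical write can be
    // handed the same id again.)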
278 |     let mut tx = db.begin_transaction().unwrap();
279 |     assert_eq!(tx.write(b"hello, world").unwrap(), grain);
280 |     tx.rollback().unwrap();
281 | 
282 |     db.shutdown().unwrap();
283 | 
284 |     fs::remove_dir_all(path).unwrap();
285 | }
286 | 
287 | #[derive(Clone)]
288 | enum WriteCommand<'a> {
289 |     Write {
290 |         target: Target,
291 |         offset: u64,
292 |         bytes: &'a [u8],
293 |     },
294 |     RemoveStratum(Option<StratumId>),
295 |     RemoveIndex,
296 |     DoNothing,
297 | }
298 | 
299 | #[derive(Clone)]
300 | enum Target {
301 |     Grain,
302 |     Stratum,
303 |     Index,
304 | }
305 | 
306 | #[test]
307 | fn last_write_rollback() {
308 |     #[track_caller]
309 |     fn test_write_after(commands: &[WriteCommand], expect_error: bool) {
310 |         let path = Path::new("last-write");
311 |         if path.exists() {
312 |             fs::remove_dir_all(path).unwrap();
313 |         }
314 | 
315 |         let mut written_grains = Vec::new();
316 |         let mut rolled_back_grains = Vec::new();
317 |         let mut commands = commands.iter();
318 |         let mut index = 0_usize;
319 |         loop {
320 |             let db = Config::for_directory(path)
321 |                 .configure_wal(|wal| wal.checkpoint_after_bytes(10))
322 |                 .recover();
323 |             let db = match db {
324 |                 Ok(db) => db,
325 |                 Err(_) if commands.len() == 0 && expect_error => break,
326 |                 Err(err) => unreachable!("error when not expected: {err}"),
327 |             };
328 | 
329 |             for (grain_id, expected_data) in &written_grains {
330 |                 assert_eq!(
331 |                     &db.read(*grain_id)
332 |                         .unwrap()
333 |                         .expect("grain missing")
334 |                         .read_all_data()
335 |                         .unwrap(),
336 |                     expected_data
337 |                 )
338 |             }
339 | 
340 |             for (grain_id, expected_data) in &rolled_back_grains {
341 |                 if let Some(reader) = db.read(*grain_id).unwrap() {
342 |                     // The grain id can be reused, but the contents shouldn't
343 |                     // match. Note that this rollback required forcibly changing
344 |                     // bits after the transaction was written. In a normal crash
345 |                     // or power outage scenario, the grain id wouldn't have been
346 |                     // returned until the data is fully synced to disk.
347 |                     assert_ne!(&reader.read_all_data().unwrap(), expected_data);
348 |                 }
349 |             }
350 | 
351 |             let mut tx = db.begin_transaction().unwrap();
352 |             let data = index
353 |                 .to_be_bytes()
354 |                 .into_iter()
355 |                 .cycle()
356 |                 .take(2000)
357 |                 .collect::<Vec<u8>>();
358 |             index += 1;
359 |             let grain_id = dbg!(tx.write(&data).unwrap());
360 |             assert_eq!(grain_id.grain_count(), 63);
361 |             tx.commit().unwrap();
362 | 
363 |             db.shutdown().unwrap();
364 | 
365 |             match commands.next() {
366 |                 Some(WriteCommand::Write {
367 |                     target,
368 |                     offset,
369 |                     bytes,
370 |                 }) => {
371 |                     let mut file = match target {
372 |                         Target::Grain | Target::Stratum => OpenOptions::new()
373 |                             .read(true)
374 |                             .write(true)
375 |                             .open(path.join(grain_id.basin_and_stratum().to_string()))
376 |                             .unwrap(),
377 |                         Target::Index => OpenOptions::new()
378 |                             .read(true)
379 |                             .write(true)
380 |                             .open(path.join("index"))
381 |                             .unwrap(),
382 |                     };
383 |                     let position = match target {
384 |                         Target::Grain => grain_id.file_position() + *offset,
385 |                         Target::Stratum | Target::Index => *offset,
386 |                     };
387 |                     file.seek(std::io::SeekFrom::Start(position)).unwrap();
388 |                     file.write_all(bytes).unwrap();
389 |                     rolled_back_grains.push((grain_id, data));
390 |                 }
391 |                 Some(WriteCommand::RemoveStratum(Some(stratum))) => {
392 |                     let id = BasinAndStratum::from_parts(grain_id.basin_id(), *stratum);
393 |                     std::fs::remove_file(path.join(id.to_string())).unwrap();
394 |                 }
395 |                 Some(WriteCommand::RemoveStratum(None)) => {
396 |                     std::fs::remove_file(path.join(grain_id.basin_and_stratum().to_string()))
397 |                         .unwrap();
398 |                 }
399 |                 Some(WriteCommand::RemoveIndex) => {
400 |                     std::fs::remove_file(path.join("index")).unwrap();
401 |                 }
402 |                 Some(WriteCommand::DoNothing) => written_grains.push((grain_id, data)),
403 |                 None if expect_error => unreachable!("expected error but no error was encountered"),
404 |                 None => break,
405 |             }
406 |         }
407 | 
408 |         fs::remove_dir_all(path).unwrap();
409 |     }
410 | 
411 |     // Test removing the stratum after it's been created. This simulates a file
412 |     // being written but the directory metadata not being synchronized, causing
413 |     // the file's record to be entirely lost.
414 |     test_write_after(&[WriteCommand::RemoveStratum(None)], false);
415 |     // Test overwriting the headers with junk bytes -- an edge case where the
416 |     // file record was synced but the headers weren't.
417 |     test_write_after(
418 |         &[WriteCommand::Write {
419 |             target: Target::Stratum,
420 |             offset: 0,
421 |             bytes: &[1; 16_384],
422 |         }],
423 |         false,
424 |     );
425 |     test_write_after(
426 |         &[
427 |             WriteCommand::DoNothing,
428 |             WriteCommand::Write {
429 |                 target: Target::Stratum,
430 |                 offset: StratumHeader::BYTES,
431 |                 bytes: &[1; 16_384],
432 |             },
433 |         ],
434 |         false,
435 |     );
436 |     test_write_after(
437 |         &[
438 |             WriteCommand::DoNothing,
439 |             WriteCommand::DoNothing,
440 |             WriteCommand::Write {
441 |                 target: Target::Stratum,
442 |                 offset: 0,
443 |                 bytes: &[1; 16_384 * 2],
444 |             },
445 |         ],
446 |         true,
447 |     );
448 |     // Test overwriting the header with a valid but incorrect header. This
449 |     // shouldn't ever happen in practice, because recovery is supposed to
450 |     // overwrite the bad headers.
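    // (A stratum keeps two copies of its header, and the copy with the
    // higher transaction id wins on recovery, so a stale-but-crc-valid copy
    // has to be detected by cross-checking against the commit log instead.)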
451 |     let mut valid_header = StratumHeader::default();
452 |     let mut valid_header_bytes = Vec::new();
453 |     valid_header.transaction_id = TransactionId::from(1);
454 |     valid_header.write_to(&mut valid_header_bytes).unwrap();
455 |     test_write_after(
456 |         &[WriteCommand::Write {
457 |             target: Target::Stratum,
458 |             offset: 0,
459 |             bytes: &valid_header_bytes,
460 |         }],
461 |         false,
462 |     );
463 |     valid_header.transaction_id = TransactionId::from(2);
464 |     valid_header_bytes.clear();
465 |     valid_header.write_to(&mut valid_header_bytes).unwrap();
466 |     test_write_after(
467 |         &[
468 |             WriteCommand::DoNothing,
469 |             WriteCommand::Write {
470 |                 target: Target::Stratum,
471 |                 offset: StratumHeader::BYTES,
472 |                 bytes: &valid_header_bytes,
473 |             },
474 |         ],
475 |         false,
476 |     );
477 |     // Test overwriting both headers with crc-valid, but not actually valid,
478 |     // headers.
479 |     valid_header.write_to(&mut valid_header_bytes).unwrap();
480 |     test_write_after(
481 |         &[
482 |             WriteCommand::DoNothing,
483 |             WriteCommand::Write {
484 |                 target: Target::Stratum,
485 |                 offset: 0,
486 |                 bytes: &valid_header_bytes,
487 |             },
488 |         ],
489 |         true,
490 |     );
491 | 
492 |     // Test overwriting a grain's transaction ID in both the first and second
493 |     // headers.
494 |     test_write_after(
495 |         &[WriteCommand::Write {
496 |             target: Target::Grain,
497 |             offset: 0,
498 |             bytes: &[0xFF],
499 |         }],
500 |         false,
501 |     );
502 | 
503 |     test_write_after(
504 |         &[
505 |             WriteCommand::DoNothing,
506 |             WriteCommand::Write {
507 |                 target: Target::Grain,
508 |                 offset: 0,
509 |                 bytes: &[0xFF],
510 |             },
511 |         ],
512 |         false,
513 |     );
514 | 
515 |     // Test mutating the grain data, causing its CRC to fail to validate.
516 |     test_write_after(
517 |         &[WriteCommand::Write {
518 |             target: Target::Grain,
519 |             offset: 13,
520 |             bytes: &[0xFF],
521 |         }],
522 |         false,
523 |     );
524 | 
525 |     test_write_after(
526 |         &[
527 |             WriteCommand::DoNothing,
528 |             WriteCommand::Write {
529 |                 target: Target::Grain,
530 |                 offset: 13,
531 |                 bytes: &[0xFF],
532 |             },
533 |         ],
534 |         false,
535 |     );
536 | 
537 |     // Test overwriting the stratum header.
538 |     test_write_after(
539 |         &[WriteCommand::Write {
540 |             target: Target::Stratum,
541 |             offset: 0,
542 |             bytes: &[0xFF],
543 |         }],
544 |         false,
545 |     );
546 | 
547 |     test_write_after(
548 |         &[
549 |             WriteCommand::DoNothing,
550 |             WriteCommand::Write {
551 |                 target: Target::Stratum,
552 |                 offset: StratumHeader::BYTES,
553 |                 bytes: &[0xFF],
554 |             },
555 |         ],
556 |         false,
557 |     );
558 | 
559 |     // Test mucking with the index file.
560 |     test_write_after(
561 |         &[WriteCommand::Write {
562 |             target: Target::Index,
563 |             offset: 0,
564 |             bytes: &[0xFF],
565 |         }],
566 |         false,
567 |     );
568 | 
569 |     test_write_after(
570 |         &[
571 |             WriteCommand::DoNothing,
572 |             WriteCommand::Write {
573 |                 target: Target::Index,
574 |                 offset: IndexHeader::BYTES,
575 |                 bytes: &[0xFF],
576 |             },
577 |         ],
578 |         false,
579 |     );
580 | 
581 |     test_write_after(
582 |         &[
583 |             WriteCommand::DoNothing,
584 |             WriteCommand::Write {
585 |                 target: Target::Index,
586 |                 offset: IndexHeader::BYTES,
587 |                 bytes: &[0xFF],
588 |             },
589 |         ],
590 |         false,
591 |     );
592 | 
593 |     // Test writing a "valid" index header, but with broken data.
594 |     let mut index_header = IndexHeader {
595 |         // Point the commit log to an invalid id.
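        // (The id below parses successfully; it is "invalid" only in the
        // sense that no grain was ever written at that location.)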
596 |         transaction_id: TransactionId::from(1),
597 |         commit_log_head: "71fffffffffe-fffff".parse().ok(),
598 |         ..IndexHeader::default()
599 |     };
600 |     let mut index_header_bytes = Vec::new();
601 |     index_header.write_to(&mut index_header_bytes).unwrap();
602 |     test_write_after(
603 |         &[WriteCommand::Write {
604 |             target: Target::Index,
605 |             offset: 0,
606 |             bytes: &index_header_bytes,
607 |         }],
608 |         false,
609 |     );
610 |     index_header.transaction_id = TransactionId::from(2);
611 |     index_header_bytes.clear();
612 |     index_header.write_to(&mut index_header_bytes).unwrap();
613 |     test_write_after(
614 |         &[
615 |             WriteCommand::DoNothing,
616 |             WriteCommand::Write {
617 |                 target: Target::Index,
618 |                 offset: IndexHeader::BYTES,
619 |                 bytes: &index_header_bytes,
620 |             },
621 |         ],
622 |         false,
623 |     );
624 | 
625 |     // Test removing the index file. This should generate an error because the
626 |     // existing strata can be found.
627 |     test_write_after(&[WriteCommand::RemoveIndex], true);
628 | 
629 |     // Test writing a valid index header, then overwriting part of the second header,
630 |     // causing one header to fail to validate while the other can't parse due to
631 |     // a crc error. This should never happen in real life.
632 |     index_header.transaction_id = TransactionId::from(1);
633 |     index_header_bytes.clear();
634 |     index_header.write_to(&mut index_header_bytes).unwrap();
635 |     // Overwrite part of the transaction id of the second header, causing its crc to fail.
636 |     index_header_bytes.push(1);
637 |     test_write_after(
638 |         &[WriteCommand::Write {
639 |             target: Target::Index,
640 |             offset: 0,
641 |             bytes: &index_header_bytes,
642 |         }],
643 |         true,
644 |     );
645 | 
646 |     // Write enough data to need two strata, then remove the first to receive an
647 |     // error. With the current allocation strategy, a commit log entry that
648 |     // describes 1 new grain requires 3 grains. The test function writes each
649 |     // grain such that it takes up 63 consecutive grains.
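    // (With 16,372 grains per stratum and roughly 3 + 63 = 66 grains
    // consumed per iteration, `16_372 / 66 + 1` = 249 iterations is enough
    // to spill into a second stratum.)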
650 |     let mut commands = vec![WriteCommand::DoNothing; 16_372 / (3 + 63) + 1];
651 |     *commands.last_mut().unwrap() = WriteCommand::RemoveStratum(Some(StratumId::new(0).unwrap()));
652 |     test_write_after(&commands, true);
653 | }
654 | 
655 | #[test]
656 | fn invalid_checkpointing() {
657 |     let path = Path::new("invalid-checkpointing");
658 |     if path.exists() {
659 |         fs::remove_dir_all(path).unwrap();
660 |     }
661 | 
662 |     let db = Database::recover(path).unwrap();
663 |     let mut tx = db.begin_transaction().unwrap();
664 |     assert!(matches!(
665 |         tx.checkpoint_to(TransactionId::from(1)).unwrap_err(),
666 |         Error::InvalidTransactionId
667 |     ));
668 |     assert!(matches!(
669 |         tx.checkpointed_to(TransactionId::from(1)).unwrap_err(),
670 |         Error::InvalidTransactionId
671 |     ));
672 |     drop(tx);
673 | 
674 |     db.shutdown().unwrap();
675 |     fs::remove_dir_all(path).unwrap();
676 | }
677 | 
--------------------------------------------------------------------------------
/src/format.rs:
--------------------------------------------------------------------------------
1 | use std::fmt::{Display, Write as _};
2 | use std::io::{BufWriter, Read, Seek, Write};
3 | use std::ops::{AddAssign, Deref, DerefMut};
4 | use std::str::FromStr;
5 | 
6 | use crc32c::crc32c;
7 | use okaywal::EntryId;
8 | 
9 | use crate::commit_log::CommitLogEntry;
10 | use crate::{Error, Result};
11 | 
12 | #[derive(Clone, Copy, Eq, PartialEq, Hash, Ord, PartialOrd)]
13 | pub struct GrainId(u64);
14 | 
15 | impl GrainId {
16 |     pub const NONE: Self = Self(u64::MAX);
17 | 
18 |     pub const fn new(basin: BasinId, stratum: StratumId, id: LocalGrainId) -> Self {
19 |         Self((basin.0 as u64) << 61 | stratum.0 << 20 | id.0)
20 |     }
21 | 
22 |     pub fn from_bytes(bytes: &[u8]) -> Option<Self> {
23 |         let value = u64::from_be_bytes(bytes.try_into().ok()?);
24 |         if value != u64::MAX {
25 |             Some(Self(value))
26 |         } else {
27 |             None
28 |         }
29 |     }
30 | 
31 |     pub const fn to_bytes(self) -> [u8; 8] {
32 |         self.0.to_be_bytes()
33 |     }
34 | 
35 |     pub const fn basin_id(self) -> BasinId {
36 |         BasinId((self.0 >> 61) as u8)
37 |     }
38 | 
39 |     pub fn local_grain_id(self) -> LocalGrainId {
40 |         LocalGrainId(self.0 & 0xF_FFFF)
41 |     }
42 | 
43 |     pub const fn local_grain_index(self) -> GrainIndex {
44 |         GrainIndex(((self.0 >> 6) & 0x3FFF) as u16)
45 |     }
46 | 
47 |     pub const fn grain_count(self) -> u8 {
48 |         (self.0 & 0x3f) as u8
49 |     }
50 | 
51 |     pub const fn basin_and_stratum(self) -> BasinAndStratum {
52 |         BasinAndStratum(self.0 >> 20)
53 |     }
54 | 
55 |     pub const fn stratum_id(self) -> StratumId {
56 |         StratumId((self.0 >> 20) & 0x1ff_ffff_ffff)
57 |     }
58 | 
59 |     pub(crate) const fn file_position(self) -> u64 {
60 |         let grain_size = self.basin_id().grain_stripe_bytes() as u64;
61 |         let index = self.local_grain_index().as_u16() as u64;
62 |         let header_size = StratumHeader::BYTES * 2;
63 | 
64 |         header_size + index * grain_size
65 |     }
66 | }
67 | 
68 | impl FromStr for GrainId {
69 |     type Err = GrainIdError;
70 | 
71 |     fn from_str(s: &str) -> Result<Self, Self::Err> {
72 |         let mut parts = s.split('-');
73 |         let basin_and_stratum = parts.next().ok_or(GrainIdError::InvalidFormat)?;
74 |         let index = parts.next().ok_or(GrainIdError::InvalidFormat)?;
75 |         if parts.next().is_some() || basin_and_stratum.len() < 2 {
76 |             return Err(GrainIdError::InvalidFormat);
77 |         }
78 | 
79 |         let basin_and_stratum = BasinAndStratum::from_str(basin_and_stratum)?;
80 | 
81 |         let index_and_count =
82 |             u64::from_str_radix(index, 16).map_err(|_| GrainIdError::InvalidGrainIndex)?;
83 |         let count = (index_and_count & 0x3f) as u8;
84 |         let index = GrainIndex::new((index_and_count >> 6) as u16)
85 |             .ok_or(GrainIdError::InvalidGrainIndex)?;
86 |         let id = LocalGrainId::from_parts(index, count).ok_or(GrainIdError::InvalidGrainIndex)?;
87 | 
88 |         Ok(Self::new(
89 |             basin_and_stratum.basin(),
90 |             basin_and_stratum.stratum(),
91 |             id,
92 |         ))
93 |     }
94 | }
95 | 
96 | impl std::fmt::Debug for GrainId {
97 |     fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
98 |         let basin_id = self.basin_id();
99 |         let stratum_id = self.stratum_id();
100 |         let local_index = self.local_grain_index();
101 |         let count = self.grain_count();
102 |         f.debug_struct("GrainId")
103 |             .field("basin", &basin_id.0)
104 |             .field("stratum", &stratum_id.0)
105 |             .field("index", &local_index.0)
106 |             .field("count", &count)
107 |             .finish()
108 |     }
109 | }
110 | 
111 | #[derive(Debug, Eq, PartialEq)]
112 | pub enum GrainIdError {
113 |     InvalidFormat,
114 |     InvalidBasinId,
115 |     InvalidStratum,
116 |     InvalidGrainIndex,
117 |     InvalidGrainCount,
118 | }
119 | 
120 | impl Display for GrainId {
121 |     fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
122 |         let basin_and_stratum = self.basin_and_stratum();
123 |         let local_index = self.local_grain_id();
124 |         write!(f, "{basin_and_stratum}-{local_index}")
125 |     }
126 | }
127 | 
128 | #[test]
129 | fn grain_id_strings() {
130 |     let zero = GrainId(0);
131 |     assert_eq!(zero.to_string(), "00-0");
132 |     let none = GrainId::NONE;
133 |     assert_eq!(none.to_string(), "71ffffffffff-fffff");
134 |     assert_eq!(
135 |         GrainId::from_str("71ffffffffff-fffff").unwrap(),
136 |         GrainId::NONE
137 |     );
138 |     assert!(GrainId::from_str("72fffffffffff-fffff").is_err());
139 |     assert!(GrainId::from_str("71fffffffffff-1fffff").is_err());
140 |     assert!(GrainId::from_str("81fffffffffff-3fff").is_err());
141 |     assert!(GrainId::from_str("---").is_err());
142 |     assert!(GrainId::from_str("71ffffffffff-FFFFFFFFFFFFFFFFF").is_err());
143 |     assert!(GrainId::from_str("0FFFFFFFFFFFFFFFFF-3fff").is_err());
144 | }
145 | 
146 | #[derive(Clone, Copy, Debug, Hash, Eq, PartialEq, PartialOrd, Ord)]
147 | pub struct StratumId(u64);
148 | 
149 | impl StratumId {
150 |     pub const fn new(id: u64) -> Option<Self> {
151 |         if id < 2_u64.pow(45) {
152 |             Some(Self(id))
153 |         } else {
154 |             None
155 |         }
156 |     }
157 | 
158 |     pub const fn as_usize(self) -> usize {
159 |         self.0 as usize
160 |     }
161 | 
162 |     pub const fn as_u64(self) -> u64 {
163 |         self.0
164 |     }
165 | }
166 | 
167 | impl Display for StratumId {
168 |     fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
169 |         write!(f, "{:0x}", self.0)
170 |     }
171 | }
172 | 
173 | #[derive(Clone, Copy, Debug, Hash, Eq, PartialEq, Default)]
174 | pub struct BasinId(u8);
175 | 
176 | impl BasinId {
177 |     pub const MAX: Self = BasinId(7);
178 |     pub const MIN: Self = BasinId(0);
179 | 
180 |     pub const fn new(id: u8) -> Option<Self> {
181 |         if id < 8 {
182 |             Some(Self(id))
183 |         } else {
184 |             None
185 |         }
186 |     }
187 | 
188 |     pub fn to_char(self) -> char {
189 |         (b'0' + self.0) as char
190 |     }
191 | 
192 |     pub fn from_char(ch: char) -> Option<Self> {
193 |         if ('0'..='7').contains(&ch) {
194 |             Some(Self(ch as u8 - b'0'))
195 |         } else {
196 |             None
197 |         }
198 |     }
199 | 
200 |     pub const fn index(self) -> u8 {
201 |         self.0
202 |     }
203 | 
204 |     pub const fn next(self) -> Option<Self> {
205 |         Self::new(self.0 + 1)
206 |     }
207 | }
208 | 
209 | impl Display for BasinId {
210 |     fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
211 |         f.write_char(self.to_char())
212 |     }
213 | }
214 | 
215 | impl BasinId {
216 |     pub const fn grain_stripe_bytes(self) -> u32 {
217 |         match self.0 {
218 |             0 => 2_u32.pow(5),
219 |             1 => 2_u32.pow(8),
220 |             2 => 2_u32.pow(12),
221 |             3 => 2_u32.pow(16),
222 |             4 => 2_u32.pow(20),
223 |             5 => 2_u32.pow(24),
224 |             6 => 2_u32.pow(28),
225 |             7 => 2_u32.pow(31),
226 |             _ => unreachable!(),
227 |         }
228 |     }
229 | }
230 | 
231 | #[derive(Clone, Copy, Debug, Hash, Eq, PartialEq, PartialOrd, Ord)]
232 | pub struct BasinAndStratum(u64);
233 | 
234 | impl BasinAndStratum {
235 |     pub const fn from_parts(basin: BasinId, stratum: StratumId) -> Self {
236 |         Self((basin.0 as u64) << 41 | stratum.0)
237 |     }
238 | 
239 |     pub fn basin(self) -> BasinId {
240 |         BasinId((self.0 >> 41) as u8)
241 |     }
242 | 
243 |     pub fn stratum(self) -> StratumId {
244 |         StratumId(self.0 & 0x1ff_ffff_ffff)
245 |     }
246 | }
247 | 
248 | impl Display for BasinAndStratum {
249 |     fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
250 |         let basin = self.basin();
251 |         let stratum = self.stratum();
252 |         write!(f, "{basin}{stratum}")
253 |     }
254 | }
255 | 
256 | impl FromStr for BasinAndStratum {
257 |     type Err = GrainIdError;
258 | 
259 |     fn from_str(basin_and_stratum: &str) -> Result<Self, Self::Err> {
260 |         let (basin, stratum) = basin_and_stratum.split_at(1);
261 |         let Some(basin) = BasinId::from_char(basin.as_bytes()[0] as char)
262 |         else { return Err(GrainIdError::InvalidBasinId) };
263 | 
264 |         let stratum = u64::from_str_radix(stratum, 16).map_err(|_| GrainIdError::InvalidStratum)?;
265 |         let stratum = StratumId::new(stratum).ok_or(GrainIdError::InvalidStratum)?;
266 | 
267 |         Ok(Self(u64::from(basin.0) << 41 | stratum.0))
268 |     }
269 | }
270 | 
271 | #[test]
272 | fn basin_id_encoding() {
273 |     for (ch, value) in ('0'..='7').zip(0..=7) {
274 |         let expected = BasinId(value);
275 |         assert_eq!(BasinId::from_char(ch), Some(expected));
276 |         assert_eq!(expected.to_char(), ch);
277 |     }
278 | }
279 | 
280 | #[derive(Clone, Copy, Debug, Eq, PartialEq, Ord, PartialOrd, Hash)]
281 | pub struct GrainIndex(u16);
282 | 
283 | impl GrainIndex {
284 |     pub const fn new(id: u16) -> Option<Self> {
285 |         if id < 2_u16.pow(14) {
286 |             Some(Self(id))
287 |         } else {
288 |             None
289 |         }
290 |     }
291 | 
292 |     pub const fn as_u16(self) -> u16 {
293 |         self.0
294 |     }
295 | }
296 | 
297 | impl AddAssign<u8> for GrainIndex {
298 |     fn add_assign(&mut self, rhs: u8) {
299 |         self.0 += u16::from(rhs);
300 |     }
301 | }
302 | 
303 | #[derive(Clone, Copy, Debug, Eq, PartialEq)]
304 | pub struct LocalGrainId(u64);
305 | 
306 | impl LocalGrainId {
307 |     pub const fn from_parts(index: GrainIndex, grain_count: u8) -> Option<Self> {
308 |         if grain_count < 64 {
309 |             Some(Self((index.0 as u64) << 6 | grain_count as u64))
310 |         } else {
311 |             None
312 |         }
313 |     }
314 | 
315 |     pub const fn grain_index(self) -> GrainIndex {
316 |         GrainIndex((self.0 >> 6) as u16)
317 |     }
318 | 
319 |     pub const fn grain_count(self) -> u8 {
320 |         (self.0 & 0x3f) as u8
321 |     }
322 | }
323 | 
324 | impl Display for LocalGrainId {
325 |     fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
326 |         write!(f, "{:0x}", self.0)
327 |     }
328 | }
329 | 
330 | #[derive(Clone, Copy, Debug, Eq, PartialEq, PartialOrd, Ord, Default)]
331 | pub struct TransactionId(u64);
332 | 
333 | impl TransactionId {
334 |     pub fn to_be_bytes(self) -> [u8; 8] {
335 |         self.0.to_be_bytes()
336 |     }
337 | 
338 |     pub const fn from_be_bytes(bytes: [u8; 8]) -> Self {
339 |         Self(u64::from_be_bytes(bytes))
340 |     }
341 | }
342 | 
343 | impl From<u64> for TransactionId {
344 |     fn from(id: u64) -> Self {
345 |         TransactionId(id)
346 |     }
347 | }
348 | 
349 | impl From<TransactionId> for u64 {
350 |     fn from(id: TransactionId) -> Self {
351 |         id.0
352 |     }
353 | }
354 | 
355 | impl From<EntryId> for TransactionId {
356 |     fn from(id: EntryId) -> Self {
357 |         Self(id.0)
358 |     }
359 | }
360 | 
361 | impl From<TransactionId> for EntryId {
362 |     fn from(tx_id: TransactionId) -> Self {
363 |         EntryId(tx_id.0)
364 |     }
365 | }
366 | 
367 | impl PartialEq<u64> for TransactionId {
368 |     fn eq(&self, other: &u64) -> bool {
369 |         self.0 == *other
370 |     }
371 | }
372 | 
373 | impl PartialEq<EntryId> for TransactionId {
374 |     fn eq(&self, other: &EntryId) -> bool {
375 |         self.0 == other.0
376 |     }
377 | }
378 | 
379 | impl PartialOrd<EntryId> for TransactionId {
380 |     fn partial_cmp(&self, other: &EntryId) -> Option<std::cmp::Ordering> {
381 |         self.0.partial_cmp(&other.0)
382 |     }
383 | }
384 | 
385 | pub enum FileHeader<T> {
386 |     Both(T, T),
387 |     First(T),
388 |     Second(T),
389 | }
390 | 
391 | impl<T> FileHeader<T>
392 | where
393 |     T: Duplicable,
394 | {
395 |     pub fn read_from<R: Read + Seek>(mut file: R, scratch: &mut Vec<u8>) -> Result<Self> {
396 |         let first_header = T::read_from(&mut file, scratch);
397 |         if first_header.is_err() {
398 |             file.seek(std::io::SeekFrom::Start(T::BYTES))?;
399 |         }
400 |         let second_header = T::read_from(&mut file, scratch);
401 |         match (first_header, second_header) {
402 |             (Ok(first_header), Ok(second_header)) => Ok(Self::Both(first_header, second_header)),
403 |             (Err(err), Err(_)) => Err(err),
404 |             (Ok(first_header), Err(_)) => Ok(Self::First(first_header)),
405 |             (Err(_), Ok(second_header)) => Ok(Self::Second(second_header)),
406 |         }
407 |     }
408 | 
409 |     pub fn as_options(&self) -> (Option<&T>, Option<&T>) {
410 |         match self {
411 |             FileHeader::Both(first, second) => (Some(first), Some(second)),
412 |             FileHeader::First(first) => (Some(first), None),
413 |             FileHeader::Second(second) => (None, Some(second)),
414 |         }
415 |     }
416 | 
417 |     pub fn into_first(self) -> T {
418 |         match self {
419 |             FileHeader::Both(first, _) | FileHeader::First(first) => first,
420 |             FileHeader::Second(_) => unreachable!("did not contain a valid first"),
421 |         }
422 |     }
423 | 
424 |     pub fn into_second(self) -> T {
425 |         match self {
426 |             FileHeader::Both(_, second) | FileHeader::Second(second) => second,
427 |             FileHeader::First(_) => unreachable!("did not contain a valid second"),
428 |         }
429 |     }
430 | }
431 | 
432 | pub trait Duplicable: Sized {
433 |     const BYTES: u64;
434 | 
435 |     fn read_from<R: Read + Seek>(reader: R, scratch: &mut Vec<u8>) -> Result<Self>;
436 |     fn write_to<W: Write>(&mut self, writer: W) -> Result<()>;
437 | }
438 | 
439 | /// A header inside of an "Index" file.
440 | ///
441 | /// This data structure is serialized as:
442 | ///
443 | /// - `transaction_id`: 8 bytes
444 | /// - `embedded_header_data`: 8 bytes
445 | /// - `commit_log_head`: 8 bytes
446 | /// - `checkpoint_target`: 8 bytes
447 | /// - `checkpointed_to`: 8 bytes
448 | /// - `basin_strata_count`: 8 x 6 bytes (48 bytes).
449 | /// - `crc32`: 4 bytes (checksum of previous 88 bytes)
450 | ///
451 | /// The total header length is 92 bytes.
452 | ///
453 | /// # About the Index file
454 | ///
455 | /// Index files are the root of a Sediment database. The header is responsible
456 | /// for pointing to several key pieces of data, which will be stored within the
457 | /// other files.
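///
/// Both copies can be read back with [`FileHeader::read_from`]; a
/// hypothetical recovery sketch (not the crate's actual recovery code) that
/// keeps the newer copy might look like:
///
/// ```ignore
/// let header = match FileHeader::<IndexHeader>::read_from(&mut file, &mut scratch)? {
///     FileHeader::Both(first, second) => {
///         if first.transaction_id >= second.transaction_id {
///             first
///         } else {
///             second
///         }
///     }
///     FileHeader::First(only) | FileHeader::Second(only) => only,
/// };
/// ```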
458 | ///
459 | /// The Index file is serialized in this fashion:
460 | ///
461 | /// - Magic code + version (4 bytes)
462 | /// - [`IndexHeader`]
463 | /// - [`IndexHeader`]
464 | ///
465 | /// The record with the highest transaction id should be checked upon recovery
466 | /// to ensure that the grain pointed to by `embedded_header_data` was written
467 | /// by the same [`TransactionId`].
468 | #[derive(Debug, Clone, Eq, PartialEq, Default)]
469 | pub struct IndexHeader {
470 |     pub transaction_id: TransactionId,
471 |     pub embedded_header_data: Option<GrainId>,
472 |     pub commit_log_head: Option<GrainId>,
473 |     pub checkpoint_target: TransactionId,
474 |     pub checkpointed_to: TransactionId,
475 |     pub basin_strata_count: [u64; 8],
476 |     pub crc32: u32,
477 | }
478 | 
479 | impl Duplicable for IndexHeader {
480 |     const BYTES: u64 = 92;
481 | 
482 |     fn read_from<R: Read + Seek>(mut file: R, scratch: &mut Vec<u8>) -> Result<Self> {
483 |         scratch.resize(Self::BYTES as usize, 0);
484 |         file.read_exact(scratch)?;
485 |         let crc32 = u32::from_be_bytes(scratch[88..].try_into().expect("u32 is 4 bytes"));
486 |         let computed_crc = crc32c(&scratch[..88]);
487 |         if crc32 != computed_crc {
488 |             return Err(Error::ChecksumFailed);
489 |         }
490 | 
491 |         let (transaction_bytes, remaining) = scratch.split_at(8);
492 |         let transaction_id = TransactionId(u64::from_be_bytes(
493 |             transaction_bytes.try_into().expect("u64 is 8 bytes"),
494 |         ));
495 |         let (embedded_header_bytes, remaining) = remaining.split_at(8);
496 |         let embedded_header_data = GrainId::from_bytes(embedded_header_bytes);
497 |         let (commit_log_head_bytes, remaining) = remaining.split_at(8);
498 |         let commit_log_head = GrainId::from_bytes(commit_log_head_bytes);
499 |         let (checkpoint_target_bytes, remaining) = remaining.split_at(8);
500 |         let checkpoint_target = TransactionId(u64::from_be_bytes(
501 |             checkpoint_target_bytes.try_into().expect("u64 is 8 bytes"),
502 |         ));
503 |         let (checkpointed_to_bytes, mut remaining) = remaining.split_at(8);
504 |         let checkpointed_to = TransactionId(u64::from_be_bytes(
505 |             checkpointed_to_bytes.try_into().expect("u64 is 8 bytes"),
506 |         ));
507 |         let mut basin_strata_count = [0; 8];
508 |         for count in &mut basin_strata_count {
509 |             let mut padded_bytes = [0; 8];
510 |             padded_bytes[2..].copy_from_slice(&remaining[..6]);
511 |             remaining = &remaining[6..];
512 |             *count = u64::from_be_bytes(padded_bytes);
513 |         }
514 | 
515 |         Ok(Self {
516 |             transaction_id,
517 |             embedded_header_data,
518 |             commit_log_head,
519 |             checkpoint_target,
520 |             checkpointed_to,
521 |             basin_strata_count,
522 |             crc32,
523 |         })
524 |     }
525 | 
526 |     fn write_to<W: Write>(&mut self, writer: W) -> Result<()> {
527 |         let mut writer = ChecksumWriter::new(writer);
528 |         writer.write_all(&self.transaction_id.to_be_bytes())?;
529 |         writer.write_all(
530 |             &self
531 |                 .embedded_header_data
532 |                 .unwrap_or(GrainId::NONE)
533 |                 .0
534 |                 .to_be_bytes(),
535 |         )?;
536 |         writer.write_all(
537 |             &self
538 |                 .commit_log_head
539 |                 .unwrap_or(GrainId::NONE)
540 |                 .0
541 |                 .to_be_bytes(),
542 |         )?;
543 |         writer.write_all(&self.checkpoint_target.to_be_bytes())?;
544 |         writer.write_all(&self.checkpointed_to.to_be_bytes())?;
545 |         for count in &self.basin_strata_count {
546 |             writer.write_all(&count.to_be_bytes()[2..])?;
547 |         }
548 |         let (_, crc32) = writer.write_crc32_and_finish()?;
549 |         self.crc32 = crc32;
550 | 
551 |         Ok(())
552 |     }
553 | }
554 | 
555 | /// Each Stratum header is 16kb, and describes the state of allocation of each
556 | /// grain within the Stratum.
557 | ///
558 | /// It is serialized as:
559 | ///
560 | /// - [`TransactionId`]: 8 bytes
561 | /// - [`GrainAllocationInfo`]: 16,372 one-byte entries
562 | /// - CRC32: 4 bytes
563 | ///
564 | /// The grain size is determined by the name of the file that contains the
565 | /// header.
566 | ///
567 | /// # About Stratum files
568 | ///
569 | /// Strata contain the data written to the Sediment database.
570 | ///
571 | /// The header consists of two [`StratumHeader`]s serialized one after another.
572 | /// The header with the latest [`TransactionId`] is considered the current
573 | /// record. When updating the header, the inactive copy should be overwritten.
574 | ///
575 | /// If an aborted write is detected and a rollback needs to happen, the rolled
576 | /// back header should be overwritten with a second copy of the previous
577 | /// version.
578 | ///
579 | /// Directly after the two [`StratumHeader`]s is a tightly packed list of
580 | /// grains. Each grain is serialized as:
581 | ///
582 | /// - [`TransactionId`]: 8 bytes
583 | /// - Data Length: 4 bytes
584 | /// - Grain Data: The contiguous data stored within the grain.
585 | /// - CRC32: The CRC of the grain data.
586 | ///
587 | /// Strata are grouped together to form a Basin. In each Basin, the grain stripe
588 | /// size is always the same. The Basin's grain size is determined by the name of
589 | /// the Stratum file. The first character is the Basin's id (`0`-`7`), which maps
590 | /// to a fixed grain size exponent (`2^5` bytes for basin `0` up to `2^31` for
591 | /// basin `7`). Because each piece of data must have 16 extra bytes allocated to
592 | /// it, the smallest grain size is 32 bytes (an exponent of 5).
593 | ///
594 | /// To find the data associated with a grain, its local grain index must be
595 | /// computed. Because each Stratum can contain a maximum of 16,372 grains, the
596 | /// remaining characters in a Stratum's file name are a hexadecimal
597 | /// representation of the stratum id. The basin and stratum ids make up the top
598 | /// 44 bits of a `GrainId`; the low 20 bits hold the local grain id (a 14-bit
599 | /// grain index and a 6-bit grain count).
600 | ///
601 | /// The offset of a local grain index is `32kb + local_grain_index *
602 | /// grain_size`. Because grains can be stored in stripes of up to 64 consecutive
603 | /// grains, not every local grain index will point to the start of a grain
604 | /// record. The [`StratumHeader`] must be used to determine if a given local
605 | /// grain index is valid before trusting the data stored.
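///
/// For example (illustrative arithmetic only): in basin `0`, grains are
/// `2^5 = 32` bytes, so local grain index `3` starts at `32_768 + 3 * 32 =
/// 32_864` bytes into the stratum file.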
606 | #[derive(Debug)]
607 | pub struct StratumHeader {
608 |     pub transaction_id: TransactionId,
609 |     pub grains: [u8; 16372],
610 |     pub crc32: u32,
611 | }
612 | 
613 | impl StratumHeader {
614 |     pub const fn grain_info(&self, index: usize) -> GrainAllocationInfo {
615 |         GrainAllocationInfo(self.grains[index])
616 |     }
617 | 
618 |     pub fn reflects_changes_from(&self, commit_log: &CommitLogEntry) -> bool {
619 |         let new_grains = commit_log
620 |             .new_grains
621 |             .iter()
622 |             .map(|new_grain| (GrainAllocationStatus::Allocated, new_grain.id));
623 |         let archived_grains = commit_log
624 |             .archived_grains
625 |             .iter()
626 |             .map(|grain| (GrainAllocationStatus::Archived, *grain));
627 |         let freed_grains = commit_log
628 |             .freed_grains
629 |             .iter()
630 |             .map(|grain| (GrainAllocationStatus::Free, *grain));
631 |         for (expected_status, grain_id) in new_grains.chain(archived_grains).chain(freed_grains) {
632 |             let start = usize::from(grain_id.local_grain_index().as_u16());
633 |             let mut expected_count = grain_id.grain_count();
634 |             for info in self
635 |                 .grains
636 |                 .iter()
637 |                 .skip(start)
638 |                 .take(usize::from(expected_count))
639 |             {
640 |                 let info = GrainAllocationInfo(*info);
641 | 
642 |                 let matches = if info.status() == Some(expected_status) {
643 |                     if expected_status == GrainAllocationStatus::Free {
644 |                         info.count() == 0
645 |                     } else {
646 |                         info.count() == expected_count
647 |                     }
648 |                 } else {
649 |                     false
650 |                 };
651 | 
652 |                 if !matches {
653 |                     return false;
654 |                 }
655 | 
656 |                 expected_count -= 1;
657 |             }
658 |         }
659 | 
660 |         true
661 |     }
662 | }
663 | 
664 | impl Duplicable for StratumHeader {
665 |     const BYTES: u64 = 16_384;
666 | 
667 |     fn read_from<R: Read + Seek>(mut file: R, scratch: &mut Vec<u8>) -> Result<Self> {
668 |         scratch.resize(16_384, 0);
669 |         file.read_exact(scratch)?;
670 | 
671 |         let mut grains = [0; 16_372];
672 | 
673 |         let crc32 = u32::from_be_bytes(scratch[16_380..].try_into().expect("u32 is 4 bytes"));
674 |         let computed_crc = crc32c(&scratch[..16_380]);
675 |         if crc32 != computed_crc {
676 |             if scratch.iter().all(|b| b == &0) {
677 |                 return Ok(Self {
678 |                     transaction_id: TransactionId::default(),
679 |                     grains,
680 |                     crc32: 0,
681 |                 });
682 |             }
683 | 
684 |             return Err(Error::ChecksumFailed);
685 |         }
686 | 
687 |         let transaction_id = TransactionId(u64::from_be_bytes(
688 |             scratch[..8].try_into().expect("u64 is 8 bytes"),
689 |         ));
690 | 
691 |         grains.copy_from_slice(&scratch[8..16_372 + 8]);
692 | 
693 |         Ok(Self {
694 |             transaction_id,
695 |             grains,
696 |             crc32,
697 |         })
698 |     }
699 | 
700 |     fn write_to<W: Write>(&mut self, writer: W) -> Result<()> {
701 |         let mut writer = ChecksumWriter::new(BufWriter::new(writer));
702 |         writer.write_all(&self.transaction_id.to_be_bytes())?;
703 |         writer.write_all(&self.grains)?;
704 |         self.crc32 = writer.crc32();
705 |         writer.write_all(&self.crc32.to_be_bytes())?;
706 | 
707 |         writer.flush()?;
708 | 
709 |         Ok(())
710 |     }
711 | }
712 | 
713 | impl Default for StratumHeader {
714 |     fn default() -> Self {
715 |         Self {
716 |             transaction_id: Default::default(),
717 |             grains: [0; 16372],
718 |             crc32: Default::default(),
719 |         }
720 |     }
721 | }
722 | 
723 | #[derive(Clone, Copy, Debug, Eq, PartialEq, Default)]
724 | #[repr(transparent)]
725 | pub struct GrainAllocationInfo(pub u8);
726 | 
727 | impl GrainAllocationInfo {
728 |     pub const fn allocated(count: u8) -> Self {
729 |         assert!(count < 64);
730 |         Self((1 << 6) | count)
731 |     }
732 | 
733 |     pub const fn archived(count: u8) -> Self {
734 |         assert!(count < 64);
735 |         Self((2 << 6) | count)
736 |     }
737 | 
738 |     pub fn status(self) -> Option<GrainAllocationStatus> {
739 |         match self.0 >> 6 {
740 |             0 => Some(GrainAllocationStatus::Free),
741 |             1 => Some(GrainAllocationStatus::Allocated),
742 |             2 => Some(GrainAllocationStatus::Archived),
743 |             _ => None,
744 |         }
745 |     }
746 | 
747 |     pub fn count(self) -> u8 {
748 |         self.0 & 0b0011_1111
749 |     }
750 | }
751 | 
752 | #[derive(Debug, Ord, PartialOrd, Eq, PartialEq, Clone, Copy)]
753 | pub enum GrainAllocationStatus {
754 |     Allocated,
755 |     Archived,
756 |     Free,
757 | }
758 | 
759 | pub trait ByteUtil {
760 |     fn to_be_bytes(&self) -> [u8; 8];
761 |     fn from_be_bytes(bytes: [u8; 8]) -> Self;
762 | }
763 | 
764 | macro_rules! impl_bytes_for {
765 |     ($type:ident) => {
766 |         impl ByteUtil for $type {
767 |             fn to_be_bytes(&self) -> [u8; 8] {
768 |                 self.0.to_be_bytes()
769 |             }
770 | 
771 |             fn from_be_bytes(bytes: [u8; 8]) -> Self {
772 |                 Self(u64::from_be_bytes(bytes))
773 |             }
774 |         }
775 |     };
776 | }
777 | 
778 | impl_bytes_for!(GrainId);
779 | 
780 | impl ByteUtil for Option<GrainId> {
781 |     fn to_be_bytes(&self) -> [u8; 8] {
782 |         self.unwrap_or(GrainId::NONE).to_be_bytes()
783 |     }
784 | 
785 |     fn from_be_bytes(bytes: [u8; 8]) -> Self {
786 |         let id = GrainId::from_be_bytes(bytes);
787 |         if id != GrainId::NONE {
788 |             Some(id)
789 |         } else {
790 |             None
791 |         }
792 |     }
793 | }
794 | 
795 | pub struct ChecksumWriter<W> {
796 |     writer: W,
797 |     crc32: u32,
798 | }
799 | 
800 | impl<W> ChecksumWriter<W>
801 | where
802 |     W: Write,
803 | {
804 |     pub fn new(writer: W) -> Self {
805 |         Self { writer, crc32: 0 }
806 |     }
807 | 
808 |     pub fn crc32(&self) -> u32 {
809 |         self.crc32
810 |     }
811 | 
812 |     pub fn write_crc32_and_finish(mut self) -> Result<(W, u32)> {
813 |         // Capture the checksum before writing it; writing through `self`
814 |         // would otherwise fold the checksum bytes into the running crc and
815 |         // return a value that doesn't match what was written.
816 |         let crc32 = self.crc32;
817 |         self.writer.write_all(&crc32.to_be_bytes())?;
818 |         Ok((self.writer, crc32))
819 |     }
820 | }
821 | 
822 | impl<W> Write for ChecksumWriter<W>
823 | where
824 |     W: Write,
825 | {
826 |     fn write(&mut self, buf: &[u8]) -> std::io::Result<usize> {
827 |         let bytes_written = self.writer.write(buf)?;
828 |         if bytes_written > 0 {
829 |             self.crc32 = crc32c::crc32c_append(self.crc32, &buf[..bytes_written]);
830 |         }
831 |         Ok(bytes_written)
832 |     }
833 | 
834 |     fn flush(&mut self) -> std::io::Result<()> {
835 |         self.writer.flush()
836 |     }
837 | }
838 | 
839 | #[derive(Debug)]
840 | pub struct Stored<T> {
841 |     pub grain_id: GrainId,
842 |     pub stored: T,
843 | }
844 | 
845 | impl<T> Deref for Stored<T> {
846 |     type Target = T;
847 | 
848 |     fn deref(&self) -> &Self::Target {
849 |         &self.stored
850 |     }
851 | }
852 | 
853 | impl<T> DerefMut for Stored<T> {
854 |     fn deref_mut(&mut self) -> &mut Self::Target {
855 |         &mut self.stored
856 |     }
857 | }
--------------------------------------------------------------------------------