├── .rustme ├── header.md ├── config.ron └── docs.md ├── .github ├── FUNDING.yml └── workflows │ ├── rust.yml │ ├── audit.yml │ ├── docs.yml │ ├── benches.yml │ └── coverage.yml ├── .cargo └── config ├── .gitignore ├── rustfmt.toml ├── xtask ├── Cargo.toml └── src │ └── main.rs ├── benchmarks ├── Cargo.toml └── benches │ └── benchmarks.rs ├── deny.toml ├── src ├── to_io_result.rs ├── manager.rs ├── config.rs ├── buffered.rs ├── entry.rs ├── .crate-docs.md ├── tests.rs ├── log_file.rs └── lib.rs ├── Cargo.toml ├── LICENSE-MIT ├── CHANGELOG.md ├── CONTRIBUTING.md ├── examples └── basic.rs ├── CODE_OF_CONDUCT.md ├── README.md └── LICENSE-APACHE /.rustme/header.md: -------------------------------------------------------------------------------- 1 | # OkayWAL 2 | -------------------------------------------------------------------------------- /.github/FUNDING.yml: -------------------------------------------------------------------------------- 1 | github: [ecton] -------------------------------------------------------------------------------- /.cargo/config: -------------------------------------------------------------------------------- 1 | [alias] 2 | xtask = "run --package xtask --" -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | /target 2 | /Cargo.lock 3 | perf.data* 4 | .tmp* -------------------------------------------------------------------------------- /rustfmt.toml: -------------------------------------------------------------------------------- 1 | unstable_features = true 2 | imports_granularity = "Crate" 3 | group_imports = "StdExternalCrate" 4 | format_code_in_doc_comments = true 5 | -------------------------------------------------------------------------------- /xtask/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "xtask" 3 | version = "0.0.0" 4 | edition = 
"2021" 5 | publish = false 6 | 7 | [dependencies] 8 | khonsu-tools = { git = "https://github.com/khonsulabs/khonsu-tools.git", branch = "main" } 9 | -------------------------------------------------------------------------------- /.github/workflows/rust.yml: -------------------------------------------------------------------------------- 1 | name: Tests 2 | 3 | on: [push] 4 | 5 | jobs: 6 | test: 7 | runs-on: ubuntu-latest 8 | timeout-minutes: 30 9 | steps: 10 | - uses: actions/checkout@v2 11 | 12 | - name: Install Rust 13 | uses: hecrj/setup-rust-action@v1 14 | 15 | - name: Run clippy 16 | run: | 17 | cargo clippy 18 | 19 | - name: Run unit tests 20 | run: | 21 | cargo test --all-features --all-targets 22 | -------------------------------------------------------------------------------- /xtask/src/main.rs: -------------------------------------------------------------------------------- 1 | use khonsu_tools::{ 2 | universal::{anyhow, clap::Parser, DefaultConfig}, 3 | Commands, 4 | }; 5 | 6 | fn main() -> anyhow::Result<()> { 7 | Commands::parse().execute::() 8 | } 9 | 10 | enum Config {} 11 | 12 | impl khonsu_tools::Config for Config { 13 | type Publish = Self; 14 | type Universal = DefaultConfig; 15 | } 16 | 17 | impl khonsu_tools::publish::Config for Config { 18 | fn paths() -> Vec { 19 | vec![String::from(".")] 20 | } 21 | } 22 | -------------------------------------------------------------------------------- /benchmarks/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "benchmarks" 3 | version = "0.0.0" 4 | edition = "2021" 5 | publish = false 6 | 7 | [features] 8 | sqlite = ["dep:rusqlite"] 9 | 10 | [dependencies] 11 | tempfile = "3.3.0" 12 | okaywal = { path = "../" } 13 | timings = { git = "https://github.com/khonsulabs/timings", branch = "main" } 14 | bytesize = "1.1.0" 15 | sharded-log = { version = "0.0.1", optional = true } 16 | postgres = { version = "0.19.4", optional = true } 17 | 
rusqlite = { version = "0.28.0", optional = true } 18 | 19 | [[bench]] 20 | name = "benchmarks" 21 | harness = false 22 | -------------------------------------------------------------------------------- /deny.toml: -------------------------------------------------------------------------------- 1 | targets = [] 2 | 3 | [advisories] 4 | db-path = "~/.cargo/advisory-db" 5 | db-urls = ["https://github.com/rustsec/advisory-db"] 6 | vulnerability = "deny" 7 | unmaintained = "warn" 8 | yanked = "warn" 9 | notice = "warn" 10 | ignore = [] 11 | 12 | [licenses] 13 | unlicensed = "deny" 14 | allow = [] 15 | deny = [] 16 | copyleft = "deny" 17 | allow-osi-fsf-free = "either" 18 | default = "deny" 19 | confidence-threshold = 0.8 20 | exceptions = [] 21 | 22 | [licenses.private] 23 | ignore = true 24 | 25 | [bans] 26 | multiple-versions = "warn" 27 | wildcards = "allow" 28 | highlight = "all" 29 | -------------------------------------------------------------------------------- /.github/workflows/audit.yml: -------------------------------------------------------------------------------- 1 | name: Audit 2 | 3 | on: [push, pull_request] 4 | 5 | jobs: 6 | audit: 7 | runs-on: ubuntu-latest 8 | 9 | steps: 10 | - name: Install Rust 11 | uses: hecrj/setup-rust-action@v1 12 | - name: Cache 13 | uses: actions/cache@v2 14 | with: 15 | path: | 16 | ~/.cargo/.crates.toml 17 | ~/.cargo/.crates2.json 18 | ~/.cargo/bin/cargo-deny 19 | key: cargo-deny 20 | 21 | - name: Install cargo-deny 22 | run: cargo -v install cargo-deny 23 | 24 | - name: Checkout 25 | uses: actions/checkout@v2 26 | 27 | - name: Audit 28 | run: | 29 | cargo xtask audit 30 | -------------------------------------------------------------------------------- /.github/workflows/docs.yml: -------------------------------------------------------------------------------- 1 | name: Docs 2 | 3 | on: [push] 4 | 5 | jobs: 6 | docs: 7 | runs-on: ubuntu-latest 8 | if: github.ref == 'refs/heads/main' 9 | steps: 10 | - name: Install Rust 11 
| uses: hecrj/setup-rust-action@v1 12 | 13 | - uses: actions/checkout@v2 14 | - name: Generate Docs 15 | run: | 16 | cargo doc --no-deps --all-features 17 | 18 | - name: Deploy Docs 19 | uses: JamesIves/github-pages-deploy-action@releases/v4 20 | with: 21 | branch: gh-pages 22 | folder: target/doc/ 23 | git-config-name: kl-botsu 24 | git-config-email: botsu@khonsulabs.com 25 | target-folder: /main/ 26 | clean: true 27 | -------------------------------------------------------------------------------- /src/to_io_result.rs: -------------------------------------------------------------------------------- 1 | use std::{io, num::TryFromIntError}; 2 | 3 | pub trait ToIoResult { 4 | fn to_io(self) -> io::Result; 5 | } 6 | 7 | impl ToIoResult for std::result::Result { 8 | fn to_io(self) -> io::Result { 9 | match self { 10 | Ok(result) => Ok(result), 11 | Err(err) => Err(io::Error::new(io::ErrorKind::Other, err)), 12 | } 13 | } 14 | } 15 | 16 | impl ToIoResult for Result> { 17 | fn to_io(self) -> io::Result { 18 | match self { 19 | Ok(value) => Ok(value), 20 | Err(_) => Err(io::Error::new( 21 | io::ErrorKind::BrokenPipe, 22 | "sender disconnected", 23 | )), 24 | } 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "okaywal" 3 | version = "0.3.1" 4 | license = "MIT OR Apache-2.0" 5 | edition = "2021" 6 | description = "An okay Write-Ahead Log implementation" 7 | repository = "https://github.com/khonsulabs/okaywal" 8 | keywords = ["write-ahead-log", "write-ahead-logging", "wal"] 9 | categories = ["database"] 10 | readme = "./README.md" 11 | rust-version = "1.58" 12 | 13 | [dependencies] 14 | parking_lot = "0.12.1" 15 | crc32c = "0.6.3" 16 | flume = "0.11.0" 17 | tracing = { version = "0.1.36", optional = true } 18 | file-manager = { git = "https://github.com/khonsulabs/file-manager", branch = "main" } 19 | 20 | 
[dev-dependencies] 21 | tempfile = "3.3.0" 22 | fastrand = "2.0.1" 23 | 24 | [workspace] 25 | members = ["benchmarks", "xtask"] 26 | 27 | [profile.bench] 28 | debug = true 29 | [profile.release] 30 | debug = true 31 | -------------------------------------------------------------------------------- /.github/workflows/benches.yml: -------------------------------------------------------------------------------- 1 | name: Benchmarks 2 | 3 | on: [push] 4 | 5 | jobs: 6 | benchmark: 7 | services: 8 | postgres: 9 | image: postgres 10 | env: 11 | POSTGRES_DB: bench 12 | POSTGRES_USER: bencher 13 | POSTGRES_PASSWORD: password 14 | options: >- 15 | --health-cmd pg_isready 16 | --health-interval 10s 17 | --health-timeout 5s 18 | --health-retries 5 19 | ports: 20 | - 5432:5432 21 | 22 | runs-on: ubuntu-latest 23 | timeout-minutes: 60 24 | steps: 25 | - uses: actions/checkout@v2 26 | 27 | - name: Install Rust 28 | uses: hecrj/setup-rust-action@v1 29 | 30 | - name: Build benchmarks 31 | run: | 32 | cargo bench -p benchmarks --all-features --no-run 33 | 34 | - name: Run benchmarks 35 | run: | 36 | cargo bench -p benchmarks --all-features 37 | -------------------------------------------------------------------------------- /LICENSE-MIT: -------------------------------------------------------------------------------- 1 | Copyright 2021 Khonsu Labs LLC 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 4 | 5 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 
6 | 7 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 8 | 9 | -------------------------------------------------------------------------------- /.github/workflows/coverage.yml: -------------------------------------------------------------------------------- 1 | name: Coverage 2 | 3 | on: [push] 4 | 5 | jobs: 6 | coverage: 7 | services: 8 | postgres: 9 | image: postgres 10 | env: 11 | POSTGRES_DB: bench 12 | POSTGRES_USER: bencher 13 | POSTGRES_PASSWORD: password 14 | options: >- 15 | --health-cmd pg_isready 16 | --health-interval 10s 17 | --health-timeout 5s 18 | --health-retries 5 19 | ports: 20 | - 5432:5432 21 | runs-on: ubuntu-latest 22 | timeout-minutes: 30 23 | steps: 24 | - uses: actions/checkout@v2 25 | 26 | - name: Install Rust 27 | uses: hecrj/setup-rust-action@v1 28 | 29 | - name: Run code coverage 30 | run: | 31 | cargo xtask generate-code-coverage-report --install-dependencies 32 | 33 | - name: Deploy Docs 34 | if: github.ref == 'refs/heads/main' 35 | uses: JamesIves/github-pages-deploy-action@releases/v4 36 | with: 37 | branch: gh-pages 38 | folder: coverage/ 39 | git-config-name: kl-botsu 40 | git-config-email: botsu@khonsulabs.com 41 | target-folder: /coverage/ 42 | clean: true 43 | -------------------------------------------------------------------------------- /.rustme/config.ron: -------------------------------------------------------------------------------- 1 | Configuration( 2 | files: { 3 | "../README.md": [ 4 | "header.md", 5 | "docs.md", 6 | "https://github.com/khonsulabs/.github/raw/main/snippets/readme-footer.md", 7 | ], 8 | 
"../src/.crate-docs.md": ( 9 | for_docs: true, 10 | sections: [ 11 | "docs.md", 12 | ] 13 | ), 14 | "../CONTRIBUTING.md": [ 15 | "https://github.com/khonsulabs/.github/raw/main/docs/CONTRIBUTING.md", 16 | ], 17 | "../CODE_OF_CONDUCT.md": [ 18 | "https://github.com/khonsulabs/.github/raw/main/docs/CODE_OF_CONDUCT.md", 19 | ], 20 | "../LICENSE-APACHE": [ 21 | "https://github.com/khonsulabs/.github/raw/main/licenses/LICENSE-APACHE", 22 | ], 23 | "../LICENSE-MIT": [ 24 | "https://github.com/khonsulabs/.github/raw/main/licenses/LICENSE-MIT", 25 | ], 26 | }, 27 | glossaries: [ 28 | "https://github.com/khonsulabs/.github/raw/main/snippets/glossary.ron", 29 | { 30 | "docs-base": ( 31 | default: "https://khonsulabs.github.io/okaywal/main/okaywal", 32 | release: "https://docs.rs/okaywal", 33 | ), 34 | "src-base": ( 35 | default: "https://github.com/khonsulabs/okaywal/blob/main", 36 | release: "https://github.com/khonsulabs/okaywal/blob/v0.3.0", 37 | ), 38 | "logmanager-trait": ( 39 | default: "https://khonsulabs.github.io/okaywal/main/okaywal/trait.LogManager.html", 40 | release: "https://docs.rs/okaywal/*/okaywal/trait.LogManager.html", 41 | for_docs: "LogManager", 42 | ), 43 | "wal-recover": ( 44 | default: "https://khonsulabs.github.io/okaywal/main/okaywal/struct.WriteAheadLog.html#method.recover", 45 | release: "https://docs.rs/okaywal/*/okaywal/struct.WriteAheadLog.html#method.recover", 46 | for_docs: "WriteAheadLog::recover", 47 | ), 48 | } 49 | ], 50 | ) -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Changelog 2 | 3 | All notable changes to this project will be documented in this file. 4 | 5 | The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), 6 | and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). 
7 | 8 | ## v0.3.1 9 | 10 | ### Fixed 11 | 12 | - Rolling back the first entry of a log file caused its data to become skipped 13 | during recovery due to a small logic bug that rewrote the file's header if no 14 | entries had been fsynced. Thanks to @losfair for the succinct test case in 15 | issue [#14][14]. 16 | 17 | [14]: https://github.com/khonsulabs/okaywal/issues/14 18 | 19 | ## v0.3.0 20 | 21 | ### Added 22 | 23 | - `WriteAheadLog::checkpoint_active()` is a new function that checkpoints the 24 | currently stored data, regardless of whether the configured thresholds are 25 | met. This function returns after the active file has been rotated and the 26 | checkpointing thread has been notified of the file to checkpoint. Thanks to 27 | @blakesmith for implementing this in [#11][11] 28 | 29 | [11]: https://github.com/khonsulabs/okaywal/pull/11 30 | 31 | ## v0.2.0 32 | 33 | ### Breaking Changes 34 | 35 | - `LogManager::checkpoint_to` now has an additional parameter, `wal: 36 | &WriteAheadLog`. This is provided for convenience because it may be necessary 37 | to randomly access information in the WAL while performing a checkpointing 38 | operation. 39 | 40 | ### Added 41 | 42 | - `LogPosition::serialize_to`/`LogPosition::deserialize_from` provide methods 43 | for reading and writing a `LogPosition` from an arbitrary `Write`/`Read` 44 | implementor (respectively). This uses a fixed-length serialization with a 45 | length of `LogPosition::SERIALIZED_LENGTH` -- 16 bytes. 46 | 47 | ### Changed 48 | 49 | - `WriteAheadLog::shutdown()` now no longer requires all instances of 50 | `WriteAheadLog` to be dropped to succeed. 51 | 52 | ### Fixed 53 | 54 | - When the WAL recycles a segment file, the `LogPosition` returned is now 55 | correct. Previously, returned `LogPosition`s would contain the segment file's 56 | old id, causing those positions to be unreadable. 
57 | - When reading from a `LogPosition`, if the data has not been flushed or 58 | synchronized to disk yet, the read will be blocked until the sync operation 59 | finishes. 60 | 61 | ## v0.1.0 62 | 63 | - Initial public preview release. No stability guarantees are being made at this 64 | stage. Feedback is welcome. 65 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing to our projects 2 | 3 | Thank you for your interest in contributing to one of our projects. We want 4 | everyone to have a positive experience contributing, so please carefully review 5 | our only requirements for contributing: 6 | 7 | - All contributors must agree to [our Contributor License 8 | Agreement](https://gist.github.com/ecton/b2e1e72abfa122da5e69ed30164f739e). 9 | This will be asked for during your first pull request. 10 | - All contributors must uphold the standards of our [Code of 11 | Conduct](./CODE_OF_CONDUCT.md). 12 | 13 | The rest of this document contains recommendations/guidelines to help consistency and 14 | communication within our projects. 15 | 16 | ## Creating Issues 17 | 18 | ### Reporting Bugs 19 | 20 | To us, if something isn't behaving as you expect it to, that's a bug. Even if 21 | it's misbehaving due to a misunderstanding, that means there's an opportunity to 22 | improve our documentation or examples. Please don't hesitate to let us know if 23 | you run into any issues while working with one of our projects. 24 | 25 | ### Requesting New Features 26 | 27 | When requesting new features, please include details about what problem you're 28 | trying to solve, not just a solution to your problem. By helping the community 29 | understand the underlying problem, we can better evaluate what the best solution 30 | to the problem might be. 31 | 32 | ## Contributing Changes 33 | 34 | We openly welcome pull requests on our projects.
We don't like bugs, and if 35 | you've found one and wish to submit a fix, we greatly appreciate it. 36 | 37 | If you find that fixing a bug requires a significant change, or you are wanting 38 | to add a somewhat large feature, please submit a proposal as an issue first. We 39 | want to make sure that your efforts have the highest chance of success, and a 40 | short discussion before starting can go a long way towards a pull request being 41 | merged with fewer revisions. 42 | 43 | When working on an existing issue, update the issue to reflect that you're 44 | working on it. This will help prevent duplicated efforts. 45 | 46 | If you begin working on something but need some assistance, don't hesitate to 47 | reach out inside of the issue, on [our 48 | forums](https://community.khonsulabs.com/), or [our 49 | Discord](https://discord.khonsulabs.com/). We will do our best to help you. 50 | 51 | ### Project-specific requirements 52 | 53 | Be sure to check if a project's README contains additional contributing 54 | guidelines. Each project may have different tools and commands that should be 55 | run to validate that changes pass all requirements. 56 | -------------------------------------------------------------------------------- /src/manager.rs: -------------------------------------------------------------------------------- 1 | use std::{fmt::Debug, io}; 2 | 3 | use file_manager::{fs::StdFileManager, FileManager}; 4 | 5 | use crate::{ 6 | entry::EntryId, 7 | log_file::{Entry, RecoveredSegment, SegmentReader}, 8 | WriteAheadLog, 9 | }; 10 | 11 | /// Customizes recovery and checkpointing behavior for a 12 | /// [`WriteAheadLog`](crate::WriteAheadLog). 13 | pub trait LogManager: Send + Sync + Debug + 'static 14 | where 15 | M: FileManager, 16 | { 17 | /// When recovering a [`WriteAheadLog`](crate::WriteAheadLog), this function 18 | /// is called for each segment as it is read. To allow the segment to have 19 | /// its data recovered, return [`Recovery::Recover`].
If you wish to abandon 20 | /// the data contained in the segment, return [`Recovery::Abandon`]. 21 | fn should_recover_segment(&mut self, _segment: &RecoveredSegment) -> io::Result { 22 | Ok(Recovery::Recover) 23 | } 24 | 25 | /// Invoked once for each entry contained in all recovered segments within a 26 | /// [`WriteAheadLog`](crate::WriteAheadLog). 27 | /// 28 | /// [`Entry::read_chunk()`] can be used to read each chunk of data that was 29 | /// written via 30 | /// [`EntryWriter::write_chunk`](crate::EntryWriter::write_chunk). The order 31 | /// of chunks is guaranteed to be the same as the order they were written 32 | /// in. 33 | fn recover(&mut self, entry: &mut Entry<'_, M::File>) -> io::Result<()>; 34 | 35 | /// Invoked each time the [`WriteAheadLog`](crate::WriteAheadLog) is ready 36 | /// to recycle and reuse segment files. 37 | /// 38 | /// `last_checkpointed_id` is the id of the last entry that is being 39 | /// checkpointed and removed from the log. If needed, 40 | /// `checkpointed_entries` can be used to iterate over all entries that are 41 | /// being checkpointed. 42 | /// 43 | /// Shortly after this function returns, the entries stored within the file 44 | /// being checkpointed will no longer be accessible. To ensure ACID 45 | /// compliance of the underlying storage layer, all necessary changes must 46 | /// be fully synchronized to the underlying storage medium before this 47 | /// function returns. 48 | fn checkpoint_to( 49 | &mut self, 50 | last_checkpointed_id: EntryId, 51 | checkpointed_entries: &mut SegmentReader, 52 | wal: &WriteAheadLog, 53 | ) -> io::Result<()>; 54 | } 55 | 56 | /// Determines whether to recover a segment or not. 57 | pub enum Recovery { 58 | /// Recover the segment. 59 | Recover, 60 | /// Abandon the segment and any entries stored within it. **Warning: This 61 | /// means losing data that was previously written to the log.
This should 62 | /// rarely, if ever, be done.** 63 | Abandon, 64 | } 65 | 66 | /// A [`LogManager`] that does not attempt to recover any existing data. 67 | #[derive(Debug)] 68 | pub struct LogVoid; 69 | 70 | impl LogManager for LogVoid 71 | where 72 | M: file_manager::FileManager, 73 | { 74 | fn should_recover_segment(&mut self, _segment: &RecoveredSegment) -> io::Result { 75 | Ok(Recovery::Abandon) 76 | } 77 | 78 | fn recover(&mut self, _entry: &mut Entry<'_, M::File>) -> io::Result<()> { 79 | Ok(()) 80 | } 81 | 82 | fn checkpoint_to( 83 | &mut self, 84 | _last_checkpointed_id: EntryId, 85 | _reader: &mut SegmentReader, 86 | _wal: &WriteAheadLog, 87 | ) -> io::Result<()> { 88 | Ok(()) 89 | } 90 | } 91 | -------------------------------------------------------------------------------- /examples/basic.rs: -------------------------------------------------------------------------------- 1 | use std::io::{self, Read}; 2 | 3 | use okaywal::{Entry, EntryId, LogManager, SegmentReader, WriteAheadLog}; 4 | 5 | fn main() -> io::Result<()> { 6 | // begin rustme snippet: readme-example 7 | // Open a log using a Checkpointer that echoes the information passed into each 8 | // function that the Checkpointer trait defines. 9 | let log = WriteAheadLog::recover("my-log", LoggingCheckpointer)?; 10 | 11 | // Begin writing an entry to the log. 12 | let mut writer = log.begin_entry()?; 13 | 14 | // Each entry is one or more chunks of data. Each chunk can be individually 15 | // addressed using its LogPosition. 16 | let record = writer.write_chunk("this is the first entry".as_bytes())?; 17 | 18 | // To fully flush all written bytes to disk and make the new entry 19 | // resilient to a crash, the writer must be committed. 20 | writer.commit()?; 21 | // end rustme snippet 22 | 23 | // Let's reopen the log. During this process, 24 | // LoggingCheckpointer::should_recover_segment will be invoked for each segment 25 | // file that has not been checkpointed yet. 
In this example, it will be called 26 | // once. Once the Checkpointer confirms the data should be recovered, 27 | // LoggingCheckpointer::recover will be invoked once for each entry in the WAL 28 | // that hasn't been previously checkpointed. 29 | drop(log); 30 | let log = WriteAheadLog::recover("my-log", LoggingCheckpointer)?; 31 | 32 | // We can use the previously returned DataRecord to read the original data. 33 | let mut reader = log.read_at(record.position)?; 34 | let mut buffer = vec![0; usize::try_from(record.length).unwrap()]; 35 | reader.read_exact(&mut buffer)?; 36 | println!( 37 | "Data read from log: {}", 38 | String::from_utf8(buffer).expect("invalid utf-8") 39 | ); 40 | 41 | // Cleanup 42 | drop(reader); 43 | drop(log); 44 | std::fs::remove_dir_all("my-log")?; 45 | 46 | Ok(()) 47 | } 48 | 49 | #[derive(Debug)] 50 | struct LoggingCheckpointer; 51 | 52 | impl LogManager for LoggingCheckpointer { 53 | fn recover(&mut self, entry: &mut Entry<'_>) -> io::Result<()> { 54 | // This example uses read_all_chunks to load the entire entry into 55 | // memory for simplicity. The crate also supports reading each chunk 56 | // individually to minimize memory usage. 57 | if let Some(all_chunks) = entry.read_all_chunks()? { 58 | // Convert the Vec's to Strings. 59 | let all_chunks = all_chunks 60 | .into_iter() 61 | .map(String::from_utf8) 62 | .collect::, _>>() 63 | .expect("invalid utf-8"); 64 | println!( 65 | "LoggingCheckpointer::recover(entry_id: {:?}, data: {:?})", 66 | entry.id(), 67 | all_chunks, 68 | ); 69 | } else { 70 | // This entry wasn't completely written. This could happen if a 71 | // power outage or crash occurs while writing an entry. 72 | } 73 | 74 | Ok(()) 75 | } 76 | 77 | fn checkpoint_to( 78 | &mut self, 79 | last_checkpointed_id: EntryId, 80 | _checkpointed_entries: &mut SegmentReader, 81 | _wal: &WriteAheadLog, 82 | ) -> io::Result<()> { 83 | // checkpoint_to is called once enough data has been written to the 84 | // WriteAheadLog. 
After this function returns, the log will recycle the 85 | // file containing the entries being checkpointed. 86 | // 87 | // This function is where the entries must be persisted to the storage 88 | // layer the WriteAheadLog is sitting in front of. To ensure ACID 89 | // compliance of the combination of the WAL and the storage layer, the 90 | // storage layer must be fully resilient to losing any changes made by 91 | // the checkpointed entries before this function returns. 92 | println!("LoggingCheckpointer::checkpoint_to({last_checkpointed_id:?}"); 93 | Ok(()) 94 | } 95 | } 96 | 97 | #[test] 98 | fn test() -> io::Result<()> { 99 | // Clean up any previous runs of this example. 100 | let path = std::path::Path::new("my-log"); 101 | if path.exists() { 102 | std::fs::remove_dir_all("my-log")?; 103 | } 104 | 105 | main() 106 | } 107 | -------------------------------------------------------------------------------- /src/config.rs: -------------------------------------------------------------------------------- 1 | use std::{io, ops::Mul, path::Path, sync::Arc}; 2 | 3 | use file_manager::{fs::StdFileManager, FileManager, PathId}; 4 | 5 | use crate::{LogManager, WriteAheadLog}; 6 | 7 | /// A [`WriteAheadLog`] configuration. 8 | #[derive(Debug, Clone)] 9 | #[must_use] 10 | pub struct Configuration { 11 | /// The file manager to use for storing data. 12 | /// 13 | /// Typically this is [`StdFileManager`]. 14 | pub file_manager: M, 15 | /// The directory to store the log files in. 16 | pub directory: PathId, 17 | /// The number of bytes each log file should be preallocated with. Log files 18 | /// may grow to be larger than this size if needed. 19 | pub preallocate_bytes: u32, 20 | /// After this many bytes have been written to the active log file, begin a 21 | /// checkpointing process. This number should be less than 22 | /// `preallocate_bytes` to try to ensure the checkpointing process happens 23 | /// before the preallocated space is fully exhausted.
If this amount is too 24 | /// close to the preallocation amount, an entry being written may need to 25 | /// extend the file which is a slow operation. 26 | pub checkpoint_after_bytes: u64, 27 | /// The number of bytes to use for the in-memory buffer when reading and 28 | /// writing from the log. 29 | pub buffer_bytes: usize, 30 | /// An arbitrary chunk of bytes that is stored in the log files. Limited to 31 | /// 255 bytes. This can be used for any purpose, but the design inspiration 32 | /// was to allow detection of what format or version of a format the data 33 | /// was inside of the log without needing to parse the entries. 34 | pub version_info: Arc>, 35 | } 36 | 37 | impl Default for Configuration { 38 | fn default() -> Self { 39 | Self::default_for("okaywal") 40 | } 41 | } 42 | 43 | impl Configuration { 44 | /// Returns the default configuration for a given directory. 45 | /// 46 | /// This currently is: 47 | /// 48 | /// - `preallocate_bytes`: 1 megabyte 49 | /// - `checkpoint_after_bytes`: 768 kilobytes 50 | /// - `buffer_bytes`: 16 kilobytes 51 | pub fn default_for>(path: P) -> Self { 52 | Self::default_with_manager(path, StdFileManager::default()) 53 | } 54 | } 55 | 56 | impl Configuration 57 | where 58 | M: FileManager, 59 | { 60 | /// Returns the default configuration for a given directory and file manager. 61 | /// 62 | /// This currently is: 63 | /// 64 | /// - `preallocate_bytes`: 1 megabyte 65 | /// - `checkpoint_after_bytes`: 768 kilobytes 66 | /// - `buffer_bytes`: 16 kilobytes 67 | pub fn default_with_manager>(path: P, file_manager: M) -> Self { 68 | Self { 69 | file_manager, 70 | directory: PathId::from(path.as_ref()), 71 | preallocate_bytes: megabytes(1), 72 | checkpoint_after_bytes: kilobytes(768), 73 | buffer_bytes: kilobytes(16), 74 | version_info: Arc::default(), 75 | } 76 | } 77 | /// Sets the number of bytes to preallocate for each segment file. Returns `self`. 
78 | /// 79 | /// Preallocating disk space allows for more consistent performance. This 80 | /// number should be large enough to allow batching multiple entries into 81 | /// one checkpointing operation. 82 | pub fn preallocate_bytes(mut self, bytes: u32) -> Self { 83 | self.preallocate_bytes = bytes; 84 | self 85 | } 86 | 87 | /// Sets the number of bytes written required to begin a checkpoint 88 | /// operation. Returns `self`. 89 | /// 90 | /// This value should be smaller than `preallocate_bytes` to ensure 91 | /// checkpoint operations begin before too much data is written in a log 92 | /// entry. If more data is written before a checkpoint occurs, the segment 93 | /// will grow to accommodate the extra data, but that write will not be as 94 | /// fast due to needing to allocate more space from the filesystem to 95 | /// perform the write. 96 | pub fn checkpoint_after_bytes(mut self, bytes: u64) -> Self { 97 | self.checkpoint_after_bytes = bytes; 98 | self 99 | } 100 | 101 | /// Sets the number of bytes to use for internal buffers when reading and 102 | /// writing data to the log. Returns `self`. 103 | pub fn buffer_bytes(mut self, bytes: usize) -> Self { 104 | self.buffer_bytes = bytes; 105 | self 106 | } 107 | 108 | /// Opens the log using the provided log manager with this configuration. 
109 | pub fn open>(self, manager: Manager) -> io::Result> { 110 | WriteAheadLog::open(self, manager) 111 | } 112 | } 113 | 114 | fn megabytes + From>(megs: T) -> T { 115 | kilobytes(megs) * T::from(1024) 116 | } 117 | 118 | fn kilobytes + From>(bytes: T) -> T { 119 | bytes * T::from(1024) 120 | } 121 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Contributor Covenant Code of Conduct 2 | 3 | ## Our Pledge 4 | 5 | We as members, contributors, and leaders pledge to make participation in our 6 | community a harassment-free experience for everyone, regardless of age, body 7 | size, visible or invisible disability, ethnicity, sex characteristics, gender 8 | identity and expression, level of experience, education, socio-economic status, 9 | nationality, personal appearance, race, religion, or sexual identity 10 | and orientation. 11 | 12 | We pledge to act and interact in ways that contribute to an open, welcoming, 13 | diverse, inclusive, and healthy community. 
14 | 15 | ## Our Standards 16 | 17 | Examples of behavior that contributes to a positive environment for our 18 | community include: 19 | 20 | * Demonstrating empathy and kindness toward other people 21 | * Being respectful of differing opinions, viewpoints, and experiences 22 | * Giving and gracefully accepting constructive feedback 23 | * Accepting responsibility and apologizing to those affected by our mistakes, 24 | and learning from the experience 25 | * Focusing on what is best not just for us as individuals, but for the 26 | overall community 27 | 28 | Examples of unacceptable behavior include: 29 | 30 | * The use of sexualized language or imagery, and sexual attention or 31 | advances of any kind 32 | * Trolling, insulting or derogatory comments, and personal or political attacks 33 | * Public or private harassment 34 | * Publishing others' private information, such as a physical or email 35 | address, without their explicit permission 36 | * Other conduct which could reasonably be considered inappropriate in a 37 | professional setting 38 | 39 | ## Enforcement Responsibilities 40 | 41 | Community leaders are responsible for clarifying and enforcing our standards of 42 | acceptable behavior and will take appropriate and fair corrective action in 43 | response to any behavior that they deem inappropriate, threatening, offensive, 44 | or harmful. 45 | 46 | Community leaders have the right and responsibility to remove, edit, or reject 47 | comments, commits, code, wiki edits, issues, and other contributions that are 48 | not aligned to this Code of Conduct, and will communicate reasons for moderation 49 | decisions when appropriate. 50 | 51 | ## Scope 52 | 53 | This Code of Conduct applies within all community spaces, and also applies when 54 | an individual is officially representing the community in public spaces. 
55 | Examples of representing our community include using an official e-mail address, 56 | posting via an official social media account, or acting as an appointed 57 | representative at an online or offline event. 58 | 59 | ## Enforcement 60 | 61 | Instances of abusive, harassing, or otherwise unacceptable behavior may be 62 | reported to the community leaders responsible for enforcement at 63 | moderators@khonsulabs.com. 64 | All complaints will be reviewed and investigated promptly and fairly. 65 | 66 | All community leaders are obligated to respect the privacy and security of the 67 | reporter of any incident. 68 | 69 | ## Enforcement Guidelines 70 | 71 | Community leaders will follow these Community Impact Guidelines in determining 72 | the consequences for any action they deem in violation of this Code of Conduct: 73 | 74 | ### 1. Correction 75 | 76 | **Community Impact**: Use of inappropriate language or other behavior deemed 77 | unprofessional or unwelcome in the community. 78 | 79 | **Consequence**: A private, written warning from community leaders, providing 80 | clarity around the nature of the violation and an explanation of why the 81 | behavior was inappropriate. A public apology may be requested. 82 | 83 | ### 2. Warning 84 | 85 | **Community Impact**: A violation through a single incident or series 86 | of actions. 87 | 88 | **Consequence**: A warning with consequences for continued behavior. No 89 | interaction with the people involved, including unsolicited interaction with 90 | those enforcing the Code of Conduct, for a specified period of time. This 91 | includes avoiding interactions in community spaces as well as external channels 92 | like social media. Violating these terms may lead to a temporary or 93 | permanent ban. 94 | 95 | ### 3. Temporary Ban 96 | 97 | **Community Impact**: A serious violation of community standards, including 98 | sustained inappropriate behavior. 
99 | 100 | **Consequence**: A temporary ban from any sort of interaction or public 101 | communication with the community for a specified period of time. No public or 102 | private interaction with the people involved, including unsolicited interaction 103 | with those enforcing the Code of Conduct, is allowed during this period. 104 | Violating these terms may lead to a permanent ban. 105 | 106 | ### 4. Permanent Ban 107 | 108 | **Community Impact**: Demonstrating a pattern of violation of community 109 | standards, including sustained inappropriate behavior, harassment of an 110 | individual, or aggression toward or disparagement of classes of individuals. 111 | 112 | **Consequence**: A permanent ban from any sort of public interaction within 113 | the community. 114 | 115 | ## Attribution 116 | 117 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], 118 | version 2.0, available at 119 | https://www.contributor-covenant.org/version/2/0/code_of_conduct.html. 120 | 121 | Community Impact Guidelines were inspired by [Mozilla's code of conduct 122 | enforcement ladder](https://github.com/mozilla/diversity). 123 | 124 | [homepage]: https://www.contributor-covenant.org 125 | 126 | For answers to common questions about this code of conduct, see the FAQ at 127 | https://www.contributor-covenant.org/faq. Translations are available at 128 | https://www.contributor-covenant.org/translations. 
-------------------------------------------------------------------------------- /src/buffered.rs: -------------------------------------------------------------------------------- 1 | use std::io::{self, Seek, SeekFrom, Write}; 2 | 3 | use crate::to_io_result::ToIoResult; 4 | 5 | #[derive(Debug)] 6 | pub struct Buffered 7 | where 8 | F: Bufferable + Seek + Write, 9 | { 10 | buffer: Vec, 11 | position: u64, 12 | buffer_write_position: usize, 13 | length: u64, 14 | file: F, 15 | } 16 | 17 | impl Buffered 18 | where 19 | F: Bufferable + Seek + Write, 20 | { 21 | pub fn with_capacity(mut file: F, capacity: usize) -> io::Result { 22 | let length = file.len()?; 23 | let position = file.stream_position()?; 24 | Ok(Self { 25 | buffer: Vec::with_capacity(capacity), 26 | position, 27 | buffer_write_position: 0, 28 | length, 29 | file, 30 | }) 31 | } 32 | 33 | fn flush_buffer(&mut self) -> io::Result<()> { 34 | if !self.buffer.is_empty() { 35 | self.file.write_all(&self.buffer)?; 36 | let bytes_written = u64::try_from(self.buffer.len()).to_io()?; 37 | self.position += bytes_written; 38 | self.length = self.length.max(self.position); 39 | self.buffer_write_position = 0; 40 | self.buffer.clear(); 41 | } 42 | Ok(()) 43 | } 44 | 45 | pub fn position(&self) -> u64 { 46 | self.position + u64::try_from(self.buffer_write_position).expect("impossibly large buffer") 47 | } 48 | 49 | pub const fn buffer_position(&self) -> u64 { 50 | self.position 51 | } 52 | 53 | pub fn inner(&self) -> &F { 54 | &self.file 55 | } 56 | } 57 | 58 | impl Write for Buffered 59 | where 60 | F: Bufferable + Seek + Write, 61 | { 62 | fn write(&mut self, buf: &[u8]) -> io::Result { 63 | if self.buffer.capacity() == self.buffer_write_position { 64 | self.flush_buffer()?; 65 | } 66 | 67 | // If what we're writing is larger than our buffer, skip the buffer 68 | // entirely. 69 | if buf.len() > self.buffer.capacity() { 70 | // Ensure what we've buffered is already written. 
71 | self.flush_buffer()?; 72 | let bytes_written = self.file.write(buf)?; 73 | self.position += u64::try_from(bytes_written).to_io()?; 74 | return Ok(bytes_written); 75 | } 76 | 77 | let bytes_remaining = self.buffer.capacity() - self.buffer_write_position; 78 | let bytes_to_write = buf.len().min(bytes_remaining); 79 | if bytes_to_write > 0 { 80 | let bytes_to_copy = 81 | (self.buffer.len() - self.buffer_write_position).min(bytes_to_write); 82 | if bytes_to_copy > 0 { 83 | self.buffer[self.buffer_write_position..self.buffer_write_position + bytes_to_copy] 84 | .copy_from_slice(&buf[..bytes_to_copy]); 85 | } 86 | let bytes_to_extend = bytes_to_write - bytes_to_copy; 87 | if bytes_to_extend > 0 { 88 | self.buffer 89 | .extend_from_slice(&buf[bytes_to_copy..bytes_to_write]); 90 | } 91 | self.buffer_write_position += bytes_to_write; 92 | } 93 | 94 | Ok(bytes_to_write) 95 | } 96 | 97 | fn flush(&mut self) -> io::Result<()> { 98 | self.flush_buffer()?; 99 | self.file.flush()?; 100 | Ok(()) 101 | } 102 | } 103 | 104 | impl Seek for Buffered 105 | where 106 | F: Bufferable + Seek + Write, 107 | { 108 | fn seek(&mut self, pos: SeekFrom) -> io::Result { 109 | let buffer_write_position = u64::try_from(self.buffer_write_position).to_io()?; 110 | let new_position = match pos { 111 | SeekFrom::Start(position) => position, 112 | SeekFrom::End(offset) => { 113 | if let Ok(offset) = u64::try_from(offset) { 114 | offset + self.length 115 | } else { 116 | let offset = u64::try_from(-offset).unwrap(); 117 | self.length - offset 118 | } 119 | } 120 | SeekFrom::Current(offset) => { 121 | if let Ok(offset) = u64::try_from(offset) { 122 | self.position + buffer_write_position + offset 123 | } else { 124 | let absolute_offset = -offset; 125 | let offset = u64::try_from(absolute_offset).unwrap(); 126 | self.position + buffer_write_position - offset 127 | } 128 | } 129 | }; 130 | 131 | let buffer_len = u64::try_from(self.buffer.len()).unwrap(); 132 | let new_position_in_buffer = match 
new_position.checked_sub(self.position) { 133 | Some(position) if position < buffer_len => Some(position), 134 | _ => None, 135 | }; 136 | 137 | if let Some(new_position_in_buffer) = new_position_in_buffer { 138 | self.buffer_write_position = usize::try_from(new_position_in_buffer).to_io()?; 139 | } else { 140 | self.flush_buffer()?; 141 | self.file.seek(SeekFrom::Start(new_position))?; 142 | self.position = new_position; 143 | } 144 | 145 | Ok(new_position) 146 | } 147 | } 148 | 149 | pub trait Bufferable { 150 | fn len(&self) -> io::Result; 151 | fn set_len(&self, new_length: u64) -> io::Result<()>; 152 | } 153 | 154 | impl Bufferable for F 155 | where 156 | F: file_manager::File, 157 | { 158 | fn len(&self) -> io::Result { 159 | file_manager::File::len(self) 160 | } 161 | 162 | fn set_len(&self, new_length: u64) -> io::Result<()> { 163 | self.set_len(new_length) 164 | } 165 | } 166 | 167 | #[derive(Debug)] 168 | pub struct WriteBuffer { 169 | pub bytes: Vec, 170 | pub position: u64, 171 | } 172 | -------------------------------------------------------------------------------- /.rustme/docs.md: -------------------------------------------------------------------------------- 1 | A [write-ahead log (WAL)][wal] implementation for Rust. 2 | 3 | > There's The Great Wall, and then there's this: an okay WAL. 4 | 5 | **WARNING: This crate is early in development. Please do not use in any 6 | production projects until this has been incorporated into 7 | [Sediment](https://github.com/khonsulabs/sediment) and shipping as part of 8 | [Nebari](https://github.com/khonsulabs/nebari). 
The file format is currently 9 | considered unstable.** 10 | 11 | ![okaywal forbids unsafe code](https://img.shields.io/badge/unsafe-forbid-success) 12 | [![crate version](https://img.shields.io/crates/v/okaywal.svg)](https://crates.io/crates/okaywal) 13 | [![Live Build Status](https://img.shields.io/github/actions/workflow/status/khonsulabs/okaywal/rust.yml?branch=main)](https://github.com/khonsulabs/okaywal/actions?query=workflow:Tests) 14 | [![HTML Coverage Report for `main` branch](https://khonsulabs.github.io/okaywal/coverage/badge.svg)](https://khonsulabs.github.io/okaywal/coverage/) 15 | [![Documentation](https://img.shields.io/badge/docs-main-informational)]($docs-base$) 16 | 17 | This crate exposes a WAL that supports: 18 | 19 | - Atomic and Durable writes from multiple threads. 20 | - Random access for previously written data. 21 | - Automatic checkpointing to allow reusing disk space and 22 | preventing the WAL from growing too large. 23 | - Interactive recovery process with basic data versioning support. 24 | 25 | ## Basic How-To 26 | 27 | [`WriteAheadLog::recover()`]($wal-recover$) is used to create or recover a WAL 28 | in a given directory. To open a log, an implementer of 29 | [`LogManager`]($logmanager-trait$) must be provided. This trait is how 30 | OkayWAL communicates with your code when recovering or checkpointing a log. 31 | 32 | The [basic example][basic-example] shows this process with many comments 33 | describing how OkayWAL works. 34 | 35 | ```rust,ignore 36 | $../examples/basic.rs:readme-example$ 37 | ``` 38 | 39 | ## Multi-Threaded Writing 40 | 41 | Optimized writing to the log from multiple threads is handled automatically. 42 | Only one thread may access the active log file at any moment in time. Because 43 | the slowest part of writing data to disk is `fsync`, OkayWAL manages 44 | synchronizing multiple writers such that a single `fsync` call can be made for 45 | multiple writes. 
46 | 47 | This can be demonstrated by running the benchmark suite: `cargo bench -p 48 | benchmarks`: 49 | 50 | ### commit-256B 51 | 52 | | Label | avg | min | max | stddev | out% | 53 | |-------------|---------|---------|---------|---------|--------| 54 | | okaywal-01t | 1.001ms | 617.5us | 7.924ms | 557.3us | 0.016% | 55 | | okaywal-02t | 1.705ms | 617.3us | 11.38ms | 912.1us | 0.006% | 56 | | okaywal-04t | 1.681ms | 622.4us | 9.688ms | 671.4us | 0.021% | 57 | | okaywal-08t | 1.805ms | 656.5us | 13.88ms | 1.001ms | 0.014% | 58 | | okaywal-16t | 1.741ms | 643.2us | 7.895ms | 796.4us | 0.028% | 59 | 60 | ### commit-1KB 61 | 62 | | Label | avg | min | max | stddev | out% | 63 | |-------------|---------|---------|---------|---------|--------| 64 | | okaywal-01t | 959.3us | 621.9us | 7.419ms | 584.4us | 0.012% | 65 | | okaywal-02t | 1.569ms | 627.5us | 7.986ms | 1.007ms | 0.028% | 66 | | okaywal-04t | 1.856ms | 650.5us | 11.14ms | 1.087ms | 0.017% | 67 | | okaywal-08t | 2.054ms | 697.3us | 11.04ms | 1.066ms | 0.021% | 68 | | okaywal-16t | 1.875ms | 641.5us | 8.193ms | 674.6us | 0.032% | 69 | 70 | ### commit-4KB 71 | 72 | | Label | avg | min | max | stddev | out% | 73 | |-------------|---------|---------|---------|---------|--------| 74 | | okaywal-01t | 1.242ms | 748.8us | 6.902ms | 982.4us | 0.008% | 75 | | okaywal-02t | 1.767ms | 761.9us | 8.986ms | 902.1us | 0.016% | 76 | | okaywal-04t | 2.347ms | 787.1us | 8.853ms | 1.084ms | 0.016% | 77 | | okaywal-08t | 2.798ms | 810.8us | 12.53ms | 1.168ms | 0.014% | 78 | | okaywal-16t | 2.151ms | 840.5us | 14.74ms | 1.201ms | 0.008% | 79 | 80 | ### commit-1MB 81 | 82 | | Label | avg | min | max | stddev | out% | 83 | |-------------|---------|---------|---------|---------|--------| 84 | | okaywal-01t | 7.018ms | 5.601ms | 9.865ms | 788.2us | 0.027% | 85 | | okaywal-02t | 11.06ms | 4.281ms | 20.14ms | 3.521ms | 0.000% | 86 | | okaywal-04t | 19.77ms | 5.094ms | 73.21ms | 8.794ms | 0.007% | 87 | | okaywal-08t | 25.06ms | 2.871ms | 
97.60ms | 17.33ms | 0.002% | 88 | | okaywal-16t | 19.01ms | 3.480ms | 58.85ms | 7.195ms | 0.014% | 89 | 90 | These numbers are the time taken for a single thread to perform an atomic and 91 | durable write of a given size to the log file. Despite using a single-file 92 | approach, we are able to keep average write times very low even with a large 93 | number of simultaneous writers. 94 | 95 | ## How OkayWAL works 96 | 97 | OkayWAL streams incoming data into "segments". Each segment file is named with 98 | the format `wal-{id}`. The id of a segment file refers to the first `EntryId` 99 | that could appear within the segment file. 100 | 101 | Segment files are pre-allocated to the length configured in 102 | `Configuration::preallocate_bytes`. Preallocating files is critical for 103 | performance, as overwriting existing bytes in general is less expensive than 104 | allocating new bytes on disk. 105 | 106 | OkayWAL always has a current segment file. When a new entry is written, it 107 | always goes to the current segment file. When an entry is completed, the length 108 | of the segment file is checked against `Configuration::checkpoint_after_bytes`. 109 | If enough data has been written to trigger a checkpoint, the file is sent to the 110 | checkpointing thread and a new segment file is activated. 111 | 112 | Regardless of whether the file is checkpointed, before control returns from 113 | committing an entry, the file is `fsync`ed. `fsync` operations are batched, 114 | allowing multiple entries to be written by separate threads during the same 115 | `fsync` operation. 116 | 117 | OkayWAL also keeps track of any time a new file is created or a file is renamed. 118 | As needed, the directory containing the write-ahead logs is also `fsync`ed to 119 | ensure necessary file and directory metadata is fully synchronized. Just like 120 | file `fsync` batching, OkayWAL also automatically batches directory `fsync`s 121 | across threads. 
122 | 123 | ### Checkpointing a segment file (Background Thread) 124 | 125 | The checkpointing thread holds a weak reference to the `WriteAheadLog` data. 126 | When a file is received by the thread to checkpoint, it will upgrade the weak 127 | reference. If it cannot, the checkpointing thread shuts down gracefully and the 128 | recovery process will send the file again for checkpointing the next time the 129 | log is opened. 130 | 131 | The thread invokes `LogManager::checkpoint_to` for the file, allowing the 132 | `LogManager` to make any needed changes to persist the data stored in the 133 | segment being checkpointed. 134 | 135 | After the `LogManager` finishes, the file is renamed to include `-cp` as its 136 | suffix. Until this step, readers are able to be opened against data stored in 137 | the file being checkpointed. Once the file is renamed, new readers will begin 138 | returning not found errors. 139 | 140 | After the file is renamed, the checkpointer waits for all outstanding readers to 141 | finish reading data. The file is then finally recycled by moving it to the 142 | inactive files list. 143 | 144 | ### Activating a new segment file 145 | 146 | If there are any files in the inactive files list, one is reused. Otherwise, a 147 | new file is created and filled with 0's to the configured preallocation length. 148 | 149 | The file's name is set to `wal-{next EntryId}`. For example, a brand new 150 | write-ahead log's first segment file will be named `wal-1`, and the first 151 | `EntryId` written will be `1`. 152 | 153 | ### Segment File Format 154 | 155 | Each segment file starts with this header: 156 | 157 | - `okw`: Three byte magic code 158 | - OkayWAL Version: Single byte version number. Currently 0. 159 | - `Configuration::version_info` length: Single byte. The embedded information 160 | must be at most 255 bytes long. 161 | - Embedded Version Info: The bytes of the version info. The previous byte 162 | controls how many bytes long this field is. 
163 | 164 | After this header, the file is a series of entries, each of which contains a series 165 | of chunks. A byte with a value of 1 signifies a new entry. Any other byte causes 166 | the reader to stop reading entries from the file. 167 | 168 | The first 8 bytes of the entry are the little-endian representation of its 169 | `EntryId`. 170 | 171 | After the `EntryId`, a series of chunks is expected. A byte with a value of 2 172 | signals that a chunk is next in the file. A byte with a value of 3 signals that 173 | this is the end of the current entry being written. Any other byte causes the 174 | `SegmentReader` to return an AbortedEntry result. Any already-read chunks from 175 | this entry should be ignored/rolled back by the `LogManager`. 176 | 177 | The first four bytes of a chunk are the data length in little-endian 178 | representation. The data for the chunk follows. 179 | 180 | Finally, a four-byte CRC-32C checksum of the chunk's data, in little-endian representation, ends the chunk. 181 | 182 | If a reader does not encounter a new chunk marker (2) or an end-of-entry marker 183 | (3), the entry should be considered abandoned and all chunks should be ignored. 184 | 185 | [basic-example]: $src-base$/examples/basic.rs 186 | [wal]: https://en.wikipedia.org/wiki/Write-ahead_logging 187 | -------------------------------------------------------------------------------- /src/entry.rs: -------------------------------------------------------------------------------- 1 | use std::io::{self, Read, Write}; 2 | 3 | use crc32c::crc32c_append; 4 | use file_manager::FileManager; 5 | use parking_lot::MutexGuard; 6 | 7 | use crate::{ 8 | log_file::{LogFile, LogFileWriter}, 9 | to_io_result::ToIoResult, 10 | WriteAheadLog, WriteResult, 11 | }; 12 | 13 | /// A writer for an entry in a [`WriteAheadLog`]. 14 | /// 15 | /// Only one writer can be active for a given [`WriteAheadLog`] at any given 16 | /// time. See [`WriteAheadLog::begin_entry()`] for more information. 
17 | #[derive(Debug)] 18 | pub struct EntryWriter<'a, M> 19 | where 20 | M: FileManager, 21 | { 22 | id: EntryId, 23 | log: &'a WriteAheadLog, 24 | file: Option>, 25 | original_length: u64, 26 | } 27 | 28 | pub const NEW_ENTRY: u8 = 1; 29 | pub const CHUNK: u8 = 2; 30 | pub const END_OF_ENTRY: u8 = 3; 31 | 32 | impl<'a, M> EntryWriter<'a, M> 33 | where 34 | M: FileManager, 35 | { 36 | pub(super) fn new( 37 | log: &'a WriteAheadLog, 38 | id: EntryId, 39 | file: LogFile, 40 | ) -> io::Result { 41 | let mut writer = file.lock(); 42 | let original_length = writer.position(); 43 | 44 | writer.write_all(&[NEW_ENTRY])?; 45 | writer.write_all(&id.0.to_le_bytes())?; 46 | drop(writer); 47 | 48 | Ok(Self { 49 | id, 50 | log, 51 | file: Some(file), 52 | original_length, 53 | }) 54 | } 55 | 56 | /// Returns the unique id of the log entry being written. 57 | #[must_use] 58 | pub const fn id(&self) -> EntryId { 59 | self.id 60 | } 61 | 62 | /// Commits this entry to the log. Once this call returns, all data is 63 | /// atomically updated and synchronized to disk. 64 | /// 65 | /// While the entry is being committed, other writers will be allowed to 66 | /// write to the log. See [`WriteAheadLog::begin_entry()`] for more 67 | /// information. 68 | pub fn commit(self) -> io::Result { 69 | self.commit_and(|_file| Ok(())) 70 | } 71 | 72 | pub(crate) fn commit_and) -> io::Result<()>>( 73 | mut self, 74 | callback: F, 75 | ) -> io::Result { 76 | let file = self.file.take().expect("already committed"); 77 | 78 | let mut writer = file.lock(); 79 | 80 | writer.write_all(&[END_OF_ENTRY])?; 81 | let new_length = writer.position(); 82 | callback(&mut writer)?; 83 | writer.set_last_entry_id(Some(self.id)); 84 | drop(writer); 85 | 86 | self.log.reclaim(file, WriteResult::Entry { new_length })?; 87 | 88 | Ok(self.id) 89 | } 90 | 91 | /// Abandons this entry, preventing the entry from being recovered in the 92 | /// future. 
This is automatically done when dropped, but errors that occur 93 | /// during drop will panic. 94 | pub fn rollback(mut self) -> io::Result<()> { 95 | self.rollback_session() 96 | } 97 | 98 | fn rollback_session(&mut self) -> io::Result<()> { 99 | let file = self.file.take().expect("file already dropped"); 100 | 101 | let mut writer = file.lock(); 102 | writer.revert_to(self.original_length)?; 103 | drop(writer); 104 | 105 | self.log.reclaim(file, WriteResult::RolledBack).unwrap(); 106 | 107 | Ok(()) 108 | } 109 | 110 | /// Appends a chunk of data to this log entry. Each chunk of data is able to 111 | /// be read using [`Entry::read_chunk`](crate::Entry). 112 | pub fn write_chunk(&mut self, data: &[u8]) -> io::Result { 113 | let mut writer = self.begin_chunk(u32::try_from(data.len()).to_io()?)?; 114 | writer.write_all(data)?; 115 | writer.finish() 116 | } 117 | 118 | /// Begins writing a chunk with the given `length`. 119 | /// 120 | /// The writer returned already contains an internal buffer. This function 121 | /// can be used to write a complex payload without needing to first 122 | /// combine it in another buffer. 
123 | pub fn begin_chunk(&mut self, length: u32) -> io::Result> { 124 | let mut file = self.file.as_ref().expect("already dropped").lock(); 125 | 126 | let position = LogPosition { 127 | file_id: file.id(), 128 | offset: file.position(), 129 | }; 130 | 131 | file.write_all(&[CHUNK])?; 132 | file.write_all(&length.to_le_bytes())?; 133 | 134 | Ok(ChunkWriter { 135 | file, 136 | position, 137 | length, 138 | bytes_remaining: length, 139 | crc32: 0, 140 | finished: false, 141 | }) 142 | } 143 | } 144 | 145 | impl<'a, M> Drop for EntryWriter<'a, M> 146 | where 147 | M: FileManager, 148 | { 149 | fn drop(&mut self) { 150 | if self.file.is_some() { 151 | self.rollback_session().unwrap(); 152 | } 153 | } 154 | } 155 | 156 | pub struct ChunkWriter<'a, F> 157 | where 158 | F: file_manager::File, 159 | { 160 | file: MutexGuard<'a, LogFileWriter>, 161 | position: LogPosition, 162 | length: u32, 163 | bytes_remaining: u32, 164 | crc32: u32, 165 | finished: bool, 166 | } 167 | 168 | impl<'a, F> ChunkWriter<'a, F> 169 | where 170 | F: file_manager::File, 171 | { 172 | pub fn finish(mut self) -> io::Result { 173 | self.write_tail()?; 174 | Ok(ChunkRecord { 175 | position: self.position, 176 | crc: self.crc32, 177 | length: self.length, 178 | }) 179 | } 180 | 181 | fn write_tail(&mut self) -> io::Result<()> { 182 | self.finished = true; 183 | 184 | if self.bytes_remaining != 0 { 185 | return Err(io::Error::new( 186 | io::ErrorKind::Other, 187 | "written length does not match expected length", 188 | )); 189 | } 190 | 191 | self.file.write_all(&self.crc32.to_le_bytes()) 192 | } 193 | } 194 | 195 | impl<'a, F> Drop for ChunkWriter<'a, F> 196 | where 197 | F: file_manager::File, 198 | { 199 | fn drop(&mut self) { 200 | if !self.finished { 201 | self.write_tail() 202 | .expect("chunk writer dropped without finishing"); 203 | } 204 | } 205 | } 206 | 207 | impl<'a, F> Write for ChunkWriter<'a, F> 208 | where 209 | F: file_manager::File, 210 | { 211 | fn write(&mut self, buf: &[u8]) -> 
io::Result { 212 | let bytes_to_write = buf 213 | .len() 214 | .min(usize::try_from(self.bytes_remaining).to_io()?); 215 | 216 | let bytes_written = self.file.write(&buf[..bytes_to_write])?; 217 | if bytes_written > 0 { 218 | self.bytes_remaining -= u32::try_from(bytes_written).to_io()?; 219 | self.crc32 = crc32c_append(self.crc32, &buf[..bytes_written]); 220 | } 221 | Ok(bytes_written) 222 | } 223 | 224 | fn flush(&mut self) -> io::Result<()> { 225 | Ok(()) 226 | } 227 | } 228 | 229 | /// The position of a chunk of data within a [`WriteAheadLog`]. 230 | #[derive(Debug, Clone, Copy, Eq, PartialEq)] 231 | pub struct LogPosition { 232 | pub(crate) file_id: u64, 233 | pub(crate) offset: u64, 234 | } 235 | 236 | impl LogPosition { 237 | /// The number of bytes required to serialize a `LogPosition` using 238 | /// [`LogPosition::serialize_to()`]. 239 | pub const SERIALIZED_LENGTH: u8 = 16; 240 | 241 | /// Serializes this position to `destination`. 242 | /// 243 | /// This writes [`LogPosition::SERIALIZED_LENGTH`] bytes to `destination`. 244 | pub fn serialize_to(&self, mut destination: W) -> io::Result<()> { 245 | let mut all_bytes = [0; 16]; 246 | all_bytes[..8].copy_from_slice(&self.file_id.to_le_bytes()); 247 | all_bytes[8..].copy_from_slice(&self.offset.to_le_bytes()); 248 | destination.write_all(&all_bytes) 249 | } 250 | 251 | /// Deserializes a `LogPosition` from `read`. 252 | /// 253 | /// This reads [`LogPosition::SERIALIZED_LENGTH`] bytes from `read` and 254 | /// returns the deserialized log position. 
255 | pub fn deserialize_from(mut read: R) -> io::Result { 256 | let mut all_bytes = [0; 16]; 257 | read.read_exact(&mut all_bytes)?; 258 | 259 | let file_id = u64::from_le_bytes(all_bytes[..8].try_into().expect("u64 is 8 bytes")); 260 | let offset = u64::from_le_bytes(all_bytes[8..].try_into().expect("u64 is 8 bytes")); 261 | 262 | Ok(Self { file_id, offset }) 263 | } 264 | } 265 | 266 | #[test] 267 | fn log_position_serialization() { 268 | let position = LogPosition { 269 | file_id: 1, 270 | offset: 2, 271 | }; 272 | let mut serialized = Vec::new(); 273 | position.serialize_to(&mut serialized).unwrap(); 274 | assert_eq!( 275 | serialized.len(), 276 | usize::from(LogPosition::SERIALIZED_LENGTH) 277 | ); 278 | let deserialized = LogPosition::deserialize_from(&serialized[..]).unwrap(); 279 | assert_eq!(position, deserialized); 280 | } 281 | 282 | /// A record of a chunk that was written to a [`WriteAheadLog`]. 283 | #[derive(Debug, Clone, Copy, Eq, PartialEq)] 284 | pub struct ChunkRecord { 285 | /// The position of the chunk. 286 | pub position: LogPosition, 287 | /// The CRC calculated for the chunk. 288 | pub crc: u32, 289 | /// The length of the data contained inside of the chunk. 290 | pub length: u32, 291 | } 292 | 293 | /// The unique id of an entry written to a [`WriteAheadLog`]. These IDs are 294 | /// ordered by the time the [`EntryWriter`] was created for the entry written with this id. 295 | #[derive(Debug, Eq, PartialEq, Ord, PartialOrd, Copy, Clone, Default, Hash)] 296 | pub struct EntryId(pub u64); 297 | -------------------------------------------------------------------------------- /src/.crate-docs.md: -------------------------------------------------------------------------------- 1 | A [write-ahead log (WAL)][wal] implementation for Rust. 2 | 3 | > There's The Great Wall, and then there's this: an okay WAL. 4 | 5 | **WARNING: This crate is early in development. 
Please do not use in any 6 | production projects until this has been incorporated into 7 | [Sediment](https://github.com/khonsulabs/sediment) and shipping as part of 8 | [Nebari](https://github.com/khonsulabs/nebari). The file format is currently 9 | considered unstable.** 10 | 11 | ![okaywal forbids unsafe code](https://img.shields.io/badge/unsafe-forbid-success) 12 | [![crate version](https://img.shields.io/crates/v/okaywal.svg)](https://crates.io/crates/okaywal) 13 | [![Live Build Status](https://img.shields.io/github/actions/workflow/status/khonsulabs/okaywal/rust.yml?branch=main)](https://github.com/khonsulabs/okaywal/actions?query=workflow:Tests) 14 | [![HTML Coverage Report for `main` branch](https://khonsulabs.github.io/okaywal/coverage/badge.svg)](https://khonsulabs.github.io/okaywal/coverage/) 15 | [![Documentation](https://img.shields.io/badge/docs-main-informational)](https://khonsulabs.github.io/okaywal/main/okaywal) 16 | 17 | This crate exposes a WAL that supports: 18 | 19 | - Atomic and Durable writes from multiple threads. 20 | - Random access for previously written data. 21 | - Automatic checkpointing to allow reusing disk space and 22 | preventing the WAL from growing too large. 23 | - Interactive recovery process with basic data versioning support. 24 | 25 | ## Basic How-To 26 | 27 | [`WriteAheadLog::recover()`](WriteAheadLog::recover) is used to create or recover a WAL 28 | in a given directory. To open a log, an implementer of 29 | [`LogManager`](LogManager) must be provided. This trait is how 30 | OkayWAL communicates with your code when recovering or checkpointing a log. 31 | 32 | The [basic example][basic-example] shows this process with many comments 33 | describing how OkayWAL works. 34 | 35 | ```rust,ignore 36 | // Open a log using a Checkpointer that echoes the information passed into each 37 | // function that the Checkpointer trait defines. 
38 | let log = WriteAheadLog::recover("my-log", LoggingCheckpointer)?; 39 | 40 | // Begin writing an entry to the log. 41 | let mut writer = log.begin_entry()?; 42 | 43 | // Each entry is one or more chunks of data. Each chunk can be individually 44 | // addressed using its LogPosition. 45 | let record = writer.write_chunk("this is the first entry".as_bytes())?; 46 | 47 | // To fully flush all written bytes to disk and make the new entry 48 | // resilient to a crash, the writer must be committed. 49 | writer.commit()?; 50 | ``` 51 | 52 | ## Multi-Threaded Writing 53 | 54 | Optimized writing to the log from multiple threads is handled automatically. 55 | Only one thread may access the active log file at any moment in time. Because 56 | the slowest part of writing data to disk is `fsync`, OkayWAL manages 57 | synchronizing multiple writers such that a single `fsync` call can be made for 58 | multiple writes. 59 | 60 | This can be demonstrated by running the benchmark suite: `cargo bench -p 61 | benchmarks`: 62 | 63 | ### commit-256B 64 | 65 | | Label | avg | min | max | stddev | out% | 66 | |-------------|---------|---------|---------|---------|--------| 67 | | okaywal-01t | 1.001ms | 617.5us | 7.924ms | 557.3us | 0.016% | 68 | | okaywal-02t | 1.705ms | 617.3us | 11.38ms | 912.1us | 0.006% | 69 | | okaywal-04t | 1.681ms | 622.4us | 9.688ms | 671.4us | 0.021% | 70 | | okaywal-08t | 1.805ms | 656.5us | 13.88ms | 1.001ms | 0.014% | 71 | | okaywal-16t | 1.741ms | 643.2us | 7.895ms | 796.4us | 0.028% | 72 | 73 | ### commit-1KB 74 | 75 | | Label | avg | min | max | stddev | out% | 76 | |-------------|---------|---------|---------|---------|--------| 77 | | okaywal-01t | 959.3us | 621.9us | 7.419ms | 584.4us | 0.012% | 78 | | okaywal-02t | 1.569ms | 627.5us | 7.986ms | 1.007ms | 0.028% | 79 | | okaywal-04t | 1.856ms | 650.5us | 11.14ms | 1.087ms | 0.017% | 80 | | okaywal-08t | 2.054ms | 697.3us | 11.04ms | 1.066ms | 0.021% | 81 | | okaywal-16t | 1.875ms | 641.5us | 8.193ms 
| 674.6us | 0.032% | 82 | 83 | ### commit-4KB 84 | 85 | | Label | avg | min | max | stddev | out% | 86 | |-------------|---------|---------|---------|---------|--------| 87 | | okaywal-01t | 1.242ms | 748.8us | 6.902ms | 982.4us | 0.008% | 88 | | okaywal-02t | 1.767ms | 761.9us | 8.986ms | 902.1us | 0.016% | 89 | | okaywal-04t | 2.347ms | 787.1us | 8.853ms | 1.084ms | 0.016% | 90 | | okaywal-08t | 2.798ms | 810.8us | 12.53ms | 1.168ms | 0.014% | 91 | | okaywal-16t | 2.151ms | 840.5us | 14.74ms | 1.201ms | 0.008% | 92 | 93 | ### commit-1MB 94 | 95 | | Label | avg | min | max | stddev | out% | 96 | |-------------|---------|---------|---------|---------|--------| 97 | | okaywal-01t | 7.018ms | 5.601ms | 9.865ms | 788.2us | 0.027% | 98 | | okaywal-02t | 11.06ms | 4.281ms | 20.14ms | 3.521ms | 0.000% | 99 | | okaywal-04t | 19.77ms | 5.094ms | 73.21ms | 8.794ms | 0.007% | 100 | | okaywal-08t | 25.06ms | 2.871ms | 97.60ms | 17.33ms | 0.002% | 101 | | okaywal-16t | 19.01ms | 3.480ms | 58.85ms | 7.195ms | 0.014% | 102 | 103 | These numbers are the time taken for a single thread to perform an atomic and 104 | durable write of a given size to the log file. Despite using a single-file 105 | approach, we are able to keep average write times very low even with a large 106 | number of simultaneous writers. 107 | 108 | ## How OkayWAL works 109 | 110 | OkayWAL streams incoming data into "segments". Each segment file is named with 111 | the format `wal-{id}`. The id of a segment file refers to the first `EntryId` 112 | that could appear within the segment file. 113 | 114 | Segment files are pre-allocated to the length configured in 115 | `Configuration::preallocate_bytes`. Preallocating files is critical for 116 | performance, as overwriting existing bytes in general is less expensive than 117 | allocating new bytes on disk. 118 | 119 | OkayWAL always has a current segment file. When a new entry is written, it 120 | always goes to the current segment file. 
When an entry is completed, the length 121 | of the segment file is checked against `Configuration::checkpoint_after_bytes`. 122 | If enough data has been written to trigger a checkpoint, the file is sent to the 123 | checkpointing thread and a new segment file is activated. 124 | 125 | Regardless of whether the file is checkpointed, before control returns from 126 | committing an entry, the file is `fsync`ed. `fsync` operations are batched, 127 | allowing multiple entries to be written by separate threads during the same 128 | `fsync` operation. 129 | 130 | OkayWAL also keeps track of any time a new file is created or a file is renamed. 131 | As needed, the directory containing the write-ahead logs is also `fsync`ed to 132 | ensure necessary file and directory metadata is fully synchronized. Just like 133 | file `fsync` batching, OkayWAL also automatically batches directory `fsync`s 134 | across threads. 135 | 136 | ### Checkpointing a segment file (Background Thread) 137 | 138 | The checkpointing thread holds a weak reference to the `WriteAheadLog` data. 139 | When a file is received by the thread to checkpoint, it will upgrade the weak 140 | reference. If it cannot, the checkpointing thread shuts down gracefully and the 141 | recovery process will send the file again for checkpointing the next time the 142 | log is opened. 143 | 144 | The thread invokes `LogManager::checkpoint_to` for the file, allowing the 145 | `LogManager` to make any needed changes to persist the data stored in the 146 | segment being checkpointed. 147 | 148 | After the `LogManager` finishes, the file is renamed to include `-cp` as its 149 | suffix. Until this step, readers are able to be opened against data stored in 150 | the file being checkpointed. Once the file is renamed, new readers will begin 151 | returning not found errors. 152 | 153 | After the file is renamed, the checkpointer waits for all outstanding readers to 154 | finish reading data. 
The file is then finally recycled by moving it to the 155 | inactive files list. 156 | 157 | ### Activating a new segment file 158 | 159 | If there are any files in the inactive files list, one is reused. Otherwise, a 160 | new file is created and filled with 0's to the configured preallocation length. 161 | 162 | The file's name is set to `wal-{next EntryId}`. For example, a brand new 163 | write-ahead log's first segment file will be named `wal-1`, and the first 164 | `EntryId` written will be `1`. 165 | 166 | ### Segment File Format 167 | 168 | Each segment file starts with this header: 169 | 170 | - `okw`: Three-byte magic code 171 | - OkayWAL Version: Single-byte version number. Currently 0. 172 | - `Configuration::version_info` length: Single byte. The embedded information 173 | must be at most 255 bytes long. 174 | - Embedded Version Info: The bytes of the version info. The previous byte 175 | controls how many bytes long this field is. 176 | 177 | After this header, the file is a series of entries, each of which contains a series 178 | of chunks. A byte with a value of 1 signifies a new entry. Any other byte causes 179 | the reader to stop reading entries from the file. 180 | 181 | The first 8 bytes of the entry are the little-endian representation of its 182 | `EntryId`. 183 | 184 | After the `EntryId`, a series of chunks is expected. A byte with a value of 2 185 | signals that a chunk is next in the file. A byte with a value of 3 signals that 186 | this is the end of the current entry being written. Any other byte causes the 187 | `SegmentReader` to return an AbortedEntry result. Any already-read chunks from 188 | this entry should be ignored/rolled back by the `LogManager`. 189 | 190 | The first four bytes of a chunk are the data length in little-endian 191 | representation. The data for the chunk follows. 192 | 193 | Finally, a four-byte CRC-32 ends the chunk.
194 | 195 | If a reader does not encounter a new chunk marker (2) or an end-of-entry marker 196 | (3), the entry should be considered abandoned and all chunks should be ignored. 197 | 198 | [basic-example]: https://github.com/khonsulabs/okaywal/blob/main/examples/basic.rs 199 | [wal]: https://en.wikipedia.org/wiki/Write-ahead_logging 200 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # OkayWAL 2 | 3 | A [write-ahead log (WAL)][wal] implementation for Rust. 4 | 5 | > There's The Great Wall, and then there's this: an okay WAL. 6 | 7 | **WARNING: This crate is early in development. Please do not use in any 8 | production projects until this has been incorporated into 9 | [Sediment](https://github.com/khonsulabs/sediment) and shipping as part of 10 | [Nebari](https://github.com/khonsulabs/nebari). The file format is currently 11 | considered unstable.** 12 | 13 | ![okaywal forbids unsafe code](https://img.shields.io/badge/unsafe-forbid-success) 14 | [![crate version](https://img.shields.io/crates/v/okaywal.svg)](https://crates.io/crates/okaywal) 15 | [![Live Build Status](https://img.shields.io/github/actions/workflow/status/khonsulabs/okaywal/rust.yml?branch=main)](https://github.com/khonsulabs/okaywal/actions?query=workflow:Tests) 16 | [![HTML Coverage Report for `main` branch](https://khonsulabs.github.io/okaywal/coverage/badge.svg)](https://khonsulabs.github.io/okaywal/coverage/) 17 | [![Documentation](https://img.shields.io/badge/docs-main-informational)](https://khonsulabs.github.io/okaywal/main/okaywal) 18 | 19 | This crate exposes a WAL that supports: 20 | 21 | - Atomic and Durable writes from multiple threads. 22 | - Random access for previously written data. 23 | - Automatic checkpointing to allow reusing disk space and 24 | preventing the WAL from growing too large. 
25 | - Interactive recovery process with basic data versioning support. 26 | 27 | ## Basic How-To 28 | 29 | [`WriteAheadLog::recover()`](https://khonsulabs.github.io/okaywal/main/okaywal/struct.WriteAheadLog.html#method.recover) is used to create or recover a WAL 30 | in a given directory. To open a log, an implementer of 31 | [`LogManager`](https://khonsulabs.github.io/okaywal/main/okaywal/trait.LogManager.html) must be provided. This trait is how 32 | OkayWAL communicates with your code when recovering or checkpointing a log. 33 | 34 | The [basic example][basic-example] shows this process with many comments 35 | describing how OkayWAL works. 36 | 37 | ```rust,ignore 38 | // Open a log using a LogManager implementation that echoes the information passed into each 39 | // function that the LogManager trait defines. 40 | let log = WriteAheadLog::recover("my-log", LoggingCheckpointer)?; 41 | 42 | // Begin writing an entry to the log. 43 | let mut writer = log.begin_entry()?; 44 | 45 | // Each entry is one or more chunks of data. Each chunk can be individually 46 | // addressed using its LogPosition. 47 | let record = writer.write_chunk("this is the first entry".as_bytes())?; 48 | 49 | // To fully flush all written bytes to disk and make the new entry 50 | // resilient to a crash, the writer must be committed. 51 | writer.commit()?; 52 | ``` 53 | 54 | ## Multi-Threaded Writing 55 | 56 | Optimized writing to the log from multiple threads is handled automatically. 57 | Only one thread may access the active log file at any moment in time. Because 58 | the slowest part of writing data to disk is `fsync`, OkayWAL manages 59 | synchronizing multiple writers such that a single `fsync` call can be made for 60 | multiple writes.
61 | 62 | This can be demonstrated by running the benchmark suite: `cargo bench -p 63 | benchmarks`: 64 | 65 | ### commit-256B 66 | 67 | | Label | avg | min | max | stddev | out% | 68 | |-------------|---------|---------|---------|---------|--------| 69 | | okaywal-01t | 1.001ms | 617.5us | 7.924ms | 557.3us | 0.016% | 70 | | okaywal-02t | 1.705ms | 617.3us | 11.38ms | 912.1us | 0.006% | 71 | | okaywal-04t | 1.681ms | 622.4us | 9.688ms | 671.4us | 0.021% | 72 | | okaywal-08t | 1.805ms | 656.5us | 13.88ms | 1.001ms | 0.014% | 73 | | okaywal-16t | 1.741ms | 643.2us | 7.895ms | 796.4us | 0.028% | 74 | 75 | ### commit-1KB 76 | 77 | | Label | avg | min | max | stddev | out% | 78 | |-------------|---------|---------|---------|---------|--------| 79 | | okaywal-01t | 959.3us | 621.9us | 7.419ms | 584.4us | 0.012% | 80 | | okaywal-02t | 1.569ms | 627.5us | 7.986ms | 1.007ms | 0.028% | 81 | | okaywal-04t | 1.856ms | 650.5us | 11.14ms | 1.087ms | 0.017% | 82 | | okaywal-08t | 2.054ms | 697.3us | 11.04ms | 1.066ms | 0.021% | 83 | | okaywal-16t | 1.875ms | 641.5us | 8.193ms | 674.6us | 0.032% | 84 | 85 | ### commit-4KB 86 | 87 | | Label | avg | min | max | stddev | out% | 88 | |-------------|---------|---------|---------|---------|--------| 89 | | okaywal-01t | 1.242ms | 748.8us | 6.902ms | 982.4us | 0.008% | 90 | | okaywal-02t | 1.767ms | 761.9us | 8.986ms | 902.1us | 0.016% | 91 | | okaywal-04t | 2.347ms | 787.1us | 8.853ms | 1.084ms | 0.016% | 92 | | okaywal-08t | 2.798ms | 810.8us | 12.53ms | 1.168ms | 0.014% | 93 | | okaywal-16t | 2.151ms | 840.5us | 14.74ms | 1.201ms | 0.008% | 94 | 95 | ### commit-1MB 96 | 97 | | Label | avg | min | max | stddev | out% | 98 | |-------------|---------|---------|---------|---------|--------| 99 | | okaywal-01t | 7.018ms | 5.601ms | 9.865ms | 788.2us | 0.027% | 100 | | okaywal-02t | 11.06ms | 4.281ms | 20.14ms | 3.521ms | 0.000% | 101 | | okaywal-04t | 19.77ms | 5.094ms | 73.21ms | 8.794ms | 0.007% | 102 | | okaywal-08t | 25.06ms | 2.871ms 
| 97.60ms | 17.33ms | 0.002% | 103 | | okaywal-16t | 19.01ms | 3.480ms | 58.85ms | 7.195ms | 0.014% | 104 | 105 | These numbers are the time taken for a single thread to perform an atomic and 106 | durable write of a given size to the log file. Despite using a single-file 107 | approach, we are able to keep average write times very low even with a large 108 | number of simultaneous writers. 109 | 110 | ## How OkayWAL works 111 | 112 | OkayWAL streams incoming data into "segments". Each segment file is named with 113 | the format `wal-{id}`. The id of a segment file refers to the first `EntryId` 114 | that could appear within the segment file. 115 | 116 | Segment files are pre-allocated to the length configured in 117 | `Configuration::preallocate_bytes`. Preallocating files is critical for 118 | performance, as overwriting existing bytes in general is less expensive than 119 | allocating new bytes on disk. 120 | 121 | OkayWAL always has a current segment file. When a new entry is written, it 122 | always goes to the current segment file. When an entry is completed, the length 123 | of the segment file is checked against `Configuration::checkpoint_after_bytes`. 124 | If enough data has been written to trigger a checkpoint, the file is sent to the 125 | checkpointing thread and a new segment file is activated. 126 | 127 | Regardless of whether the file is checkpointed, before control returns from 128 | committing an entry, the file is `fsync`ed. `fsync` operations are batched, 129 | allowing multiple entries to be written by separate threads during the same 130 | `fsync` operation. 131 | 132 | OkayWAL also keeps track of any time a new file is created or a file is renamed. 133 | As needed, the directory containing the write-ahead logs is also `fsync`ed to 134 | ensure necessary file and directory metadata is fully synchronized. Just like 135 | file `fsync` batching, OkayWAL also automatically batches directory `fsync`s 136 | across threads. 
137 | 138 | ### Checkpointing a segment file (Background Thread) 139 | 140 | The checkpointing thread holds a weak reference to the `WriteAheadLog` data. 141 | When a file is received by the thread to checkpoint, it will upgrade the weak 142 | reference. If it cannot, the checkpointing thread shuts down gracefully and the 143 | recovery process will send the file again for checkpointing the next time the 144 | log is opened. 145 | 146 | The thread invokes `LogManager::checkpoint_to` for the file, allowing the 147 | `LogManager` to make any needed changes to persist the data stored in the 148 | segment being checkpointed. 149 | 150 | After the `LogManager` finishes, the file is renamed to include `-cp` as its 151 | suffix. Until this step, readers are able to be opened against data stored in 152 | the file being checkpointed. Once the file is renamed, new readers will begin 153 | returning not found errors. 154 | 155 | After the file is renamed, the checkpointer waits for all outstanding readers to 156 | finish reading data. The file is then finally recycled by moving it to the 157 | inactive files list. 158 | 159 | ### Activating a new segment file 160 | 161 | If there are any files in the inactive files list, one is reused. Otherwise, a 162 | new file is created and filled with 0's to the configured preallocation length. 163 | 164 | The file's name is set to `wal-{next EntryId}`. For example, a brand new 165 | write-ahead log's first segment file will be named `wal-1`, and the first 166 | `EntryId` written will be `1`. 167 | 168 | ### Segment File Format 169 | 170 | Each segment file starts with this header: 171 | 172 | - `okw`: Three byte magic code 173 | - OkayWAL Version: Single byte version number. Currently 0. 174 | - `Configuration::version_info` length: Single byte. The embedded information 175 | must be 255 or less bytes long. 176 | - Embedded Version Info: The bytes of the version info. The previous byte 177 | controls how many bytes long this field is. 
178 | 179 | After this header, the file is a series of entries, each of which contains a series 180 | of chunks. A byte with a value of 1 signifies a new entry. Any other byte causes 181 | the reader to stop reading entries from the file. 182 | 183 | The first 8 bytes of the entry are the little-endian representation of its 184 | `EntryId`. 185 | 186 | After the `EntryId`, a series of chunks is expected. A byte with a value of 2 187 | signals that a chunk is next in the file. A byte with a value of 3 signals that 188 | this is the end of the current entry being written. Any other byte causes the 189 | `SegmentReader` to return an AbortedEntry result. Any already-read chunks from 190 | this entry should be ignored/rolled back by the `LogManager`. 191 | 192 | The first four bytes of a chunk are the data length in little-endian 193 | representation. The data for the chunk follows. 194 | 195 | Finally, a four-byte CRC-32 ends the chunk. 196 | 197 | If a reader does not encounter a new chunk marker (2) or an end-of-entry marker 198 | (3), the entry should be considered abandoned and all chunks should be ignored. 199 | 200 | [basic-example]: https://github.com/khonsulabs/okaywal/blob/main/examples/basic.rs 201 | [wal]: https://en.wikipedia.org/wiki/Write-ahead_logging 202 | 203 | ## Open-source Licenses 204 | 205 | This project, like all projects from [Khonsu Labs](https://khonsulabs.com/), is 206 | open-source. This repository is available under the [MIT License](./LICENSE-MIT) 207 | or the [Apache License 2.0](./LICENSE-APACHE). 208 | 209 | To learn more about contributing, please see [CONTRIBUTING.md](./CONTRIBUTING.md). 210 | -------------------------------------------------------------------------------- /LICENSE-APACHE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1.
Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. 
For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. 
Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. 
In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. 
We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright 2021 Khonsu Labs LLC 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /benchmarks/benches/benchmarks.rs: -------------------------------------------------------------------------------- 1 | use std::{convert::Infallible, fmt::Display, sync::Arc}; 2 | 3 | use okaywal::{Configuration, LogVoid, WriteAheadLog}; 4 | use tempfile::TempDir; 5 | use timings::{Benchmark, BenchmarkImplementation, Label, LabeledTimings, Timings}; 6 | 7 | fn main() { 8 | let measurements = Timings::default(); 9 | let bench = Benchmark::for_each_config(vec![ 10 | InsertConfig { 11 | number_of_bytes: 256, 12 | iters: 500, 13 | }, 14 | InsertConfig { 15 | number_of_bytes: 1024, 16 | iters: 250, 17 | }, 18 | InsertConfig { 19 | number_of_bytes: 4096, 20 | iters: 125, 21 | }, 22 | InsertConfig { 23 | number_of_bytes: 1024 * 1024, 24 | iters: 75, 25 | }, 26 | ]) 27 | .with_each_number_of_threads([1, 2, 4, 8, 16]) 28 | .with::(); 29 | 30 | #[cfg(feature = "sharded-log")] 31 | let bench = bench.with::(); 32 | 33 | #[cfg(feature = "postgres")] 34 | let bench = bench.with::(); 35 | 36 | #[cfg(feature = "sqlite")] 37 | let bench = bench.with::(); 38 | 39 | 
bench.run(&measurements).unwrap(); 40 | 41 | let stats = measurements.wait_for_stats(); 42 | timings::print_table_summaries(&stats).unwrap(); 43 | } 44 | 45 | #[derive(Copy, Clone, Debug)] 46 | struct InsertConfig { 47 | number_of_bytes: usize, 48 | iters: usize, 49 | } 50 | 51 | struct OkayWal { 52 | config: InsertConfig, 53 | _dir: Arc, 54 | log: WriteAheadLog, 55 | } 56 | 57 | impl BenchmarkImplementation for OkayWal { 58 | type SharedConfig = (InsertConfig, Arc, WriteAheadLog); 59 | 60 | fn label(number_of_threads: usize, _config: &InsertConfig) -> Label { 61 | Label::from(format!("okaywal-{number_of_threads:02}t")) 62 | } 63 | 64 | fn initialize_shared_config( 65 | _number_of_threads: usize, 66 | config: &InsertConfig, 67 | ) -> Result { 68 | let dir = Arc::new(TempDir::new_in(".").unwrap()); 69 | let log = Configuration::default_for(&*dir).open(LogVoid).unwrap(); 70 | Ok((*config, dir, log)) 71 | } 72 | 73 | fn reset(_shutting_down: bool) -> Result<(), Infallible> { 74 | Ok(()) 75 | } 76 | 77 | fn initialize( 78 | _number_of_threads: usize, 79 | (config, dir, log): Self::SharedConfig, 80 | ) -> Result { 81 | Ok(Self { 82 | config, 83 | log, 84 | _dir: dir, 85 | }) 86 | } 87 | 88 | fn measure(&mut self, measurements: &LabeledTimings