├── nomt ├── src │ ├── sys │ │ ├── macos.rs │ │ ├── mod.rs │ │ ├── unix.rs │ │ └── linux.rs │ ├── beatree │ │ ├── branch │ │ │ └── mod.rs │ │ ├── leaf │ │ │ └── mod.rs │ │ ├── writeout.rs │ │ ├── benches.rs │ │ ├── index.rs │ │ └── leaf_cache.rs │ ├── bitbox │ │ ├── wal │ │ │ ├── mod.rs │ │ │ ├── tests.rs │ │ │ └── read.rs │ │ ├── writeout.rs │ │ ├── meta_map.rs │ │ └── ht_file.rs │ ├── store │ │ ├── page_loader.rs │ │ ├── flock.rs │ │ └── sync.rs │ ├── task.rs │ ├── merkle │ │ ├── page_set.rs │ │ └── cache_prepopulate.rs │ ├── io │ │ ├── unix.rs │ │ └── fsyncer.rs │ ├── seglog │ │ └── segment_filename.rs │ ├── rollback │ │ └── delta.rs │ ├── metrics.rs │ └── page_diff.rs ├── benches │ └── beatree.rs ├── tests │ ├── large_values.rs │ ├── exclusive_dir.rs │ ├── last_layer_trie.rs │ ├── compute_root.rs │ ├── add_remove.rs │ ├── wal.rs │ ├── prev_root_check.rs │ ├── witness_check.rs │ ├── extend_range_protocol.rs │ ├── fill_and_empty.rs │ └── overlay.rs └── Cargo.toml ├── fuzz ├── .gitignore ├── fuzz_targets │ ├── prefix_len.rs │ ├── separate.rs │ ├── common │ │ └── mod.rs │ ├── separator_len.rs │ ├── reconstruct_key.rs │ └── bitwise_memcpy.rs └── Cargo.toml ├── docs ├── images │ ├── nomt_put.png │ ├── nomt_pages.jpg │ ├── nomt_number_rule.png │ └── binary_merkle_patricia_tree.png ├── CONTRIBUTING.md └── nomt_specification.md ├── examples ├── commit_batch │ ├── src │ │ ├── main.rs │ │ └── lib.rs │ └── Cargo.toml ├── read_value │ ├── Cargo.toml │ └── src │ │ └── main.rs └── witness_verification │ ├── Cargo.toml │ └── src │ └── main.rs ├── .gitignore ├── trickfs ├── trickmnt │ ├── Cargo.toml │ └── src │ │ └── main.rs ├── Cargo.toml └── README.md ├── torture ├── src │ ├── main.rs │ ├── supervisor │ │ ├── controller.rs │ │ └── cli.rs │ ├── spawn.rs │ └── logging.rs └── Cargo.toml ├── Cargo.toml ├── .editorconfig ├── .github ├── actions │ └── install-fuse │ │ └── action.yml └── workflows │ ├── ci.yml │ └── bench.yml ├── core ├── src │ ├── lib.rs │ ├── proof │ │ └── mod.rs 
│ ├── page.rs │ ├── trie.rs │ └── hasher.rs └── Cargo.toml ├── LICENSE-MIT ├── benchtop ├── Cargo.toml └── src │ ├── main.rs │ ├── timer.rs │ ├── bench.rs │ ├── custom_workload.rs │ ├── backend.rs │ ├── transfer_workload.rs │ └── sp_trie.rs └── README.md /nomt/src/sys/macos.rs: -------------------------------------------------------------------------------- 1 | //! macOS-specific code. 2 | -------------------------------------------------------------------------------- /fuzz/.gitignore: -------------------------------------------------------------------------------- 1 | target 2 | corpus 3 | artifacts 4 | coverage 5 | -------------------------------------------------------------------------------- /docs/images/nomt_put.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/adsstudio24/nomt/HEAD/docs/images/nomt_put.png -------------------------------------------------------------------------------- /docs/images/nomt_pages.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/adsstudio24/nomt/HEAD/docs/images/nomt_pages.jpg -------------------------------------------------------------------------------- /docs/images/nomt_number_rule.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/adsstudio24/nomt/HEAD/docs/images/nomt_number_rule.png -------------------------------------------------------------------------------- /docs/images/binary_merkle_patricia_tree.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/adsstudio24/nomt/HEAD/docs/images/binary_merkle_patricia_tree.png -------------------------------------------------------------------------------- /examples/commit_batch/src/main.rs: -------------------------------------------------------------------------------- 1 | fn main() -> anyhow::Result<()> { 
2 | commit_batch::NomtDB::commit_batch().map(|_| ()) 3 | } 4 | -------------------------------------------------------------------------------- /nomt/src/beatree/branch/mod.rs: -------------------------------------------------------------------------------- 1 | pub use node::{body_size, BranchNode, BranchNodeBuilder, BranchNodeView, BRANCH_NODE_BODY_SIZE}; 2 | pub mod node; 3 | 4 | pub const BRANCH_NODE_SIZE: usize = 4096; 5 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Common ignores 2 | .DS_Store 3 | .idea 4 | .vscode 5 | .envrc 6 | 7 | /target 8 | 9 | # samply / benchtop 10 | profile.json 11 | /test 12 | /nomt/test 13 | 14 | # xtask 15 | /benchtop/regression.toml 16 | /benchtop/sov_db* 17 | /benchtop/nomt_db* 18 | /benchtop/sp_trie_db* 19 | /benchtop/target 20 | -------------------------------------------------------------------------------- /nomt/src/bitbox/wal/mod.rs: -------------------------------------------------------------------------------- 1 | const WAL_ENTRY_TAG_START: u8 = 1; 2 | const WAL_ENTRY_TAG_END: u8 = 2; 3 | const WAL_ENTRY_TAG_CLEAR: u8 = 3; 4 | const WAL_ENTRY_TAG_UPDATE: u8 = 4; 5 | 6 | pub use read::{WalBlobReader, WalEntry}; 7 | pub use write::WalBlobBuilder; 8 | 9 | mod read; 10 | mod write; 11 | 12 | #[cfg(test)] 13 | mod tests; 14 | -------------------------------------------------------------------------------- /nomt/src/sys/mod.rs: -------------------------------------------------------------------------------- 1 | //! Platform-specific code. 2 | //! 3 | //! At the moment we only target Linux and macOS. 4 | 5 | cfg_if::cfg_if! 
{ 6 | if #[cfg(target_os = "linux")] { 7 | pub mod linux; 8 | pub mod unix; 9 | } else if #[cfg(target_os = "macos")] { 10 | pub mod macos; 11 | pub mod unix; 12 | } 13 | } 14 | -------------------------------------------------------------------------------- /fuzz/fuzz_targets/prefix_len.rs: -------------------------------------------------------------------------------- 1 | #![no_main] 2 | 3 | mod common; 4 | 5 | use common::Run; 6 | use libfuzzer_sys::fuzz_target; 7 | use nomt::beatree::prefix_len; 8 | 9 | fuzz_target!(|run: Run| { 10 | let Run { 11 | prefix_bit_len, 12 | a, 13 | b, 14 | } = run; 15 | 16 | assert_eq!(prefix_bit_len, prefix_len(&a, &b)); 17 | }); 18 | -------------------------------------------------------------------------------- /nomt/benches/beatree.rs: -------------------------------------------------------------------------------- 1 | #[cfg(feature = "benchmarks")] 2 | use criterion::{criterion_group, criterion_main}; 3 | #[cfg(feature = "benchmarks")] 4 | use nomt::beatree::benches::beatree_benchmark; 5 | 6 | #[cfg(feature = "benchmarks")] 7 | criterion_group!(benches, beatree_benchmark); 8 | #[cfg(feature = "benchmarks")] 9 | criterion_main!(benches); 10 | 11 | #[cfg(not(feature = "benchmarks"))] 12 | fn main() {} 13 | -------------------------------------------------------------------------------- /trickfs/trickmnt/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "trickmnt" 3 | version = "0.1.0" 4 | authors.workspace = true 5 | homepage.workspace = true 6 | repository.workspace = true 7 | edition.workspace = true 8 | license.workspace = true 9 | 10 | [dependencies] 11 | trickfs = { path = ".." 
} 12 | clap = { version = "4.3.5", features = ["derive"] } 13 | env_logger = "0.11.6" 14 | log = "0.4.22" 15 | anyhow = "1.0.95" 16 | -------------------------------------------------------------------------------- /trickfs/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "trickfs" 3 | version = "0.1.0" 4 | authors.workspace = true 5 | homepage.workspace = true 6 | repository.workspace = true 7 | edition.workspace = true 8 | license.workspace = true 9 | 10 | [dependencies] 11 | fuser = { version = "0.15.1", features = ["abi-7-23"] } 12 | libc = "0.2.169" 13 | log = "0.4.22" 14 | tempfile = "3.15.0" 15 | 16 | [dev-dependencies] 17 | env_logger = "0.11.6" 18 | -------------------------------------------------------------------------------- /examples/read_value/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "read_value" 3 | version = "0.1.0" 4 | authors.workspace = true 5 | homepage.workspace = true 6 | repository.workspace = true 7 | edition.workspace = true 8 | license.workspace = true 9 | 10 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html 11 | 12 | [dependencies] 13 | nomt = { path = "../../nomt" } 14 | anyhow = "1.0.81" 15 | sha2 = "0.10.6" 16 | -------------------------------------------------------------------------------- /examples/commit_batch/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "commit-batch" 3 | version = "0.1.0" 4 | authors.workspace = true 5 | homepage.workspace = true 6 | repository.workspace = true 7 | edition.workspace = true 8 | license.workspace = true 9 | 10 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html 11 | 12 | [dependencies] 13 | nomt = { path = "../../nomt" } 14 | anyhow = "1.0.81" 15 | sha2 = "0.10.6" 16 | 
-------------------------------------------------------------------------------- /torture/src/main.rs: -------------------------------------------------------------------------------- 1 | use anyhow::Result; 2 | use tokio::net::UnixStream; 3 | 4 | mod agent; 5 | mod logging; 6 | mod message; 7 | mod spawn; 8 | mod supervisor; 9 | 10 | #[tokio::main] 11 | async fn main() -> Result<()> { 12 | if let Some(chan) = spawn::am_spawned() { 13 | let chan = UnixStream::from_std(chan)?; 14 | agent::run(chan).await?; 15 | } else { 16 | supervisor::run().await?; 17 | } 18 | Ok(()) 19 | } 20 | -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [workspace] 2 | resolver = "2" 3 | members = [ 4 | "core", 5 | "nomt", 6 | "fuzz", 7 | "torture", 8 | "examples/*", 9 | "trickfs", 10 | "trickfs/trickmnt", 11 | ] 12 | exclude = ["benchtop"] 13 | 14 | [workspace.package] 15 | authors = ["thrum"] 16 | homepage = "https://thrum.dev" 17 | repository = "https://github.com/thrumdev/nomt" 18 | edition = "2021" 19 | license = "MIT/Apache-2.0" 20 | 21 | [profile.release] 22 | debug = 1 23 | debug-assertions = true 24 | -------------------------------------------------------------------------------- /.editorconfig: -------------------------------------------------------------------------------- 1 | root = true 2 | [*] 3 | indent_style=space 4 | indent_size=4 5 | tab_width=4 6 | end_of_line=lf 7 | charset=utf-8 8 | trim_trailing_whitespace=true 9 | max_line_length=100 10 | insert_final_newline=true 11 | 12 | [*.yml] 13 | indent_style=space 14 | indent_size=2 15 | tab_width=8 16 | end_of_line=lf 17 | 18 | [*.sh] 19 | indent_style=space 20 | indent_size=4 21 | tab_width=8 22 | end_of_line=lf 23 | 24 | [*.json] 25 | indent_style=space 26 | indent_size=2 27 | tab_width=8 28 | end_of_line=lf 29 | 30 |
-------------------------------------------------------------------------------- /examples/witness_verification/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "witness_verification" 3 | version = "0.1.0" 4 | authors.workspace = true 5 | homepage.workspace = true 6 | repository.workspace = true 7 | edition.workspace = true 8 | license.workspace = true 9 | 10 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html 11 | 12 | [dependencies] 13 | nomt-core = { path = "../../core" } 14 | commit-batch = { path = "../commit_batch" } 15 | anyhow = "1.0.81" 16 | blake3 = "1.5.1" 17 | -------------------------------------------------------------------------------- /.github/actions/install-fuse/action.yml: -------------------------------------------------------------------------------- 1 | name: Install Ubuntu Dependencies 2 | description: "Installs dependencies on Ubuntu" 3 | 4 | runs: 5 | using: "composite" 6 | steps: 7 | - name: Update apt-get 8 | shell: bash 9 | run: sudo apt-get update 10 | 11 | - name: Install FUSE libraries 12 | shell: bash 13 | run: sudo apt-get install -y libfuse3-dev libfuse-dev 14 | 15 | - name: Allow non-root users to mount FUSE filesystems 16 | shell: bash 17 | run: echo "user_allow_other" | sudo tee -a /etc/fuse.conf 18 | -------------------------------------------------------------------------------- /core/src/lib.rs: -------------------------------------------------------------------------------- 1 | //! Core operations and types within the Nearly Optimal Merkle Trie. 2 | //! 3 | //! This crate defines the schema and basic operations over the merkle trie in a backend-agnostic 4 | //! manner. 5 | //! 6 | //! The core types and proof verification routines of this crate do not require the 7 | //! standard library, but do require Rust's alloc crate. 
8 | 9 | #![cfg_attr(all(not(feature = "std"), not(test)), no_std)] 10 | 11 | extern crate alloc; 12 | 13 | pub mod hasher; 14 | pub mod page; 15 | pub mod page_id; 16 | pub mod proof; 17 | pub mod trie; 18 | pub mod trie_pos; 19 | pub mod update; 20 | -------------------------------------------------------------------------------- /nomt/tests/large_values.rs: -------------------------------------------------------------------------------- 1 | mod common; 2 | 3 | use common::Test; 4 | 5 | #[test] 6 | fn large_values() { 7 | let mut t = Test::new("large_values"); 8 | 9 | let large1 = vec![1; 4096 * 128]; 10 | let large2 = vec![2; 4096 * 80 - 1245]; 11 | 12 | t.write_id(0, Some(large1.clone())); 13 | t.write_id(1, Some(large2.clone())); 14 | let _ = t.commit(); 15 | assert_eq!(&*t.read_id(0).unwrap(), &large1); 16 | assert_eq!(&*t.read_id(1).unwrap(), &large2); 17 | t.write_id(1, None); 18 | let _ = t.commit(); 19 | assert_eq!(&*t.read_id(0).unwrap(), &large1); 20 | assert!(t.read_id(1).is_none()); 21 | } 22 | -------------------------------------------------------------------------------- /fuzz/fuzz_targets/separate.rs: -------------------------------------------------------------------------------- 1 | #![no_main] 2 | 3 | mod common; 4 | 5 | use bitvec::{order::Msb0, view::BitView}; 6 | use common::Run; 7 | use libfuzzer_sys::fuzz_target; 8 | use nomt::beatree::separate; 9 | 10 | fuzz_target!(|run: Run| { 11 | let Run { 12 | prefix_bit_len, 13 | mut a, 14 | mut b, 15 | } = run; 16 | 17 | if a > b { 18 | std::mem::swap(&mut a, &mut b); 19 | } 20 | 21 | let mut expected = [0u8; 32]; 22 | expected.view_bits_mut::<Msb0>()[..prefix_bit_len + 1] 23 | .copy_from_bitslice(&b.view_bits::<Msb0>()[..prefix_bit_len + 1]); 24 | 25 | assert_eq!(expected, separate(&a, &b)); 26 | }); 27 | -------------------------------------------------------------------------------- /nomt/src/beatree/leaf/mod.rs: -------------------------------------------------------------------------------- 1 | // The
`LeafStore` struct manages leaves. It is responsible for their management (allocation and 2 | // deallocation) and for querying LNs by their LNID. 3 | // 4 | // It maintains an in-memory copy of the freelist to facilitate page management. Allocation 5 | // is performed in LIFO order. Allocations are performed in batches to amortize the IO for the 6 | // freelist and metadata updates (growing the file in case the freelist is empty). 7 | // 8 | // The leaf store doesn't perform caching. When queried, the leaf store returns a handle to a page. 9 | // As soon as the handle is dropped, the data becomes inaccessible and another disk roundtrip would 10 | // be required to access the data again. 11 | 12 | pub mod node; 13 | -------------------------------------------------------------------------------- /trickfs/README.md: -------------------------------------------------------------------------------- 1 | # trickfs 2 | 3 | A FUSE filesystem useful for failure injection. 4 | 5 | ## Using trickfs 6 | 7 | Typically you would not need to run trickfs directly, because it should be used as a dependency 8 | in other projects. However, if you want to test the filesystem, you can do so by running the 9 | following command: 10 | 11 | ```sh 12 | cargo run --release --bin trickmnt 13 | ``` 14 | 15 | ## Building 16 | 17 | Building the project requires fuse3 and fuse to be available.
On Ubuntu, you can install them with 18 | the following commands: 19 | 20 | ```sh 21 | sudo apt update 22 | sudo apt install libfuse3-dev libfuse-dev 23 | ``` 24 | 25 | On macOS you may need to install osxfuse: 26 | 27 | ```sh 28 | brew install macfuse 29 | ``` 30 | -------------------------------------------------------------------------------- /trickfs/trickmnt/src/main.rs: -------------------------------------------------------------------------------- 1 | use clap::Parser; 2 | 3 | #[derive(Parser, Debug)] 4 | #[command(author, version, about, long_about = None)] 5 | struct Args { 6 | /// Path to the directory where trickfs will be mounted 7 | #[arg(short, long, default_value = "/tmp/trick")] 8 | mountpoint: String, 9 | } 10 | 11 | fn waitline() { 12 | log::info!("press return to stop..."); 13 | let _ = std::io::stdin().read_line(&mut String::new()); 14 | } 15 | 16 | fn main() -> anyhow::Result<()> { 17 | env_logger::builder() 18 | .filter_level(log::LevelFilter::Info) 19 | .init(); 20 | 21 | let args = Args::parse(); 22 | 23 | let handle = trickfs::spawn_trick(args.mountpoint).unwrap(); 24 | waitline(); 25 | drop(handle); 26 | 27 | Ok(()) 28 | } 29 | -------------------------------------------------------------------------------- /nomt/src/beatree/writeout.rs: -------------------------------------------------------------------------------- 1 | //! The writeout logic for beatree. 2 | 3 | // As part of beatree writeout, we need to write BBN and LN files, resizing them to the correct 4 | // size beforehand. After the writes are completed (fsync'd), we wait for the MANIFEST to be 5 | // updated and then perform some cleanup. 
6 | 7 | use super::allocator::{PageNumber, Store}; 8 | use crate::io::{FatPage, IoHandle}; 9 | 10 | pub fn submit_freelist_write( 11 | io_handle: &IoHandle, 12 | store: &Store, 13 | free_list_pages: Vec<(PageNumber, FatPage)>, 14 | ) { 15 | for (pn, page) in free_list_pages { 16 | io_handle 17 | .send(crate::io::IoCommand { 18 | kind: crate::io::IoKind::Write(store.store_fd(), pn.0 as u64, page), 19 | user_data: 0, 20 | }) 21 | .unwrap(); 22 | } 23 | } 24 | -------------------------------------------------------------------------------- /nomt/src/store/page_loader.rs: -------------------------------------------------------------------------------- 1 | use crate::{bitbox, io::IoHandle}; 2 | use nomt_core::page_id::PageId; 3 | 4 | pub use bitbox::PageLoad; 5 | 6 | pub struct PageLoader { 7 | pub(super) inner: bitbox::PageLoader, 8 | } 9 | 10 | impl PageLoader { 11 | /// Create a new page load. 12 | pub fn start_load(&self, page_id: PageId) -> PageLoad { 13 | self.inner.start_load(page_id) 14 | } 15 | 16 | /// Advance the state of the given page load, blocking the current thread. 17 | /// 18 | /// Panics if the page load needs a completion or if the I/O pool is down. 19 | /// 20 | /// This returns `true` if the page request has been submitted and a completion will be 21 | /// coming. `false` means that the page is guaranteed to be fresh. 22 | pub fn probe(&self, load: &mut PageLoad, io_handle: &IoHandle, user_data: u64) -> bool { 23 | self.inner.probe(load, io_handle, user_data) 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /nomt/src/sys/unix.rs: -------------------------------------------------------------------------------- 1 | //! Common Unix definitions. 
2 | 3 | use std::{fs::File, os::fd::AsRawFd as _}; 4 | 5 | pub fn try_lock_exclusive(file: &File) -> std::io::Result<()> { 6 | cvt_r(|| unsafe { libc::flock(file.as_raw_fd(), libc::LOCK_EX | libc::LOCK_NB) }).map(drop) 7 | } 8 | 9 | pub fn unlock(file: &File) -> std::io::Result<()> { 10 | unsafe { cvt_r(|| libc::flock(file.as_raw_fd(), libc::LOCK_UN)).map(drop) } 11 | } 12 | 13 | pub(super) fn cvt_r<F>(mut f: F) -> std::io::Result<i32> 14 | where 15 | F: FnMut() -> i32, 16 | { 17 | fn cvt(res: i32) -> std::io::Result<i32> { 18 | if res == -1 { 19 | Err(std::io::Error::last_os_error()) 20 | } else { 21 | Ok(res) 22 | } 23 | } 24 | 25 | loop { 26 | match cvt(f()) { 27 | Err(ref e) if e.kind() == std::io::ErrorKind::Interrupted => (), 28 | other => break other, 29 | } 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /torture/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "torture" 3 | version = "0.1.0" 4 | edition = "2021" 5 | 6 | [dependencies] 7 | nix = { version = "0.29", features = ["process"] } 8 | libc = "0.2.147" 9 | anyhow = "1.0.72" 10 | cfg-if = "1.0.0" 11 | serde = { version = "1.0.216", features = ["derive"] } 12 | bincode = "1.3.3" 13 | nomt = { path = "../nomt" } 14 | tokio = { version = "1.42.0", features = ["full"] } 15 | tokio-util = { version = "0.7.13", features = ["codec"] } 16 | tokio-stream = "0.1.17" 17 | futures = "0.3.31" 18 | tempfile = "3.10" 19 | rand = "0.8.5" 20 | rand_pcg = "0.3.1" 21 | imbl = "3.0.0" 22 | tokio-serde = { version = "0.9.0", features = ["bincode"] } 23 | tracing = { version = "0.1.41", features = ["attributes"] } 24 | tracing-subscriber = { version = "0.3.19", features = ["env-filter"] } 25 | hex = "0.4.3" 26 | futures-util = "0.3.31" 27 | clap = { version = "4.5.23", features = ["derive"] } 28 | trickfs = { path = "../trickfs" } 29 | --------------------------------------------------------------------------------
/core/src/proof/mod.rs: -------------------------------------------------------------------------------- 1 | //! Trie proofs and proof verification. 2 | //! 3 | //! The Merkle Trie defined in NOMT is an authenticated data structure, which means that it permits 4 | //! efficient proving against the root. This module exposes types and functions necessary for 5 | //! handling these kinds of proofs. 6 | //! 7 | //! Using the types and functions exposed from this module, you can verify the value of a single 8 | //! key within the trie ([`PathProof`]), the values of multiple keys ([`MultiProof`]), or the result 9 | //! of updating a trie with a set of changes ([`verify_update`]). 10 | 11 | pub use multi_proof::{ 12 | verify as verify_multi_proof, MultiPathProof, MultiProof, MultiProofVerificationError, 13 | VerifiedMultiProof, 14 | }; 15 | pub use path_proof::{ 16 | verify_update, KeyOutOfScope, PathProof, PathProofTerminal, PathProofVerificationError, 17 | PathUpdate, VerifiedPathProof, VerifyUpdateError, 18 | }; 19 | 20 | mod multi_proof; 21 | mod path_proof; 22 | -------------------------------------------------------------------------------- /LICENSE-MIT: -------------------------------------------------------------------------------- 1 | Permission is hereby granted, free of charge, to any 2 | person obtaining a copy of this software and associated 3 | documentation files (the "Software"), to deal in the 4 | Software without restriction, including without 5 | limitation the rights to use, copy, modify, merge, 6 | publish, distribute, sublicense, and/or sell copies of 7 | the Software, and to permit persons to whom the Software 8 | is furnished to do so, subject to the following 9 | conditions: 10 | 11 | The above copyright notice and this permission notice 12 | shall be included in all copies or substantial portions 13 | of the Software. 
14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF 16 | ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED 17 | TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A 18 | PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT 19 | SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 20 | CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 21 | OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR 22 | IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 23 | DEALINGS IN THE SOFTWARE. 24 | -------------------------------------------------------------------------------- /nomt/src/store/flock.rs: -------------------------------------------------------------------------------- 1 | //! This module provides a cross-platform advisory lock on a directory. 2 | 3 | use std::{ 4 | fs::{File, OpenOptions}, 5 | path::Path, 6 | }; 7 | 8 | /// Represents a cross-platform advisory lock on a directory. 9 | pub struct Flock { 10 | lock_fd: File, 11 | } 12 | 13 | impl Flock { 14 | pub fn lock(db_dir: &Path, lock_filename: &str) -> anyhow::Result<Self> { 15 | let lock_path = db_dir.join(lock_filename); 16 | 17 | let lock_fd = OpenOptions::new() 18 | .read(true) 19 | .write(true) 20 | .create(true) 21 | .open(lock_path)?; 22 | 23 | match crate::sys::unix::try_lock_exclusive(&lock_fd) { 24 | Ok(_) => Ok(Self { lock_fd }), 25 | Err(e) => { 26 | anyhow::bail!("Failed to lock directory: {e}"); 27 | } 28 | } 29 | } 30 | } 31 | 32 | impl Drop for Flock { 33 | fn drop(&mut self) { 34 | if let Err(e) = crate::sys::unix::unlock(&self.lock_fd) { 35 | eprintln!("Failed to unlock directory lock: {e}"); 36 | } 37 | } 38 | } 39 | -------------------------------------------------------------------------------- /nomt/tests/exclusive_dir.rs: -------------------------------------------------------------------------------- 1 | //! Tests the directory lock behavior.
2 | 3 | use std::path::PathBuf; 4 | 5 | use nomt::{hasher::Blake3Hasher, Nomt, Options}; 6 | 7 | fn setup_nomt(path: &str, should_clean_up: bool) -> anyhow::Result<Nomt<Blake3Hasher>> { 8 | let path = { 9 | let mut p = PathBuf::from("test"); 10 | p.push(path); 11 | p 12 | }; 13 | if should_clean_up && path.exists() { 14 | std::fs::remove_dir_all(&path)?; 15 | } 16 | let mut o = Options::new(); 17 | o.path(path); 18 | o.bitbox_seed([0; 16]); 19 | Nomt::open(o) 20 | } 21 | 22 | #[test] 23 | fn smoke() { 24 | let _nomt = setup_nomt("smoke", true).unwrap(); 25 | } 26 | 27 | #[test] 28 | fn dir_lock() { 29 | let _nomt_1 = setup_nomt("dir_lock", true).unwrap(); 30 | let nomt_2 = setup_nomt("dir_lock", false); 31 | assert!(matches!(nomt_2, Err(e) if e.to_string().contains("Resource temporarily unavailable"))); 32 | } 33 | 34 | #[test] 35 | fn dir_unlock() { 36 | let nomt_1 = setup_nomt("dir_unlock", true).unwrap(); 37 | drop(nomt_1); 38 | let _nomt_2 = setup_nomt("dir_unlock", false).unwrap(); 39 | } 40 | -------------------------------------------------------------------------------- /core/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "nomt-core" 3 | description = "Core trie operations for NOMT" 4 | version = "0.1.0" 5 | authors.workspace = true 6 | homepage.workspace = true 7 | repository.workspace = true 8 | edition.workspace = true 9 | license.workspace = true 10 | 11 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html 12 | 13 | [dependencies] 14 | bitvec = { version = "1", default-features = false, features = ["alloc"] } 15 | hex = { version = "0.4.3", default-features = false, features = ["alloc"] } 16 | ruint = { version = "1.12.1", default-features = false } 17 | arrayvec = { version = "0.7", default-features = false } 18 | borsh = { version = ">=1.4, <1.5.0", default-features = false, features = ["derive"], optional = true } 19 | blake3 = { version = "1.5.1",
default-features = false, optional = true } 20 | sha2 = { version = "0.10.6" , default-features = false, optional = true } 21 | 22 | [dev-dependencies] 23 | blake3 = "1.5.1" 24 | 25 | [features] 26 | default = ["std", "blake3-hasher", "sha2-hasher"] 27 | std = ["bitvec/std", "borsh/std"] 28 | borsh = ["dep:borsh"] 29 | blake3-hasher = ["dep:blake3"] 30 | sha2-hasher = ["dep:sha2"] 31 | -------------------------------------------------------------------------------- /fuzz/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "nomt-fuzz" 3 | version = "0.0.0" 4 | publish = false 5 | edition = "2021" 6 | 7 | [package.metadata] 8 | cargo-fuzz = true 9 | 10 | [dependencies] 11 | libfuzzer-sys = "0.4" 12 | arbitrary = { version = "1.3.1", features = ["derive"] } 13 | tempfile = "3.10.1" 14 | bitvec = { version = "1" } 15 | 16 | [dependencies.nomt] 17 | path = "../nomt" 18 | features = ["fuzz"] 19 | 20 | [[bin]] 21 | name = "api_surface" 22 | path = "fuzz_targets/api_surface.rs" 23 | test = false 24 | doc = false 25 | bench = false 26 | 27 | [[bin]] 28 | name = "bitwise_memcpy" 29 | path = "fuzz_targets/bitwise_memcpy.rs" 30 | test = false 31 | doc = false 32 | bench = false 33 | 34 | [[bin]] 35 | name = "separate" 36 | path = "fuzz_targets/separate.rs" 37 | test = false 38 | doc = false 39 | bench = false 40 | 41 | [[bin]] 42 | name = "prefix_len" 43 | path = "fuzz_targets/prefix_len.rs" 44 | test = false 45 | doc = false 46 | bench = false 47 | 48 | [[bin]] 49 | name = "separator_len" 50 | path = "fuzz_targets/separator_len.rs" 51 | test = false 52 | doc = false 53 | bench = false 54 | 55 | [[bin]] 56 | name = "reconstruct_key" 57 | path = "fuzz_targets/reconstruct_key.rs" 58 | test = false 59 | doc = false 60 | bench = false 61 | -------------------------------------------------------------------------------- /fuzz/fuzz_targets/common/mod.rs: 
-------------------------------------------------------------------------------- 1 | use arbitrary::Arbitrary; 2 | use bitvec::{order::Msb0, view::BitView}; 3 | 4 | #[derive(Debug)] 5 | pub struct Run { 6 | pub prefix_bit_len: usize, 7 | pub a: [u8; 32], 8 | pub b: [u8; 32], 9 | } 10 | 11 | impl<'a> Arbitrary<'a> for Run { 12 | fn arbitrary(input: &mut arbitrary::Unstructured<'a>) -> arbitrary::Result<Self> { 13 | let prefix_bit_len = input.int_in_range(0..=255)?; 14 | let mut a = [0; 32]; 15 | let mut b = [0; 32]; 16 | input.fill_buffer(&mut a)?; 17 | input.fill_buffer(&mut b)?; 18 | b.view_bits_mut::<Msb0>()[0..prefix_bit_len] 19 | .copy_from_bitslice(&a.view_bits::<Msb0>()[0..prefix_bit_len]); 20 | 21 | let effective_prefix_bit_len = a 22 | .view_bits::<Msb0>() 23 | .iter() 24 | .zip(b.view_bits::<Msb0>().iter()) 25 | .take_while(|(a, b)| a == b) 26 | .count(); 27 | 28 | if effective_prefix_bit_len != prefix_bit_len { 29 | Err(arbitrary::Error::IncorrectFormat) 30 | } else { 31 | Ok(Self { 32 | prefix_bit_len, 33 | a, 34 | b, 35 | }) 36 | } 37 | } 38 | } 39 | -------------------------------------------------------------------------------- /nomt/tests/last_layer_trie.rs: -------------------------------------------------------------------------------- 1 | mod common; 2 | 3 | use common::Test; 4 | 5 | #[test] 6 | fn last_layer_trie() { 7 | let mut t = Test::new_with_params( 8 | "last_layer_trie", // name 9 | 1, // commit_concurrency 10 | 10_000, // hashtable_buckets 11 | None, // panic_on_sync 12 | true, // cleanup_dir 13 | ); 14 | 15 | let key1 = [170; 32]; 16 | let mut key2 = key1.clone(); 17 | key2[31] = 171; 18 | 19 | // write two leaf nodes at the last layer of the trie 20 | t.write(key1, Some(vec![1; 128])); 21 | t.write(key2, Some(vec![2; 128])); 22 | t.commit(); 23 | assert_eq!(t.read(key1), Some(vec![1; 128])); 24 | assert_eq!(t.read(key2), Some(vec![2; 128])); 25 | 26 | // modify two leaf nodes at the last layer of the trie 27 | t.write(key1, Some(vec![3; 100])); 28 | t.write(key2,
Some(vec![4; 100])); 29 | t.commit(); 30 | assert_eq!(t.read(key1), Some(vec![3; 100])); 31 | assert_eq!(t.read(key2), Some(vec![4; 100])); 32 | 33 | // delete two leaf nodes at the last layer of the trie 34 | t.write(key1, None); 35 | t.write(key2, None); 36 | t.commit(); 37 | assert_eq!(t.read(key1), None); 38 | assert_eq!(t.read(key2), None); 39 | } 40 | -------------------------------------------------------------------------------- /nomt/tests/compute_root.rs: -------------------------------------------------------------------------------- 1 | mod common; 2 | 3 | use common::Test; 4 | use nomt::{hasher::Blake3Hasher, trie::NodeKind}; 5 | 6 | #[test] 7 | fn root_on_empty_db() { 8 | let t = Test::new("compute_root_empty"); 9 | let root = t.root(); 10 | assert_eq!( 11 | NodeKind::of::<Blake3Hasher>(&root.into_inner()), 12 | NodeKind::Terminator 13 | ); 14 | } 15 | 16 | #[test] 17 | fn root_on_leaf() { 18 | { 19 | let mut t = Test::new("compute_root_leaf"); 20 | t.write([1; 32], Some(vec![1, 2, 3])); 21 | t.commit(); 22 | } 23 | 24 | let t = Test::new_with_params("compute_root_leaf", 1, 1, None, false); 25 | let root = t.root(); 26 | assert_eq!( 27 | NodeKind::of::<Blake3Hasher>(&root.into_inner()), 28 | NodeKind::Leaf 29 | ); 30 | } 31 | 32 | #[test] 33 | fn root_on_internal() { 34 | { 35 | let mut t = Test::new("compute_root_internal"); 36 | t.write([0; 32], Some(vec![1, 2, 3])); 37 | t.write([1; 32], Some(vec![1, 2, 3])); 38 | t.commit(); 39 | } 40 | 41 | let t = Test::new_with_params("compute_root_internal", 1, 1, None, false); 42 | let root = t.root(); 43 | assert_eq!( 44 | NodeKind::of::<Blake3Hasher>(&root.into_inner()), 45 | NodeKind::Internal 46 | ); 47 | } 48 | -------------------------------------------------------------------------------- /docs/CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | ## Contribute to NOMT 2 | 3 | We license all code under MIT / Apache2.0 licenses.
The maintainers reserve the right to refuse contributions and reject issues, even when useful. 4 | 5 | ## Formatting 6 | 7 | We use spaces for indentation and adhere to the vanilla `rustfmt` style. 8 | 9 | Format your code using `rustfmt`: 10 | 1. `rustup component add rustfmt` 11 | 2. `cargo fmt --all` 12 | 13 | ## Documentation Policy 14 | 15 | Well-commented code is readable code. We require all `pub` and `pub(crate)` items to be annotated with doc-strings. This leads to much better auto-generated documentation pages using `rustdoc` and a better experience for library users. 16 | 17 | Public modules and crates should begin with doc-strings which explain the purpose of the module and crate and assist the reader in determining where to proceed. 18 | 19 | ## Pull Requests and Tests 20 | 21 | We require that the entire test-suite passes for every merged PR. A PR is the responsibility of the author. In submitting a PR, you are consenting to become responsible for it and to continually improve, update, and request reviews for it until merged. Stale PRs are not the responsibility of the maintainers and may be closed. 22 | 23 | ## Code of Conduct 24 | 25 | We ask that all contributors maintain a respectful attitude towards each other.
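As a sketch of the documentation policy above, a module and a `pub` item might carry doc-strings like the following (the names here are hypothetical, not from the NOMT codebase):

```rust
//! Utilities for comparing keys (a module-level doc-string, rendered by `rustdoc`
//! as the module's overview page).

/// Returns the number of leading bytes shared by `a` and `b`.
///
/// Every `pub` and `pub(crate)` item carries a doc-string like this one.
pub fn shared_prefix_len(a: &[u8], b: &[u8]) -> usize {
    a.iter().zip(b.iter()).take_while(|(x, y)| x == y).count()
}

fn main() {
    // [0xaa, 0xbb, 0x01] and [0xaa, 0xbb, 0x02] share two leading bytes.
    assert_eq!(shared_prefix_len(&[0xaa, 0xbb, 0x01], &[0xaa, 0xbb, 0x02]), 2);
}
```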
-------------------------------------------------------------------------------- /fuzz/fuzz_targets/separator_len.rs: -------------------------------------------------------------------------------- 1 | #![no_main] 2 | 3 | use arbitrary::Arbitrary; 4 | use bitvec::{order::Msb0, view::BitView}; 5 | use libfuzzer_sys::fuzz_target; 6 | 7 | fuzz_target!(|run: Run| { 8 | let Run { 9 | separator_len, 10 | separator, 11 | } = run; 12 | 13 | assert_eq!(separator_len, nomt::beatree::separator_len(&separator)); 14 | }); 15 | 16 | #[derive(Debug)] 17 | struct Run { 18 | separator_len: usize, 19 | separator: [u8; 32], 20 | } 21 | 22 | impl<'a> Arbitrary<'a> for Run { 23 | fn arbitrary(input: &mut arbitrary::Unstructured<'a>) -> arbitrary::Result<Self> { 24 | let mut separator_len = input.int_in_range(0..=255)?; 25 | let mut separator = [0; 32]; 26 | input.fill_buffer(&mut separator)?; 27 | separator.view_bits_mut::<Msb0>()[separator_len..].fill(false); 28 | 29 | if separator == [0u8; 32] { 30 | separator_len = 1; 31 | } else { 32 | let effective_separator_len = 256 - separator.view_bits::<Msb0>().trailing_zeros(); 33 | if separator_len != effective_separator_len { 34 | return Err(arbitrary::Error::IncorrectFormat); 35 | } 36 | }; 37 | 38 | Ok(Self { 39 | separator_len, 40 | separator, 41 | }) 42 | } 43 | } 44 | -------------------------------------------------------------------------------- /nomt/src/task.rs: -------------------------------------------------------------------------------- 1 | pub type TaskResult<T> = std::thread::Result<T>; 2 | 3 | /// Spawn the given task within the given ThreadPool. 4 | /// Use the provided Sender to send the result of the task execution. 5 | /// 6 | /// The result will contain the effective result or the payload 7 | /// of the panic that occurred.
8 | pub fn spawn_task<F, R>( 9 | thread_pool: &threadpool::ThreadPool, 10 | task: F, 11 | tx: crossbeam_channel::Sender<TaskResult<R>>, 12 | ) where 13 | R: Send + 'static, 14 | F: FnOnce() -> R + Send + 'static, 15 | { 16 | thread_pool.execute(move || { 17 | let res = std::panic::catch_unwind(std::panic::AssertUnwindSafe(|| task())); 18 | let _ = tx.send(res); 19 | }); 20 | } 21 | 22 | /// Blocks waiting for completion of the task spawned with [`spawn_task`]. 23 | /// It requires the receiver associated with the sender used to spawn the task. 24 | /// 25 | /// Panics if the sender is dropped. 26 | pub fn join_task<R>(receiver: &crossbeam_channel::Receiver<TaskResult<R>>) -> R 27 | where 28 | R: Send + 'static, 29 | { 30 | // UNWRAP: The sender is not expected to be dropped by the spawned task. 31 | let res = receiver.recv().unwrap(); 32 | match res { 33 | Ok(res) => res, 34 | Err(err_payload) => std::panic::resume_unwind(err_payload), 35 | } 36 | } 37 | -------------------------------------------------------------------------------- /examples/read_value/src/main.rs: -------------------------------------------------------------------------------- 1 | use anyhow::Result; 2 | use nomt::{hasher::Blake3Hasher, KeyReadWrite, Nomt, Options, SessionParams, WitnessMode}; 3 | use sha2::Digest; 4 | 5 | const NOMT_DB_FOLDER: &str = "nomt_db"; 6 | 7 | fn main() -> Result<()> { 8 | // Define the options used to open NOMT 9 | let mut opts = Options::new(); 10 | opts.path(NOMT_DB_FOLDER); 11 | opts.commit_concurrency(1); 12 | 13 | // Open nomt database.
This will create the folder if it does not exist 14 | let nomt = Nomt::<Blake3Hasher>::open(opts)?; 15 | 16 | // Instantiate a new Session object to handle read and write operations 17 | // and generate a Witness later on 18 | let session = 19 | nomt.begin_session(SessionParams::default().witness_mode(WitnessMode::read_write())); 20 | 21 | // Reading a key from the database 22 | let key_path = sha2::Sha256::digest(b"key").into(); 23 | let value = session.read(key_path)?; 24 | 25 | // Even though this key is only being read, we ask NOMT to warm up the on-disk data because 26 | // we will prove the read. 27 | session.warm_up(key_path); 28 | 29 | let mut finished = session 30 | .finish(vec![(key_path, KeyReadWrite::Read(value))]) 31 | .unwrap(); 32 | let _witness = finished.take_witness(); 33 | finished.commit(&nomt)?; 34 | 35 | Ok(()) 36 | } 37 | -------------------------------------------------------------------------------- /nomt/src/beatree/benches.rs: -------------------------------------------------------------------------------- 1 | #![cfg(feature = "benchmarks")] 2 | 3 | use crate::beatree::{ 4 | branch::node::benches::*, leaf::node::benches::*, ops::benches::*, ops::bit_ops::benches::*, 5 | Key, 6 | }; 7 | use rand::RngCore; 8 | 9 | pub fn beatree_benchmark(c: &mut criterion::Criterion) { 10 | separate_benchmark(c); 11 | separator_len_benchmark(c); 12 | prefix_len_benchmark(c); 13 | search_branch_benchmark(c); 14 | leaf_search_benchmark(c); 15 | reconstruct_key_benchmark(c); 16 | branch_builder_benchmark(c); 17 | leaf_builder_benchmark(c); 18 | } 19 | 20 | // returns two keys a and b where b > a and b shares the first `shared_bytes` bytes with a 21 | pub fn get_key_pair(shared_bytes: usize) -> (Key, Key) { 22 | let mut rand = rand::thread_rng(); 23 | let mut a = [0; 32]; 24 | rand.fill_bytes(&mut a[0..shared_bytes]); 25 | 26 | // b > a 27 | let mut b = a.clone(); 28 | b[shared_bytes] = 1; 29 | 30 | (a, b) 31 | } 32 | 33 | // Get a vector containing `n` random keys that share the first
`shared_bytes` bytes 34 | pub fn get_keys(shared_bytes: usize, n: usize) -> Vec<Key> { 35 | let mut rand = rand::thread_rng(); 36 | let mut prefix = [0; 32]; 37 | rand.fill_bytes(&mut prefix[0..shared_bytes]); 38 | 39 | let mut keys = vec![]; 40 | for _ in 0..n { 41 | let mut key = prefix.clone(); 42 | rand.fill_bytes(&mut key[shared_bytes..]); 43 | keys.push(key); 44 | } 45 | 46 | keys 47 | } 48 | -------------------------------------------------------------------------------- /core/src/page.rs: -------------------------------------------------------------------------------- 1 | //! Pages: efficient node storage. 2 | //! 3 | //! Because each node in the trie is exactly 32 bytes, we can easily pack groups of nodes into 4 | //! a predictable paged representation regardless of the information in the trie. 5 | //! 6 | //! Each page is 4096 bytes and stores up to 126 nodes plus a unique 32-byte page identifier, 7 | //! with 32 bytes left over. 8 | //! 9 | //! A page stores a rootless sub-tree with depth 6: that is, it stores up to 10 | //! 2 + 4 + 8 + 16 + 32 + 64 nodes at known positions. 11 | //! Semantically, all nodes within the page should descend from the layer above, and the 12 | //! top two nodes are expected to be siblings. Each page logically has up to 64 child pages, which 13 | //! correspond to the rootless sub-tree descending from each of the 64 child nodes on the bottom 14 | //! layer. 15 | //! 16 | //! Every page is referred to by a unique ID, given by `parent_id * 2^6 + child_index + 1`, where 17 | //! the root page has ID `0x00..00`. The child index ranges from 0 to 63 and therefore can be 18 | //! represented as a 6 bit string. This module exposes functions for manipulating page IDs. 19 | //! 20 | //! The [`RawPage`] structure wraps a borrowed slice of 32-byte data and treats it as a page. 21 | 22 | /// Depth of the rootless sub-binary tree stored in a page 23 | pub const DEPTH: usize = 6; 24 | 25 | // Total number of nodes stored in one Page.
It depends on the `DEPTH` 26 | // of the rootless sub-binary tree stored in a page following this formula: 27 | // (2^(DEPTH + 1)) - 2 28 | pub const NODES_PER_PAGE: usize = (1 << (DEPTH + 1)) - 2; 29 | 30 | /// A raw, unsized page data slice. 31 | pub type RawPage = [[u8; 32]]; 32 | -------------------------------------------------------------------------------- /nomt/src/sys/linux.rs: -------------------------------------------------------------------------------- 1 | //! Linux-specific code. 2 | 3 | use super::unix::cvt_r; 4 | use std::fs::File; 5 | use std::os::fd::AsRawFd; 6 | 7 | /// Returns an instance of `FsCheck` for the given file. 8 | pub fn fs_check(file: &File) -> std::io::Result<FsCheck> { 9 | unsafe { 10 | // SAFETY: unsafe because ffi call. This should be IO-safe because the file is passed 11 | // by reference. This should be memory-safe because the `statfs` struct is 12 | // zeroed and the `f_type` field should be set by the ffi call. 13 | let mut stat: libc::statfs = std::mem::zeroed(); 14 | cvt_r(|| libc::fstatfs(file.as_raw_fd(), &mut stat))?; 15 | Ok(FsCheck { stat }) 16 | } 17 | } 18 | 19 | /// A utility struct to get filesystem information at a given path. 20 | pub struct FsCheck { 21 | stat: libc::statfs, 22 | } 23 | 24 | impl FsCheck { 25 | /// Returns true if the filesystem is tmpfs. 26 | pub fn is_tmpfs(&self) -> bool { 27 | self.stat.f_type == libc::TMPFS_MAGIC 28 | } 29 | } 30 | 31 | /// fallocate changes the size of the file to the given length if it's less than the current size. 32 | /// If the file is larger than the given length, the file is not truncated. 33 | /// 34 | /// Doesn't work on tmpfs. 35 | pub fn falloc_zero_file(file: &File, len: u64) -> std::io::Result<()> { 36 | cvt_r(|| unsafe { 37 | // SAFETY: unsafe because ffi call. This should be IO-safe because the file is passed 38 | // by reference.
39 | libc::fallocate( 40 | file.as_raw_fd(), 41 | libc::FALLOC_FL_ZERO_RANGE, 42 | 0 as _, 43 | len as _, 44 | ) 45 | }) 46 | .map(drop) 47 | } 48 | -------------------------------------------------------------------------------- /nomt/src/bitbox/writeout.rs: -------------------------------------------------------------------------------- 1 | //! The writeout logic for bitbox. 2 | 3 | // The logic for writeout is split into three parts: 4 | // - first we write out the wal blob to the WAL file and wait for the MANIFEST to be synced. 5 | // - then we write out the metabits and bucket pages to the HT file. 6 | // - finally, we truncate the WAL file. 7 | 8 | use std::{ 9 | fs::File, 10 | io::{Seek as _, SeekFrom, Write}, 11 | os::fd::AsRawFd as _, 12 | sync::Arc, 13 | }; 14 | 15 | use crate::io::{FatPage, IoCommand, IoHandle, IoKind}; 16 | 17 | pub(super) fn write_wal(mut wal_fd: &File, wal_blob: &[u8]) -> std::io::Result<()> { 18 | wal_fd.set_len(0)?; 19 | wal_fd.seek(SeekFrom::Start(0))?; 20 | wal_fd.write_all(wal_blob)?; 21 | wal_fd.sync_all()?; 22 | Ok(()) 23 | } 24 | 25 | /// Truncates the WAL file to zero length. 26 | /// 27 | /// Conditionally syncs the file to disk. 
28 | pub(super) fn truncate_wal(mut wal_fd: &File, do_sync: bool) -> std::io::Result<()> { 29 | wal_fd.set_len(0)?; 30 | wal_fd.seek(SeekFrom::Start(0))?; 31 | if do_sync { 32 | wal_fd.sync_all()?; 33 | } 34 | Ok(()) 35 | } 36 | 37 | pub(super) fn write_ht( 38 | io_handle: IoHandle, 39 | ht_fd: &File, 40 | mut ht: Vec<(u64, Arc<FatPage>)>, 41 | ) -> std::io::Result<()> { 42 | let mut sent = 0; 43 | 44 | ht.sort_unstable_by_key(|item| item.0); 45 | for (pn, page) in ht { 46 | io_handle 47 | .send(IoCommand { 48 | kind: IoKind::WriteArc(ht_fd.as_raw_fd(), pn, page), 49 | user_data: 0, 50 | }) 51 | .unwrap(); 52 | sent += 1; 53 | } 54 | 55 | while sent > 0 { 56 | io_handle.recv().unwrap(); 57 | sent -= 1; 58 | } 59 | 60 | ht_fd.sync_all()?; 61 | 62 | Ok(()) 63 | } 64 | -------------------------------------------------------------------------------- /benchtop/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "benchtop" 3 | version = "0.1.0" 4 | authors = ["thrum"] 5 | homepage = "https://thrum.dev" 6 | repository = "https://github.com/thrumdev/nomt" 7 | edition = "2021" 8 | license = "MIT/Apache-2.0" 9 | 10 | [dependencies] 11 | 12 | # benchmarking 13 | clap = { version = "4.4.8", features = ["derive"] } 14 | anyhow = { version = "1.0.75" } 15 | hdrhistogram = "7.5.4" 16 | fxhash = "0.2.1" 17 | rand = "0.8.5" 18 | rand_distr = "0.4.3" 19 | sha2 = { version = "0.10.6" } 20 | ruint = { version = "1.12.1" } 21 | toml = "0.8.12" 22 | serde = "1.0.199" 23 | humantime = "2.1.0" 24 | rayon = "1.10" 25 | lru = "0.12.5" 26 | libc = "0.2.155" 27 | 28 | # sov-db 29 | sov-db = { git = "https://github.com/Sovereign-Labs/sovereign-sdk", optional = true } 30 | sov-schema-db = { git = "https://github.com/Sovereign-Labs/sovereign-sdk", optional = true } 31 | sov-prover-storage-manager = { git = "https://github.com/Sovereign-Labs/sovereign-sdk", optional = true } 32 | jmt = { git = "https://github.com/penumbra-zone/jmt.git", rev
= "1d007e11cb68aa5ca13e9a5af4a12e6439d5f7b6", optional = true } 33 | 34 | # sp-trie 35 | sp-trie = { version = "32.0.0", optional = true } 36 | sp-state-machine = { version = "0.35.0", optional = true } 37 | trie-db = { version = "0.28.0", optional = true } 38 | hash-db = { version = "0.16.0", optional = true } 39 | sp-core = { version = "31.0.0", optional = true } 40 | kvdb = { version = "0.13.0", optional = true } 41 | kvdb-rocksdb = { version = "0.19.0", optional = true } 42 | array-bytes = { version = "6.1", optional = true } 43 | 44 | # nomt 45 | nomt = { path = "../nomt" } 46 | 47 | [profile.release] 48 | debug = true 49 | 50 | [features] 51 | sov-db=["dep:sov-db", "sov-schema-db", "sov-prover-storage-manager", "jmt" ] 52 | sp-trie=["dep:sp-trie", "sp-state-machine", "trie-db", "hash-db", "sp-core", "kvdb", "kvdb-rocksdb", "array-bytes" ] 53 | -------------------------------------------------------------------------------- /nomt/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "nomt" 3 | description = "Nearly Optimal Merkle Trie - Schema and Database" 4 | version = "0.1.0" 5 | authors.workspace = true 6 | homepage.workspace = true 7 | repository.workspace = true 8 | edition.workspace = true 9 | license.workspace = true 10 | 11 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html 12 | 13 | [dependencies] 14 | anyhow = { version = "1.0.81", features = ["backtrace"] } 15 | nomt-core = { path = "../core", default-features = false, features = ["std"] } 16 | parking_lot = { version = "0.12.3", features = ["arc_lock", "send_guard"] } 17 | threadpool = "1.8.1" 18 | bitvec = { version = "1" } 19 | twox-hash = "2.1.0" 20 | fxhash = "0.2.1" 21 | dashmap = "5.5.3" 22 | crossbeam = "0.8.4" 23 | crossbeam-channel = "0.5.13" 24 | slab = "0.4.9" 25 | rand = "0.8.5" 26 | ahash = "0.8.11" 27 | imbl = "3.0.0" 28 | lru = "0.12.3" 29 | libc = "0.2.155" 30 | criterion 
= { version = "0.3", optional = true } 31 | thread_local = "1.1.8" 32 | cfg-if = "1.0.0" 33 | borsh = { version = ">=1.4, <1.5.0", default-features = false, features = ["derive"], optional = true } 34 | 35 | [target.'cfg(target_os="linux")'.dependencies] 36 | io-uring = "0.6.4" 37 | 38 | [target.'cfg(loom)'.dependencies] 39 | loom = { version = "0.7", features = ["checkpoint"] } 40 | 41 | [dev-dependencies] 42 | rand_pcg = "0.3.1" 43 | hex-literal = "0.4" 44 | tempfile = "3.8.1" 45 | criterion = "0.3" 46 | lazy_static = "1.5.0" 47 | hex = "0.4.3" 48 | quickcheck = "1.0.3" 49 | blake3 = "1.5.1" 50 | 51 | [lints.rust] 52 | unexpected_cfgs = { level = "warn", check-cfg = ['cfg(loom)'] } 53 | 54 | [[bench]] 55 | name = "beatree" 56 | harness = false 57 | 58 | [features] 59 | default = ["blake3-hasher", "sha2-hasher"] 60 | benchmarks = ["dep:criterion"] 61 | fuzz = [] 62 | borsh = ["dep:borsh", "nomt-core/borsh"] 63 | blake3-hasher = ["nomt-core/blake3-hasher"] 64 | sha2-hasher = ["nomt-core/sha2-hasher"] 65 | -------------------------------------------------------------------------------- /nomt/tests/add_remove.rs: -------------------------------------------------------------------------------- 1 | mod common; 2 | 3 | use common::Test; 4 | use hex_literal::hex; 5 | use nomt::trie::Node; 6 | 7 | #[test] 8 | fn add_remove_1000() { 9 | let mut accounts = 0; 10 | let mut t = Test::new("add_remove"); 11 | 12 | let expected_roots = [ 13 | hex!("0000000000000000000000000000000000000000000000000000000000000000"), 14 | hex!("4a7a6fe118037086a49ff10484f4d80b0a9f31f1060eeb1c9f0162634604b0d9"), 15 | hex!("7d5b013105d7b835225256f2233a458e1a158a53d20e0d3834886df89a26c27b"), 16 | hex!("1a290e07bcacfb58ddcd0b9da348c740ca1bf87b05ed96752a1503ed7c187b69"), 17 | hex!("5e9abfee6d927b084fed3e1306bbe65f0880d0b7de12522c38813014927f1336"), 18 | hex!("57b39e06b2ee98dccd882033eb4136f5376699128b421c83bdc7c6ca96168938"), 19 | 
hex!("7fd75809ef0e2133102eb5e31e47cb577149dcaebb42cddeb2fd6754256b365f"), 20 | hex!("7c00cb11ec8262385078613e7b7977e50b0751f8cb2384fdccc048eea02acb63"), 21 | hex!("516d6911c3b0a36c9227922ca0273a4aee44886201bd186f7ee7e538a769eaa5"), 22 | hex!("381b24719ff91b13d36cf0dd7622f391f4a461452ed7547a46a992ee4a4025aa"), 23 | hex!("207793e2ce76c1feb68c7259f883229f985706c8cc2fcf99f481b622a54ba375"), 24 | ]; 25 | 26 | let mut root = Node::default(); 27 | for i in 0..10 { 28 | let _ = t.read_id(0); 29 | for _ in 0..100 { 30 | common::set_balance(&mut t, accounts, 1000); 31 | accounts += 1; 32 | } 33 | { 34 | root = t.commit().0.into_inner(); 35 | } 36 | 37 | assert_eq!(root, common::expected_root(accounts)); 38 | assert_eq!(root, expected_roots[i + 1]); 39 | } 40 | 41 | assert_eq!(root, expected_roots[10]); 42 | 43 | for i in 0..10 { 44 | for _ in 0..100 { 45 | accounts -= 1; 46 | common::kill(&mut t, accounts); 47 | } 48 | { 49 | root = t.commit().0.into_inner(); 50 | } 51 | 52 | assert_eq!(root, common::expected_root(accounts)); 53 | assert_eq!(root, expected_roots[10 - i - 1]); 54 | } 55 | } 56 | -------------------------------------------------------------------------------- /nomt/src/bitbox/meta_map.rs: -------------------------------------------------------------------------------- 1 | //! in-memory metadata for each bucket. this is also persisted on disk. 2 | 3 | const EMPTY: u8 = 0b0000_0000; 4 | const TOMBSTONE: u8 = 0b0111_1111; 5 | const FULL_MASK: u8 = 0b1000_0000; 6 | 7 | fn full_entry(hash: u64) -> u8 { 8 | (hash >> 57) as u8 ^ FULL_MASK 9 | } 10 | 11 | pub struct MetaMap { 12 | buckets: usize, 13 | bitvec: Vec<u8>, 14 | } 15 | 16 | impl MetaMap { 17 | // Create a new meta-map from an existing vector.
18 | pub fn from_bytes(meta_bytes: Vec<u8>, buckets: usize) -> Self { 19 | assert_eq!(meta_bytes.len() % 4096, 0); 20 | MetaMap { 21 | buckets, 22 | bitvec: meta_bytes, 23 | } 24 | } 25 | 26 | pub fn full_count(&self) -> usize { 27 | self.bitvec 28 | .iter() 29 | .filter(|&&byte| byte & FULL_MASK != 0) 30 | .count() 31 | } 32 | 33 | pub fn len(&self) -> usize { 34 | self.buckets 35 | } 36 | 37 | pub fn set_full(&mut self, bucket: usize, hash: u64) { 38 | self.bitvec[bucket] = full_entry(hash); 39 | } 40 | 41 | pub fn set_tombstone(&mut self, bucket: usize) { 42 | self.bitvec[bucket] = TOMBSTONE; 43 | } 44 | 45 | // true means definitely empty. 46 | pub fn hint_empty(&self, bucket: usize) -> bool { 47 | self.bitvec[bucket] == EMPTY 48 | } 49 | 50 | // true means definitely a tombstone. 51 | pub fn hint_tombstone(&self, bucket: usize) -> bool { 52 | self.bitvec[bucket] == TOMBSTONE 53 | } 54 | 55 | // returns true if it's definitely not a match. 56 | pub fn hint_not_match(&self, bucket: usize, raw_hash: u64) -> bool { 57 | self.bitvec[bucket] != full_entry(raw_hash) 58 | } 59 | 60 | // get the page index of a bucket in the meta-map. 61 | pub fn page_index(&self, bucket: usize) -> usize { 62 | bucket / 4096 63 | } 64 | 65 | // get a page-sized slice of the metamap. This is guaranteed to have len 4096 66 | pub fn page_slice(&self, page_index: usize) -> &[u8] { 67 | let start = page_index * 4096; 68 | let end = start + 4096; 69 | &self.bitvec[start..end] 70 | } 71 | } 72 | -------------------------------------------------------------------------------- /nomt/src/merkle/page_set.rs: -------------------------------------------------------------------------------- 1 | //! A set of pages that the page walker draws upon and which is filled by `Seek`ing.
2 | 3 | use nomt_core::page_id::PageId; 4 | use std::{collections::HashMap, sync::Arc}; 5 | 6 | use super::BucketInfo; 7 | use crate::{ 8 | io::PagePool, 9 | page_cache::{Page, PageMut}, 10 | }; 11 | 12 | pub struct PageSet { 13 | map: HashMap<PageId, (Page, BucketInfo)>, 14 | warm_up_map: Option<Arc<HashMap<PageId, (Page, BucketInfo)>>>, 15 | page_pool: PagePool, 16 | } 17 | 18 | impl PageSet { 19 | pub fn new(page_pool: PagePool, warmed_up: Option<FrozenSharedPageSet>) -> Self { 20 | PageSet { 21 | map: HashMap::new(), 22 | page_pool, 23 | warm_up_map: warmed_up.map(|x| x.0), 24 | } 25 | } 26 | 27 | /// Insert a page with a known bucket index. 28 | pub fn insert(&mut self, page_id: PageId, page: Page, bucket_info: BucketInfo) { 29 | self.map.insert(page_id, (page, bucket_info)); 30 | } 31 | 32 | /// Freeze this page-set and make a shareable version of it. This returns a frozen page set 33 | /// containing all insertions into this map. 34 | pub fn freeze(self) -> FrozenSharedPageSet { 35 | FrozenSharedPageSet(Arc::new(self.map)) 36 | } 37 | 38 | fn get_warmed_up(&self, page_id: &PageId) -> Option<(Page, BucketInfo)> { 39 | self.warm_up_map 40 | .as_ref() 41 | .and_then(|m| m.get(page_id)) 42 | .map(|(p, b)| (p.clone(), b.clone())) 43 | } 44 | } 45 | 46 | impl super::page_walker::PageSet for PageSet { 47 | fn fresh(&self, page_id: &PageId) -> (PageMut, BucketInfo) { 48 | let page = PageMut::pristine_empty(&self.page_pool, &page_id); 49 | let bucket_info = BucketInfo::Fresh; 50 | 51 | (page, bucket_info) 52 | } 53 | 54 | fn get(&self, page_id: &PageId) -> Option<(Page, BucketInfo)> { 55 | self.map 56 | .get(&page_id) 57 | .map(|(p, bucket_info)| (p.clone(), bucket_info.clone())) 58 | .or_else(|| self.get_warmed_up(page_id)) 59 | } 60 | } 61 | 62 | /// A frozen, shared page set. This is cheap to clone.
63 | #[derive(Clone)] 64 | pub struct FrozenSharedPageSet(Arc<HashMap<PageId, (Page, BucketInfo)>>); 65 | -------------------------------------------------------------------------------- /nomt/src/beatree/index.rs: -------------------------------------------------------------------------------- 1 | //! In-memory index tracking bottom level branch nodes. This is an immutable data structure, 2 | //! which is cheaply cloneable in O(1) and performs COW operations. 3 | 4 | use std::ops::{Bound, RangeBounds}; 5 | use std::sync::Arc; 6 | 7 | use imbl::OrdMap; 8 | 9 | use super::Key; 10 | use crate::beatree::branch::BranchNode; 11 | 12 | #[derive(Default, Clone)] 13 | pub struct Index { 14 | first_key_map: OrdMap<Key, Arc<BranchNode>>, 15 | } 16 | 17 | impl Index { 18 | /// Look up the branch that would store the given key. 19 | /// 20 | /// This is either a branch whose separator is exactly equal to this key or the branch with the 21 | /// highest separator less than the key. 22 | pub fn lookup(&self, key: Key) -> Option<(Key, Arc<BranchNode>)> { 23 | self.first_key_map 24 | .get_prev(&key) 25 | .map(|(sep, b)| (sep.clone(), b.clone())) 26 | } 27 | 28 | /// Get the first separator greater than the given key. 29 | pub fn next_key(&self, key: Key) -> Option<Key> { 30 | self.first_key_map 31 | .range(RangeFromExclusive { start: key }) 32 | .next() 33 | .map(|(k, _)| *k) 34 | } 35 | 36 | /// Remove the branch with the given separator key. 37 | pub fn remove(&mut self, separator: &Key) -> Option<Arc<BranchNode>> { 38 | self.first_key_map.remove(separator) 39 | } 40 | 41 | /// Insert a branch with the given separator key.
42 | pub fn insert(&mut self, separator: Key, branch: Arc<BranchNode>) -> Option<Arc<BranchNode>> { 43 | self.first_key_map.insert(separator, branch) 44 | } 45 | 46 | #[cfg(test)] 47 | pub fn into_iter(self) -> impl Iterator<Item = (Key, Arc<BranchNode>)> { 48 | self.first_key_map.into_iter() 49 | } 50 | } 51 | 52 | struct RangeFromExclusive { 53 | start: Key, 54 | } 55 | 56 | impl RangeBounds<Key> for RangeFromExclusive { 57 | fn start_bound(&self) -> Bound<&Key> { 58 | Bound::Excluded(&self.start) 59 | } 60 | 61 | fn end_bound(&self) -> Bound<&Key> { 62 | Bound::Unbounded 63 | } 64 | 65 | fn contains<U>(&self, item: &U) -> bool 66 | where 67 | U: PartialOrd<Key> + ?Sized, 68 | { 69 | item > &self.start 70 | } 71 | } 72 | -------------------------------------------------------------------------------- /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | name: Build and Test 2 | 3 | on: 4 | push: 5 | branches: 6 | - master 7 | pull_request: 8 | 9 | env: 10 | CARGO_TERM_COLOR: always 11 | 12 | jobs: 13 | nomt_test: 14 | name: NOMT - test 15 | runs-on: ubuntu-latest 16 | env: 17 | # Avoid shrinking the inputs when an error is found in the leaf/branch stage tests.
18 | NO_STAGES_SHRINKING: true 19 | steps: 20 | - uses: actions/checkout@v4 21 | - uses: ./.github/actions/install-fuse 22 | - uses: dtolnay/rust-toolchain@stable 23 | - run: cargo build --verbose --workspace --locked 24 | - run: cargo test --verbose --workspace 25 | benchtop_check: 26 | name: NOMT - check benchtop 27 | runs-on: ubuntu-latest 28 | steps: 29 | - uses: actions/checkout@v4 30 | - uses: ./.github/actions/install-fuse 31 | - uses: dtolnay/rust-toolchain@stable 32 | - run: cargo check --verbose --manifest-path=benchtop/Cargo.toml --locked 33 | loom_rw_pass_cell: 34 | name: NOMT - loom rw_pass_cell 35 | runs-on: ubuntu-latest 36 | steps: 37 | - uses: actions/checkout@v4 38 | - uses: dtolnay/rust-toolchain@stable 39 | - run: RUSTFLAGS="--cfg loom" cargo test -p nomt --release --lib rw_pass_cell 40 | doc: 41 | name: NOMT - doc 42 | runs-on: ubuntu-latest 43 | env: 44 | # Treat rustdoc warnings as errors. 45 | RUSTDOCFLAGS: "-D warnings" 46 | steps: 47 | - uses: actions/checkout@v4 48 | - uses: ./.github/actions/install-fuse 49 | - uses: dtolnay/rust-toolchain@stable 50 | - run: cargo doc --verbose --workspace --document-private-items 51 | fmt: 52 | name: NOMT - fmt 53 | runs-on: ubuntu-latest 54 | steps: 55 | - uses: actions/checkout@v4 56 | - uses: dtolnay/rust-toolchain@stable 57 | - run: cargo fmt --all --check 58 | - run: cargo fmt --manifest-path=benchtop/Cargo.toml --check 59 | darwin_check: 60 | name: NOMT - check darwin target 61 | runs-on: ubuntu-latest 62 | env: 63 | # This is a workaround for the blake3 crate. 64 | CARGO_FEATURE_PURE: 1 65 | steps: 66 | - uses: actions/checkout@v4 67 | - uses: dtolnay/rust-toolchain@stable 68 | with: 69 | targets: x86_64-apple-darwin 70 | # Build only the NOMT crate. Not everything builds cleanly under this configuration. 
71 | - run: cargo check --verbose -p nomt --locked --target x86_64-apple-darwin 72 | -------------------------------------------------------------------------------- /nomt/tests/wal.rs: -------------------------------------------------------------------------------- 1 | mod common; 2 | 3 | use common::Test; 4 | use nomt::PanicOnSyncMode; 5 | 6 | #[test] 7 | fn wal_recovery_test_post_meta_swap() { 8 | // Initialize the db with panic on sync equals true. 9 | let mut t = Test::new_with_params( 10 | "wal_add_remove_1000", 11 | 1, // commit_concurrency, 12 | 1000000, // hashtable_buckets, 13 | Some(PanicOnSyncMode::PostMeta), // panic_on_sync 14 | true, // clean 15 | ); 16 | 17 | common::set_balance(&mut t, 0, 1000); 18 | common::set_balance(&mut t, 1, 2000); 19 | common::set_balance(&mut t, 2, 3000); 20 | 21 | let r = std::panic::catch_unwind(std::panic::AssertUnwindSafe(|| { 22 | t.commit(); 23 | })); 24 | assert!(r.is_err()); 25 | drop(t); 26 | 27 | // Re-open the db without cleaning the DB dir and without panic on sync. 28 | let mut t = Test::new_with_params( 29 | "wal_add_remove_1000", 30 | 1, // commit_concurrency, 31 | 1000000, // hashtable_buckets, 32 | None, // panic_on_sync 33 | false, // clean 34 | ); 35 | assert_eq!(common::read_balance(&mut t, 0), Some(1000)); 36 | assert_eq!(common::read_balance(&mut t, 1), Some(2000)); 37 | assert_eq!(common::read_balance(&mut t, 2), Some(3000)); 38 | } 39 | 40 | #[test] 41 | fn wal_recovery_test_pre_meta_swap() { 42 | // Initialize the db with panic on sync equals true. 
43 | let mut t = Test::new_with_params( 44 | "wal_pre_meta_swap", 45 | 1, // commit_concurrency, 46 | 1000000, // hashtable_buckets, 47 | Some(PanicOnSyncMode::PostWal), // panic_on_sync 48 | true, // clean 49 | ); 50 | 51 | for i in 0..1000 { 52 | common::set_balance(&mut t, i, 1000); 53 | } 54 | 55 | let r = std::panic::catch_unwind(std::panic::AssertUnwindSafe(|| { 56 | t.commit(); 57 | })); 58 | assert!(r.is_err()); 59 | drop(t); 60 | 61 | // Re-open the db without cleaning the DB dir and without panic on sync. 62 | let mut t = Test::new_with_params( 63 | "wal_pre_meta_swap", 64 | 1, // commit_concurrency, 65 | 1000000, // hashtable_buckets, 66 | None, // panic_on_sync 67 | false, // clean 68 | ); 69 | 70 | // DB should open cleanly and not have any incomplete changes; the WAL is too new and will be 71 | // discarded. 72 | for i in 0..1000 { 73 | assert_eq!(common::read_balance(&mut t, i), None); 74 | } 75 | } 76 | -------------------------------------------------------------------------------- /nomt/tests/prev_root_check.rs: -------------------------------------------------------------------------------- 1 | use nomt::{hasher::Blake3Hasher, KeyReadWrite, Nomt, Options, SessionParams}; 2 | use std::path::PathBuf; 3 | 4 | /// Setup a NOMT with the given path, rollback enabled, and the given commit concurrency. 5 | /// 6 | /// It's important that tests that run in parallel don't use the same path. 
7 | fn setup_nomt(path: &str) -> Nomt<Blake3Hasher> { 8 | let path = { 9 | let mut p = PathBuf::from("test"); 10 | p.push(path); 11 | p 12 | }; 13 | if path.exists() { 14 | std::fs::remove_dir_all(&path).unwrap(); 15 | } 16 | let mut o = Options::new(); 17 | o.path(path); 18 | o.commit_concurrency(1); 19 | Nomt::open(o).unwrap() 20 | } 21 | 22 | #[test] 23 | fn test_prev_root_commits() { 24 | let nomt = setup_nomt("prev_root_commits"); 25 | let session1 = nomt.begin_session(SessionParams::default()); 26 | let finished1 = session1 27 | .finish(vec![([1; 32], KeyReadWrite::Write(Some(vec![1, 2, 3])))]) 28 | .unwrap(); 29 | 30 | let session2 = nomt.begin_session(SessionParams::default()); 31 | let finished2 = session2 32 | .finish(vec![([1; 32], KeyReadWrite::Write(Some(vec![1, 2, 3])))]) 33 | .unwrap(); 34 | 35 | finished1.commit(&nomt).unwrap(); 36 | 37 | finished2.commit(&nomt).unwrap_err(); 38 | } 39 | 40 | #[test] 41 | fn test_prev_root_overlay_invalidated() { 42 | let nomt = setup_nomt("prev_root_overlay_invalidated"); 43 | let session1 = nomt.begin_session(SessionParams::default()); 44 | let finished1 = session1 45 | .finish(vec![([1; 32], KeyReadWrite::Write(Some(vec![1, 2, 3])))]) 46 | .unwrap(); 47 | let overlay1 = finished1.into_overlay(); 48 | 49 | let session2 = nomt.begin_session(SessionParams::default()); 50 | let finished2 = session2 51 | .finish(vec![([1; 32], KeyReadWrite::Write(Some(vec![1, 2, 3])))]) 52 | .unwrap(); 53 | 54 | finished2.commit(&nomt).unwrap(); 55 | 56 | overlay1.commit(&nomt).unwrap_err(); 57 | } 58 | 59 | #[test] 60 | fn test_prev_root_overlay_invalidates_session() { 61 | let nomt = setup_nomt("prev_root_overlays"); 62 | let session1 = nomt.begin_session(SessionParams::default()); 63 | let finished1 = session1 64 | .finish(vec![([1; 32], KeyReadWrite::Write(Some(vec![1, 2, 3])))]) 65 | .unwrap(); 66 | let overlay1 = finished1.into_overlay(); 67 | 68 | let session2 = nomt.begin_session(SessionParams::default()); 69 | let finished2 = session2 70 |
.finish(vec![([1; 32], KeyReadWrite::Write(Some(vec![1, 2, 3])))]) 71 | .unwrap(); 72 | 73 | overlay1.commit(&nomt).unwrap(); 74 | 75 | finished2.commit(&nomt).unwrap_err(); 76 | } 77 | -------------------------------------------------------------------------------- /nomt/src/beatree/leaf_cache.rs: -------------------------------------------------------------------------------- 1 | //! The leaf cache stores recently accessed leaf nodes. 2 | 3 | use crate::{ 4 | beatree::{allocator::PageNumber, leaf::node::LeafNode}, 5 | io::PAGE_SIZE, 6 | }; 7 | use lru::LruCache; 8 | use parking_lot::{Mutex, MutexGuard}; 9 | use std::{collections::hash_map::RandomState, hash::BuildHasher, sync::Arc}; 10 | 11 | /// A cache for leaf nodes. 12 | /// 13 | /// This is cheap to clone. 14 | #[derive(Clone)] 15 | pub struct LeafCache { 16 | inner: Arc<Shared>, 17 | } 18 | 19 | impl LeafCache { 20 | /// Create a new cache with the given number of shards and the maximum cache size 21 | /// in MiB. `shards` must be non-zero. 22 | pub fn new(shards: usize, leaf_cache_size: usize) -> Self { 23 | let max_items = (leaf_cache_size * 1024 * 1024) / PAGE_SIZE; 24 | let items_per_shard = max_items / shards; 25 | LeafCache { 26 | inner: Arc::new(Shared { 27 | shards: (0..shards) 28 | .map(|_| Shard { 29 | cache: LruCache::unbounded(), 30 | max_items: items_per_shard, 31 | }) 32 | .map(Mutex::new) 33 | .collect::<Vec<_>>(), 34 | shard_assigner: RandomState::new(), 35 | }), 36 | } 37 | } 38 | 39 | /// Get a cache entry, updating the LRU state. 40 | pub fn get(&self, page_number: PageNumber) -> Option<Arc<LeafNode>> { 41 | let mut shard = self.inner.shard_for(page_number); 42 | 43 | shard.cache.get(&page_number).map(|x| x.clone()) 44 | } 45 | 46 | /// Insert a cache entry. This does not evict anything. 47 | pub fn insert(&self, page_number: PageNumber, node: Arc<LeafNode>) { 48 | let mut shard = self.inner.shard_for(page_number); 49 | 50 | shard.cache.put(page_number, node); 51 | } 52 | 53 | /// Evict all excess items from the cache.
54 | pub fn evict(&self) { 55 | for shard in &self.inner.shards { 56 | let mut shard = shard.lock(); 57 | while shard.cache.len() > shard.max_items { 58 | let _ = shard.cache.pop_lru(); 59 | } 60 | } 61 | } 62 | } 63 | 64 | struct Shared { 65 | shards: Vec<Mutex<Shard>>, 66 | shard_assigner: RandomState, 67 | } 68 | 69 | impl Shared { 70 | fn shard_for(&self, page_number: PageNumber) -> MutexGuard<'_, Shard> { 71 | self.shards[self.shard_index_for(page_number)].lock() 72 | } 73 | 74 | fn shard_index_for(&self, page_number: PageNumber) -> usize { 75 | (self.shard_assigner.hash_one(page_number.0) as usize) % self.shards.len() 76 | } 77 | } 78 | 79 | struct Shard { 80 | cache: LruCache<PageNumber, Arc<LeafNode>>, 81 | max_items: usize, 82 | } 83 | -------------------------------------------------------------------------------- /.github/workflows/bench.yml: -------------------------------------------------------------------------------- 1 | name: Benchtop 2 | 3 | on: 4 | push: 5 | branches: 6 | - master 7 | pull_request: 8 | 9 | env: 10 | CARGO_TERM_COLOR: always 11 | 12 | jobs: 13 | bench: 14 | name: NOMT - run benchtop 15 | runs-on: ubuntu-latest 16 | env: 17 | SIZE: 22 18 | BUCKETS: 4000000 19 | RUST_BACKTRACE: 1 20 | steps: 21 | - name: Free Disk Space (Ubuntu) 22 | uses: jlumbroso/free-disk-space@main 23 | with: 24 | tool-cache: false 25 | android: true 26 | dotnet: true 27 | haskell: true 28 | large-packages: true 29 | docker-images: true 30 | swap-storage: true 31 | - uses: actions/checkout@v4 32 | - run: | 33 | # Install required dependencies 34 | sudo apt-get update 35 | sudo apt-get install -y libclang-dev 36 | - run: df -h / 37 | - run: | 38 | # First build the binary 39 | cargo build --release --verbose --manifest-path=benchtop/Cargo.toml 40 | 41 | # Verify binary exists before proceeding 42 | if [ !
-f "benchtop/target/release/benchtop" ]; then 43 | echo "Binary not found at benchtop/target/release/benchtop" 44 | exit 1 45 | fi 46 | 47 | # Create directories first to avoid potential issues 48 | mkdir -p /tmp 49 | 50 | # Save our binary 51 | cp benchtop/target/release/benchtop /tmp/benchtop 52 | 53 | # Verify copy succeeded 54 | if [ ! -f "/tmp/benchtop" ]; then 55 | echo "Failed to copy binary to /tmp" 56 | exit 1 57 | fi 58 | 59 | # Now safe to clean up 60 | cargo clean 61 | rm -rf ~/.cargo/registry 62 | rm -rf ~/.cargo/git 63 | rm -rf ~/.rustup 64 | 65 | # Create target directory after cleanup 66 | mkdir -p target/release 67 | 68 | # Move binary to final location 69 | mv /tmp/benchtop target/release/benchtop 70 | 71 | # Final verification 72 | if [ ! -f "target/release/benchtop" ] || [ ! -x "target/release/benchtop" ]; then 73 | echo "Final binary is missing or not executable" 74 | exit 1 75 | fi 76 | 77 | # Make absolutely sure it's executable 78 | chmod +x target/release/benchtop 79 | 80 | - run: >- 81 | ./target/release/benchtop init 82 | -b nomt 83 | -c $SIZE 84 | -w transfer 85 | --buckets $BUCKETS 86 | - run: >- 87 | ./target/release/benchtop run 88 | -w transfer 89 | -b nomt 90 | -s 10000 91 | -c $SIZE 92 | --time-limit 30s 93 | --workload-concurrency 6 94 | -------------------------------------------------------------------------------- /nomt/tests/witness_check.rs: -------------------------------------------------------------------------------- 1 | mod common; 2 | 3 | use common::Test; 4 | use nomt::{hasher::Blake3Hasher, proof, trie::LeafData}; 5 | 6 | #[test] 7 | fn produced_witness_validity() { 8 | let mut accounts = 0; 9 | let mut t = Test::new("witness_validity"); 10 | 11 | let (prev_root, _) = { 12 | for _ in 0..10 { 13 | common::set_balance(&mut t, accounts, 1000); 14 | accounts += 1; 15 | } 16 | t.commit() 17 | }; 18 | 19 | let (new_root, witness) = { 20 | // read all existing accounts. 
21 | for i in 0..accounts { 22 | t.read_id(i); 23 | } 24 | 25 | // read some nonexistent accounts. 26 | for i in 100..105 { 27 | t.read_id(i); 28 | } 29 | 30 | // kill half the existing ones. 31 | for i in 0..5 { 32 | common::kill(&mut t, i); 33 | } 34 | 35 | // and add 5 more. 36 | for _ in 0..5 { 37 | common::set_balance(&mut t, accounts, 1000); 38 | accounts += 1; 39 | } 40 | t.commit() 41 | }; 42 | 43 | assert_eq!(witness.operations.reads.len(), 15); // 10 existing + 5 nonexistent 44 | assert_eq!(witness.operations.writes.len(), 10); // 5 deletes + 5 inserts 45 | 46 | let mut updates = Vec::new(); 47 | for (i, witnessed_path) in witness.path_proofs.iter().enumerate() { 48 | let verified = witnessed_path 49 | .inner 50 | .verify::<Blake3Hasher>(&witnessed_path.path.path(), prev_root.into_inner()) 51 | .unwrap(); 52 | for read in witness 53 | .operations 54 | .reads 55 | .iter() 56 | .skip_while(|r| r.path_index != i) 57 | .take_while(|r| r.path_index == i) 58 | { 59 | match read.value { 60 | None => assert!(verified.confirm_nonexistence(&read.key).unwrap()), 61 | Some(ref v) => { 62 | let leaf = LeafData { 63 | key_path: read.key, 64 | value_hash: *v, 65 | }; 66 | assert!(verified.confirm_value(&leaf).unwrap()); 67 | } 68 | } 69 | } 70 | 71 | let mut write_ops = Vec::new(); 72 | for write in witness 73 | .operations 74 | .writes 75 | .iter() 76 | .skip_while(|r| r.path_index != i) 77 | .take_while(|r| r.path_index == i) 78 | { 79 | write_ops.push((write.key, write.value.clone())); 80 | } 81 | 82 | if !write_ops.is_empty() { 83 | updates.push(proof::PathUpdate { 84 | inner: verified, 85 | ops: write_ops, 86 | }); 87 | } 88 | } 89 | 90 | assert_eq!( 91 | proof::verify_update::<Blake3Hasher>(prev_root.into_inner(), &updates).unwrap(), 92 | new_root.into_inner(), 93 | ); 94 | } 95 | -------------------------------------------------------------------------------- /nomt/tests/extend_range_protocol.rs: -------------------------------------------------------------------------------- 1 | mod
common; 2 | use common::Test; 3 | use std::path::Path; 4 | 5 | // nomt::beatree::branch::LEAF_NODE_BODY_SIZE is 6 | // expected to be 4096 and thus the merge threshold is 2047. 7 | // 8 | // This parameter makes it possible to define the following vector of 9 | // keys and values whose size, when inserted into the database, will result 10 | // in the expected set of leaves. Each line adheres to the half full 11 | // requirement, and the first element of the next row does not fit 12 | // in the previous leaf, requiring a new one. The last row does not 13 | // need to meet the half full requirement, as it may be the rightmost leaf. 14 | #[rustfmt::skip] 15 | const KEYS_AND_VALUE_SIZES: [(u8, usize); 16] =[ 16 | // leaf 1 17 | (1, 1100), (2, 1000), (3, 1000), 18 | // leaf 2 19 | (4, 900), (5, 900), (7, 900), (8, 900), 20 | // leaf 3 21 | (10, 1200), (11, 1100), (13, 700), 22 | // leaf 4 23 | (15, 1300), (16, 1100), (17, 700), 24 | // leaf 5 25 | (18, 1100), (19, 1000), (20, 500), 26 | ]; 27 | 28 | // 2 update workers will be used and the first half of `to_delete` items 29 | // which fall under the same set of leaves are assigned to the first worker 30 | // and all the remaining keys to the next worker. 
This makes it possible 31 | // to predict the type of communication between the two workers. 32 | fn insert_delete_and_read(name: impl AsRef<Path>, to_delete: Vec<u8>) { 33 | let mut t = Test::new_with_params(name, 2, 64_000, None, true); 34 | 35 | // insert values 36 | for (k, value_size) in KEYS_AND_VALUE_SIZES.clone() { 37 | t.write(key(k), Some(vec![k; value_size])); 38 | } 39 | t.commit(); 40 | 41 | // delete values 42 | for k in to_delete.clone() { 43 | t.write(key(k), None); 44 | } 45 | t.commit(); 46 | 47 | // read values 48 | for (k, value_size) in KEYS_AND_VALUE_SIZES.clone() { 49 | if to_delete.contains(&k) { 50 | let res = t.read(key(k)); 51 | assert_eq!(None, res); 52 | } else { 53 | let value = vec![k; value_size]; 54 | let res = t.read(key(k)); 55 | assert_eq!(Some(value), res); 56 | } 57 | } 58 | } 59 | 60 | fn key(id: u8) -> [u8; 32] { 61 | let mut key = [0; 32]; 62 | key[0] = id; 63 | key 64 | } 65 | 66 | #[test] 67 | fn extend_range_protocol_underfull_to_degenerate_split() { 68 | insert_delete_and_read("underfull_to_degenerate_split", vec![7, 8, 13]) 69 | } 70 | 71 | #[test] 72 | fn extend_range_protocol_final_unchanged_range() { 73 | insert_delete_and_read("final_unchanged_range", vec![7, 8, 10, 11, 13]) 74 | } 75 | 76 | #[test] 77 | fn extend_range_protocol_unchanged_range_to_changed() { 78 | insert_delete_and_read("unchanged_range_to_changed", vec![7, 8, 10, 11, 13, 20]) 79 | } 80 | 81 | #[test] 82 | fn extend_range_protocol_remove_cutoff() { 83 | insert_delete_and_read( 84 | "remove_cutoff", 85 | vec![7, 8, 10, 11, 13, 15, 16, 17, 18, 19, 20], 86 | ); 87 | } 88 | -------------------------------------------------------------------------------- /examples/commit_batch/src/lib.rs: -------------------------------------------------------------------------------- 1 | use anyhow::Result; 2 | use nomt::{ 3 | hasher::Blake3Hasher, KeyReadWrite, Nomt, Options, Root, SessionParams, Witness, WitnessMode, 4 | }; 5 | use sha2::Digest; 6 | 7 | const NOMT_DB_FOLDER: &str =
"nomt_db"; 8 | 9 | pub struct NomtDB; 10 | 11 | impl NomtDB { 12 | pub fn commit_batch() -> Result<(Root, Root, Witness)> { 13 | // Define the options used to open NOMT 14 | let mut opts = Options::new(); 15 | opts.path(NOMT_DB_FOLDER); 16 | opts.commit_concurrency(1); 17 | 18 | // Open NOMT database, it will create the folder if it does not exist 19 | let nomt = Nomt::::open(opts)?; 20 | 21 | // Create a new Session object 22 | // 23 | // During a session, the backend is responsible for returning read keys 24 | // and receiving hints about future writes 25 | // 26 | // Writes do not occur immediately, instead, 27 | // they are cached and applied all at once later on 28 | let session = 29 | nomt.begin_session(SessionParams::default().witness_mode(WitnessMode::read_write())); 30 | 31 | // Here we will move the data saved under b"key1" to b"key2" and deletes it 32 | // 33 | // NOMT expects keys to be uniformly distributed across the key space 34 | let key_path_1 = sha2::Sha256::digest(b"key1").into(); 35 | let key_path_2 = sha2::Sha256::digest(b"key2").into(); 36 | 37 | // First, read what is under key_path_1 38 | // 39 | // `read` will immediately return the value present in the database 40 | let value = session.read(key_path_1)?; 41 | 42 | // We are going to perform writes on both key-paths, so we have NOMT warm up the on-disk 43 | // data for both. 
44 | session.warm_up(key_path_1); 45 | session.warm_up(key_path_2); 46 | 47 | // Retrieve the previous value of the root before committing changes 48 | let prev_root = nomt.root(); 49 | 50 | // To commit the batch to the backend we need to collect every 51 | // performed action into a vector where items are ordered by the key_path 52 | let mut actual_access: Vec<_> = vec![ 53 | (key_path_1, KeyReadWrite::ReadThenWrite(value.clone(), None)), 54 | (key_path_2, KeyReadWrite::Write(value)), 55 | ]; 56 | actual_access.sort_by_key(|(k, _)| *k); 57 | 58 | // The final step in handling a session involves committing all changes 59 | // to update the trie structure and obtaining the new root of the trie, 60 | // along with a witness and the witnessed operations. 61 | let mut finished = session.finish(actual_access).unwrap(); 62 | 63 | // This field is set because the finished session was configured with 64 | // `WitnessMode::read_write`. 65 | let witness = finished.take_witness().unwrap(); 66 | let root = finished.root(); 67 | finished.commit(&nomt)?; 68 | 69 | Ok((prev_root, root, witness)) 70 | } 71 | } 72 | -------------------------------------------------------------------------------- /nomt/src/store/sync.rs: -------------------------------------------------------------------------------- 1 | use nomt_core::page_id::PageId; 2 | 3 | use super::{ 4 | meta::{self, Meta}, 5 | DirtyPage, Shared, 6 | }; 7 | use crate::{beatree, bitbox, options::PanicOnSyncMode, page_cache::PageCache, rollback}; 8 | 9 | pub struct Sync { 10 | pub(crate) sync_seqn: u32, 11 | pub(crate) bitbox_num_pages: u32, 12 | pub(crate) bitbox_seed: [u8; 16], 13 | pub(crate) panic_on_sync: Option<PanicOnSyncMode>, 14 | } 15 | 16 | impl Sync { 17 | pub fn new( 18 | sync_seqn: u32, 19 | bitbox_num_pages: u32, 20 | bitbox_seed: [u8; 16], 21 | panic_on_sync: Option<PanicOnSyncMode>, 22 | ) -> Self { 23 | Self { 24 | sync_seqn, 25 | bitbox_num_pages, 26 | bitbox_seed, 27 | panic_on_sync, 28 | } 29 | } 30 | 31 | pub fn sync( 32 | &mut self, 33 |
shared: &Shared, 34 | value_tx: impl IntoIterator + Send + 'static, 35 | bitbox: bitbox::DB, 36 | beatree: beatree::Tree, 37 | rollback: Option<rollback::Rollback>, 38 | page_cache: PageCache, 39 | updated_pages: impl IntoIterator + Send + 'static, 40 | ) -> anyhow::Result<()> { 41 | let sync_seqn = self.sync_seqn + 1; 42 | 43 | let mut bitbox_sync = bitbox.sync(); 44 | let mut beatree_sync = beatree.sync(); 45 | let mut rollback_sync = rollback.map(|rollback| rollback.sync()); 46 | 47 | bitbox_sync.begin_sync(sync_seqn, page_cache, updated_pages); 48 | beatree_sync.begin_sync(value_tx); 49 | let (rollback_start_live, rollback_end_live) = match rollback_sync { 50 | Some(ref mut rollback) => rollback.begin_sync(), 51 | None => (0, 0), 52 | }; 53 | 54 | bitbox_sync.wait_pre_meta()?; 55 | let beatree_meta_wd = beatree_sync.wait_pre_meta()?; 56 | 57 | if let Some(PanicOnSyncMode::PostWal) = self.panic_on_sync { 58 | panic!("panic_on_sync is true (post-wal)") 59 | } 60 | 61 | let new_meta = Meta { 62 | magic: meta::MAGIC, 63 | version: meta::VERSION, 64 | ln_freelist_pn: beatree_meta_wd.ln_freelist_pn, 65 | ln_bump: beatree_meta_wd.ln_bump, 66 | bbn_freelist_pn: beatree_meta_wd.bbn_freelist_pn, 67 | bbn_bump: beatree_meta_wd.bbn_bump, 68 | sync_seqn, 69 | bitbox_num_pages: self.bitbox_num_pages, 70 | bitbox_seed: self.bitbox_seed, 71 | rollback_start_live, 72 | rollback_end_live, 73 | }; 74 | Meta::write(&shared.io_pool.page_pool(), &shared.meta_fd, &new_meta)?; 75 | self.sync_seqn += 1; 76 | 77 | if let Some(PanicOnSyncMode::PostMeta) = self.panic_on_sync { 78 | panic!("panic_on_sync is true (post-meta)"); 79 | } 80 | 81 | if let Some(ref mut rollback) = rollback_sync { 82 | rollback.post_meta(); 83 | } 84 | 85 | bitbox_sync.post_meta(shared.io_pool.make_handle())?; 86 | beatree_sync.post_meta(); 87 | 88 | if let Some(ref rollback) = rollback_sync { 89 | rollback.wait_post_meta()?; 90 | } 91 | Ok(()) 92 | } 93 | } 94 |
-------------------------------------------------------------------------------- /nomt/src/merkle/cache_prepopulate.rs: -------------------------------------------------------------------------------- 1 | //! Utility for prepopulating the first N layers of the cache. 2 | 3 | use std::io; 4 | 5 | use crate::{ 6 | io::IoHandle, 7 | page_cache::{PageCache, PageMut}, 8 | store::{PageLoad, PageLoader, Store}, 9 | }; 10 | 11 | use nomt_core::page_id::{ChildPageIndex, PageId, MAX_PAGE_DEPTH, NUM_CHILDREN, ROOT_PAGE_ID}; 12 | 13 | /// Prepopulate the given number of levels of the page tree into the page cache. 14 | /// 15 | /// This function blocks until the prepopulation has finished. 16 | pub fn prepopulate( 17 | io_handle: IoHandle, 18 | page_cache: &PageCache, 19 | store: &Store, 20 | levels: usize, 21 | ) -> io::Result<()> { 22 | let page_loader = store.page_loader(); 23 | let mut loads = Vec::new(); 24 | 25 | let levels = std::cmp::min(levels, MAX_PAGE_DEPTH); 26 | 27 | // dispatch all page loads recursively. 28 | dispatch_recursive(ROOT_PAGE_ID, &page_loader, &io_handle, &mut loads, levels)?; 29 | 30 | let mut completed = 0; 31 | 32 | // wait on I/O results. 33 | while completed < loads.len() { 34 | // UNWRAP: we don't expect the I/O pool to go down. fatal error. 35 | let complete_io = io_handle.recv().expect("I/O Pool Down"); 36 | complete_io.result?; 37 | let load_index = complete_io.command.user_data as usize; 38 | let load = &mut loads[load_index]; 39 | 40 | // UNWRAP: all submitted requests are of kind Read(FatPage). 41 | if let Some((page, bucket)) = load.try_complete(complete_io.command.kind.unwrap_buf()) { 42 | completed += 1; 43 | page_cache.insert( 44 | load.page_id().clone(), 45 | PageMut::pristine_with_data(page).freeze(), 46 | bucket, 47 | ); 48 | } else { 49 | // misprobe. try again. 50 | if !page_loader.probe(load, &io_handle, complete_io.command.user_data) { 51 | // guaranteed empty. 
52 | completed += 1; 53 | } 54 | } 55 | } 56 | 57 | Ok(()) 58 | } 59 | 60 | // dispatch page loads for all the children of the given page. 61 | fn dispatch_recursive( 62 | page_id: PageId, 63 | page_loader: &PageLoader, 64 | io_handle: &IoHandle, 65 | loads: &mut Vec<PageLoad>, 66 | levels_remaining: usize, 67 | ) -> io::Result<()> { 68 | if levels_remaining == 0 { 69 | return Ok(()); 70 | } 71 | 72 | for child_index in 0..NUM_CHILDREN { 73 | // UNWRAP: all indices up to NUM_CHILDREN are allowed. 74 | let child_index = ChildPageIndex::new(child_index as u8).unwrap(); 75 | 76 | // UNWRAP: depth is not out of bounds and child index is valid. 77 | let child_page_id = page_id.child_page_id(child_index).unwrap(); 78 | 79 | let mut page_load = page_loader.start_load(child_page_id.clone()); 80 | 81 | let next_index = loads.len() as u64; 82 | if page_loader.probe(&mut page_load, io_handle, next_index) { 83 | // probe has been dispatched. 84 | loads.push(page_load); 85 | dispatch_recursive( 86 | child_page_id, 87 | page_loader, 88 | io_handle, 89 | loads, 90 | levels_remaining - 1, 91 | )?; 92 | } 93 | } 94 | 95 | Ok(()) 96 | } 97 | -------------------------------------------------------------------------------- /examples/witness_verification/src/main.rs: -------------------------------------------------------------------------------- 1 | use anyhow::Result; 2 | use nomt_core::{hasher::Blake3Hasher, proof, trie::LeafData}; 3 | 4 | fn main() -> Result<()> { 5 | // The witness produced in the example `commit_batch` will be used 6 | let (prev_root, new_root, witness) = commit_batch::NomtDB::commit_batch().unwrap(); 7 | 8 | let mut updates = Vec::new(); 9 | 10 | // A witness is composed of multiple WitnessedPath objects, 11 | // which store all the necessary information to verify the operations 12 | // performed on the same path 13 | for (i, witnessed_path) in witness.path_proofs.iter().enumerate() { 14 | // Constructing the verified operations 15 | let verified = witnessed_path 16 |
.inner 17 | .verify::<Blake3Hasher>(&witnessed_path.path.path(), prev_root.into_inner()) 18 | .unwrap(); 19 | 20 | // Among all read operations performed, the ones that interact 21 | // with the current verified path are selected 22 | // 23 | // Each witnessed operation contains an index to the path it needs to be verified against 24 | // 25 | // This information could already be known if we committed the batch initially, 26 | // and thus, the witnessed field could be discarded entirely. 27 | for read in witness 28 | .operations 29 | .reads 30 | .iter() 31 | .skip_while(|r| r.path_index != i) 32 | .take_while(|r| r.path_index == i) 33 | { 34 | match read.value { 35 | // Check for non-existence if the return value was None 36 | None => assert!(verified.confirm_nonexistence(&read.key).unwrap()), 37 | // Verify the correctness of the returned value when it is Some(_) 38 | Some(ref v) => { 39 | let leaf = LeafData { 40 | key_path: read.key, 41 | value_hash: *blake3::hash(v).as_bytes(), 42 | }; 43 | assert!(verified.confirm_value(&leaf).unwrap()); 44 | } 45 | } 46 | } 47 | 48 | // The correctness of write operations cannot be easily verified like reads. 49 | // Write operations need to be collected. 50 | // All writes that have worked on shared prefixes, 51 | // such as the witnessed_path, need to be bundled together.
52 | // Later, it needs to be verified that all these writes bring 53 | // the new trie to the expected state 54 | let mut write_ops = Vec::new(); 55 | for write in witness 56 | .operations 57 | .writes 58 | .iter() 59 | .skip_while(|r| r.path_index != i) 60 | .take_while(|r| r.path_index == i) 61 | { 62 | write_ops.push(( 63 | write.key, 64 | write.value.as_ref().map(|v| *blake3::hash(v).as_bytes()), 65 | )); 66 | } 67 | 68 | if !write_ops.is_empty() { 69 | updates.push(proof::PathUpdate { 70 | inner: verified, 71 | ops: write_ops, 72 | }); 73 | } 74 | } 75 | 76 | assert_eq!( 77 | proof::verify_update::<Blake3Hasher>(prev_root.into_inner(), &updates).unwrap(), 78 | new_root.into_inner(), 79 | ); 80 | 81 | Ok(()) 82 | } 83 | -------------------------------------------------------------------------------- /nomt/src/bitbox/wal/tests.rs: -------------------------------------------------------------------------------- 1 | use super::{WalBlobBuilder, WalBlobReader, WalEntry}; 2 | use crate::{io::page_pool::PagePool, page_diff::PageDiff}; 3 | use std::{fs::OpenOptions, io::Write as _}; 4 | 5 | #[test] 6 | fn test_write_read() { 7 | let tempdir = tempfile::tempdir().unwrap(); 8 | let wal_filename = tempdir.path().join("wal"); 9 | std::fs::create_dir_all(tempdir.path()).unwrap(); 10 | let mut wal_fd = { 11 | let mut options = OpenOptions::new(); 12 | options.read(true).write(true).create(true); 13 | options.open(&wal_filename).unwrap() 14 | }; 15 | 16 | let mut builder = WalBlobBuilder::new().unwrap(); 17 | builder.reset(69); 18 | builder.write_clear(0); 19 | builder.write_update( 20 | [0; 32], 21 | &PageDiff::from_bytes(hex_literal::hex!( 22 | "00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00" 23 | )) 24 | .unwrap(), 25 | vec![].into_iter(), 26 | 0, 27 | ); 28 | builder.write_clear(1); 29 | builder.write_update( 30 | [1; 32], 31 | &PageDiff::from_bytes(hex_literal::hex!( 32 | "01 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00" 33 | )) 34 | .unwrap(), 35 | vec![[1; 32]].into_iter(), 36 |
1, 37 | ); 38 | builder.write_update( 39 | [2; 32], 40 | &{ 41 | let mut diff = PageDiff::default(); 42 | for i in 0..126 { 43 | diff.set_changed(i); 44 | } 45 | diff 46 | }, 47 | (0..126).map(|x| [x; 32]), 48 | 2, 49 | ); 50 | builder.finalize(); 51 | wal_fd.write_all(builder.as_slice()).unwrap(); 52 | wal_fd.sync_data().unwrap(); 53 | 54 | let page_pool = PagePool::new(); 55 | let mut reader = WalBlobReader::new(&page_pool, &wal_fd).unwrap(); 56 | 57 | assert_eq!(reader.sync_seqn(), 69); 58 | assert_eq!( 59 | reader.read_entry().unwrap(), 60 | Some(WalEntry::Clear { bucket: 0 }) 61 | ); 62 | assert_eq!( 63 | reader.read_entry().unwrap(), 64 | Some(WalEntry::Update { 65 | page_id: [0; 32], 66 | page_diff: PageDiff::default(), 67 | changed_nodes: vec![], 68 | bucket: 0, 69 | }) 70 | ); 71 | assert_eq!( 72 | reader.read_entry().unwrap(), 73 | Some(WalEntry::Clear { bucket: 1 }) 74 | ); 75 | assert_eq!( 76 | reader.read_entry().unwrap(), 77 | Some(WalEntry::Update { 78 | page_id: [1; 32], 79 | page_diff: { 80 | let mut diff = PageDiff::default(); 81 | diff.set_changed(0); 82 | diff 83 | }, 84 | changed_nodes: vec![[1; 32]], 85 | bucket: 1, 86 | }) 87 | ); 88 | assert_eq!( 89 | reader.read_entry().unwrap(), 90 | Some(WalEntry::Update { 91 | page_id: [2; 32], 92 | page_diff: { 93 | let mut diff = PageDiff::default(); 94 | for i in 0..126 { 95 | diff.set_changed(i); 96 | } 97 | diff 98 | }, 99 | changed_nodes: (0..126).map(|x| [x; 32]).collect(), 100 | bucket: 2, 101 | }) 102 | ); 103 | assert_eq!(reader.read_entry().unwrap(), None); 104 | } 105 | -------------------------------------------------------------------------------- /nomt/src/io/unix.rs: -------------------------------------------------------------------------------- 1 | use super::{CompleteIo, IoCommand, IoKind, IoKindResult, IoPacket, PagePool, PAGE_SIZE}; 2 | use crossbeam_channel::{Receiver, Sender}; 3 | use threadpool::ThreadPool; 4 | 5 | pub fn start_io_worker( 6 | page_pool: PagePool, 7 | 
io_workers_tp: &ThreadPool, 8 | io_workers: usize, 9 | ) -> Sender<IoPacket> { 10 | let (command_tx, command_rx) = crossbeam_channel::unbounded(); 11 | 12 | for _ in 0..io_workers { 13 | spawn_worker_thread(page_pool.clone(), io_workers_tp, command_rx.clone()); 14 | } 15 | 16 | command_tx 17 | } 18 | 19 | fn spawn_worker_thread( 20 | page_pool: PagePool, 21 | io_workers_tp: &ThreadPool, 22 | command_rx: Receiver<IoPacket>, 23 | ) { 24 | let work = move || loop { 25 | let Ok(packet) = command_rx.recv() else { 26 | // Why the `drop` here? 27 | // 28 | // `command_rx` receives the `IoPacket`s, which are ultimately parameterized by buffers. 29 | // Those buffers are allocated in the `page_pool`. If the `page_pool` is deallocated 30 | // before this worker thread is done, that's a use-after-free. 31 | // 32 | // So in other words, we plumb `page_pool` all the way here and drop it here only to 33 | // ensure safety. 34 | drop(page_pool); 35 | return; 36 | }; 37 | let complete = execute(packet.command); 38 | let _ = packet.completion_sender.send(complete); 39 | }; 40 | 41 | io_workers_tp.execute(work); 42 | } 43 | 44 | fn execute(mut command: IoCommand) -> CompleteIo { 45 | let result = loop { 46 | let res = match command.kind { 47 | IoKind::Read(fd, page_index, ref mut page) => unsafe { 48 | libc::pread( 49 | fd, 50 | page.as_mut_ptr() as *mut libc::c_void, 51 | PAGE_SIZE as libc::size_t, 52 | (page_index * PAGE_SIZE as u64) as libc::off_t, 53 | ) 54 | }, 55 | IoKind::Write(fd, page_index, ref page) => unsafe { 56 | libc::pwrite( 57 | fd, 58 | page.as_ptr() as *const libc::c_void, 59 | PAGE_SIZE as libc::size_t, 60 | (page_index * PAGE_SIZE as u64) as libc::off_t, 61 | ) 62 | }, 63 | IoKind::WriteArc(fd, page_index, ref page) => unsafe { 64 | let page: &[u8] = &*page; 65 | libc::pwrite( 66 | fd, 67 | page.as_ptr() as *const libc::c_void, 68 | PAGE_SIZE as libc::size_t, 69 | (page_index * PAGE_SIZE as u64) as libc::off_t, 70 | ) 71 | }, 72 | IoKind::WriteRaw(fd, page_index, ref mut page) =>
unsafe { 73 | libc::pwrite( 74 | fd, 75 | page.as_ptr() as *const libc::c_void, 76 | PAGE_SIZE as libc::size_t, 77 | (page_index * PAGE_SIZE as u64) as libc::off_t, 78 | ) 79 | }, 80 | }; 81 | match command.kind.get_result(res) { 82 | IoKindResult::Ok => break Ok(()), 83 | IoKindResult::Err => break Err(std::io::Error::last_os_error()), 84 | IoKindResult::Retry => (), 85 | } 86 | }; 87 | 88 | CompleteIo { command, result } 89 | } 90 | -------------------------------------------------------------------------------- /nomt/tests/fill_and_empty.rs: -------------------------------------------------------------------------------- 1 | mod common; 2 | use common::Test; 3 | use rand::{prelude::SliceRandom, Rng, SeedableRng}; 4 | use std::time::{SystemTime, UNIX_EPOCH}; 5 | 6 | fn seed() -> [u8; 16] { 7 | SystemTime::now() 8 | .duration_since(UNIX_EPOCH) 9 | .expect("no time?") 10 | .as_nanos() 11 | .to_le_bytes()[0..16] 12 | .try_into() 13 | .unwrap() 14 | } 15 | 16 | fn fill_and_empty(seed: [u8; 16], commit_concurrency: usize) { 17 | let mut rng = rand_pcg::Lcg64Xsh32::from_seed(seed); 18 | 19 | let db_size = 1 << 12; 20 | let commit_size = db_size / 16; 21 | 22 | let mut items = std::collections::BTreeSet::new(); 23 | while items.len() < db_size as usize { 24 | items.insert(rand_key(&mut rng)); 25 | } 26 | let mut items: Vec<_> = items.into_iter().collect(); 27 | items.shuffle(&mut rng); 28 | 29 | let mut to_delete: Vec<usize> = (0..db_size as usize).collect(); 30 | to_delete.shuffle(&mut rng); 31 | 32 | let mut t = Test::new_with_params( 33 | format!("fill_and_empty_{}", commit_concurrency), // name 34 | commit_concurrency, 35 | 15000, // hashtable_buckets 36 | None, // panic_on_sync 37 | true, // cleanup_dir 38 | ); 39 | 40 | // inserting all the values 41 | let mut to_check = vec![]; 42 | for i in 0..db_size { 43 | let key = items[i]; 44 | let value = vec![i as u8; 400]; 45 | 46 | to_check.push((key, value.clone())); 47 | t.write(key, Some(value)); 48 | 49 | if (i + 1) %
commit_size == 0 { 50 | t.commit(); 51 | // check for presence 52 | for (key, value) in to_check.drain(..) { 53 | assert_eq!(t.read(key), Some(value)); 54 | } 55 | } 56 | } 57 | 58 | // deleting all the values in different order 59 | let mut to_check = vec![]; 60 | for i in 0..db_size { 61 | let key = items[to_delete[i]]; 62 | 63 | to_check.push(key); 64 | t.write(key, None); 65 | 66 | if (i + 1) % commit_size == 0 { 67 | t.commit(); 68 | // check for absence 69 | for key in to_check.drain(..) { 70 | assert_eq!(t.read(key), None); 71 | } 72 | } 73 | } 74 | 75 | assert!(t.commit().0.is_empty()); 76 | } 77 | 78 | fn rand_key(rng: &mut impl Rng) -> [u8; 32] { 79 | let mut key = [0; 32]; 80 | rng.fill(&mut key[..]); 81 | key 82 | } 83 | 84 | #[test] 85 | fn fill_and_empty_1_commit_worker() { 86 | let seed = seed(); 87 | let test_result = std::panic::catch_unwind(|| { 88 | fill_and_empty(seed, 1); 89 | }); 90 | if let Err(cause) = test_result { 91 | eprintln!( 92 | "fill_and_empty_1_commit_worker failed with seed: {:?}", 93 | seed 94 | ); 95 | std::panic::resume_unwind(cause); 96 | } 97 | } 98 | 99 | #[test] 100 | fn fill_and_empty_64_commit_worker() { 101 | let seed = seed(); 102 | let test_result = std::panic::catch_unwind(|| { 103 | fill_and_empty(seed, 64); 104 | }); 105 | if let Err(cause) = test_result { 106 | eprintln!( 107 | "fill_and_empty_64_commit_worker failed with seed: {:?}", 108 | seed 109 | ); 110 | std::panic::resume_unwind(cause); 111 | } 112 | } 113 | -------------------------------------------------------------------------------- /fuzz/fuzz_targets/reconstruct_key.rs: -------------------------------------------------------------------------------- 1 | #![no_main] 2 | 3 | use arbitrary::Arbitrary; 4 | use bitvec::{order::Msb0, view::BitView}; 5 | use libfuzzer_sys::fuzz_target; 6 | use nomt::beatree::reconstruct_key; 7 | 8 | fuzz_target!(|run: Run| { 9 | let Run { 10 | raw_separator, 11 | raw_prefix, 12 | } = run; 13 | 14 | let expected = 
reference_reconstruct_key(&raw_prefix, &raw_separator); 15 | 16 | let maybe_prefix = if raw_prefix.bit_len == 0 { 17 | None 18 | } else { 19 | Some((&raw_prefix.bytes[..], raw_prefix.bit_len)) 20 | }; 21 | 22 | let raw_separator = ( 23 | &raw_separator.bytes[..], 24 | raw_separator.bit_start, 25 | raw_separator.bit_len, 26 | ); 27 | 28 | assert_eq!(expected, reconstruct_key(maybe_prefix, raw_separator)); 29 | }); 30 | 31 | #[derive(Debug)] 32 | struct Run { 33 | raw_separator: RawSeparator, 34 | raw_prefix: RawPrefix, 35 | } 36 | 37 | #[derive(Debug)] 38 | struct RawSeparator { 39 | bit_start: usize, 40 | bit_len: usize, 41 | bytes: Vec<u8>, 42 | } 43 | 44 | #[derive(Debug)] 45 | struct RawPrefix { 46 | bit_len: usize, 47 | bytes: Vec<u8>, 48 | } 49 | 50 | impl<'a> Arbitrary<'a> for Run { 51 | fn arbitrary(input: &mut arbitrary::Unstructured<'a>) -> arbitrary::Result<Self> { 52 | let raw_separator = RawSeparator::arbitrary(input)?; 53 | 54 | let raw_prefix_bit_len = input.int_in_range(0..=(256 - raw_separator.bit_len))?; 55 | let raw_prefix_min_byte_len = (raw_prefix_bit_len + 7) / 8; 56 | let raw_prefix_byte_len = input.int_in_range(raw_prefix_min_byte_len..=(1 << 12))?; 57 | let mut raw_prefix_bytes = vec![0; raw_prefix_byte_len]; 58 | input.fill_buffer(&mut raw_prefix_bytes)?; 59 | 60 | let run = Run { 61 | raw_separator, 62 | raw_prefix: RawPrefix { 63 | bit_len: raw_prefix_bit_len, 64 | bytes: raw_prefix_bytes, 65 | }, 66 | }; 67 | 68 | Ok(run) 69 | } 70 | } 71 | 72 | impl<'a> Arbitrary<'a> for RawSeparator { 73 | fn arbitrary(input: &mut arbitrary::Unstructured<'a>) -> arbitrary::Result<Self> { 74 | let bit_start = input.int_in_range(0..=7)?; 75 | 76 | let bit_len = input.int_in_range(0..=(256 - bit_start))?; 77 | 78 | let bytes_len = (((bit_start + bit_len + 7) / 8) as usize).next_multiple_of(8); 79 | let mut bytes: Vec<u8> = vec![0; bytes_len]; 80 | input.fill_buffer(&mut bytes)?; 81 | 82 | Ok(Self { 83 | bit_start, 84 | bit_len, 85 | bytes, 86 | }) 87 | } 88 | } 89 | 90 | fn
reference_reconstruct_key(maybe_prefix: &RawPrefix, separator: &RawSeparator) -> [u8; 32] { 91 | let mut key = [0; 32]; 92 | 93 | let mut key_start_separator = 0; 94 | let RawPrefix { bit_len, bytes } = maybe_prefix; 95 | if *bit_len != 0 { 96 | key.view_bits_mut::<Msb0>()[..*bit_len] 97 | .copy_from_bitslice(&bytes.view_bits::<Msb0>()[..*bit_len]); 98 | key_start_separator = *bit_len; 99 | } 100 | 101 | let RawSeparator { 102 | bit_start, 103 | bit_len, 104 | bytes, 105 | } = separator; 106 | 107 | key.view_bits_mut::<Msb0>()[key_start_separator..][..*bit_len] 108 | .copy_from_bitslice(&bytes.view_bits::<Msb0>()[*bit_start..][..*bit_len]); 109 | 110 | key 111 | } 112 | -------------------------------------------------------------------------------- /fuzz/fuzz_targets/bitwise_memcpy.rs: -------------------------------------------------------------------------------- 1 | #![no_main] 2 | 3 | use arbitrary::Arbitrary; 4 | use bitvec::{order::Msb0, view::BitView}; 5 | use libfuzzer_sys::fuzz_target; 6 | use nomt::beatree::bitwise_memcpy; 7 | 8 | const MAX_BYTES_LEN: usize = 1 << 12; // 4KiB 9 | 10 | fuzz_target!(|run: Run| { 11 | let Run { 12 | source, 13 | mut destination, 14 | } = run; 15 | 16 | let expected = reference_bitwise_memcpy(&source, &destination); 17 | 18 | bitwise_memcpy( 19 | &mut destination.bytes, 20 | destination.bit_start, 21 | &source.bytes, 22 | source.bit_start, 23 | source.bit_len, 24 | ); 25 | 26 | assert_eq!(expected, destination.bytes); 27 | }); 28 | 29 | #[derive(Debug)] 30 | struct Run { 31 | source: Source, 32 | destination: Destination, 33 | } 34 | 35 | #[derive(Debug)] 36 | struct Source { 37 | bit_start: usize, 38 | bit_len: usize, 39 | bytes: Vec<u8>, 40 | } 41 | 42 | #[derive(Debug)] 43 | struct Destination { 44 | bit_start: usize, 45 | bytes: Vec<u8>, 46 | } 47 | 48 | impl<'a> Arbitrary<'a> for Run { 49 | fn arbitrary(input: &mut arbitrary::Unstructured<'a>) -> arbitrary::Result<Self> { 50 | let source = Source::arbitrary(input)?; 51 | 52 | // Destination must be long 
enough to store the source. 53 | let destination_bit_start = input.int_in_range(0..=7)?; 54 | let min_destination_len = (destination_bit_start + source.bit_len + 7) / 8; 55 | let destination_len = input.int_in_range(min_destination_len..=MAX_BYTES_LEN)?; 56 | let mut destination_bytes = vec![0; destination_len]; 57 | input.fill_buffer(&mut destination_bytes)?; 58 | 59 | let run = Run { 60 | source, 61 | destination: Destination { 62 | bit_start: destination_bit_start, 63 | bytes: destination_bytes, 64 | }, 65 | }; 66 | 67 | Ok(run) 68 | } 69 | } 70 | 71 | impl<'a> Arbitrary<'a> for Source { 72 | fn arbitrary(input: &mut arbitrary::Unstructured<'a>) -> arbitrary::Result<Self> { 73 | let bytes_len = (input.int_in_range(0..=MAX_BYTES_LEN)? as usize).next_multiple_of(8); 74 | 75 | let mut bytes: Vec<u8> = vec![0; bytes_len]; 76 | input.fill_buffer(&mut bytes)?; 77 | 78 | let bit_start = if bytes_len != 0 { 79 | input.int_in_range(0..=7)? 80 | } else { 81 | 0 82 | }; 83 | 84 | let bit_len = if bytes_len > 0 { 85 | // `bitwise_memcpy` requires the source buffer to be the smallest 86 | // multiple of 8 bytes that contains the source bits. 87 | let min_bit_len = ((bytes_len - 8) * 8).saturating_sub(bit_start) + 1; 88 | let max_bit_len = (bytes_len * 8) - bit_start; 89 | input.int_in_range(min_bit_len..=max_bit_len)? 
90 | } else { 91 | 0 92 | }; 93 | 94 | Ok(Self { 95 | bit_start, 96 | bit_len, 97 | bytes, 98 | }) 99 | } 100 | } 101 | 102 | fn reference_bitwise_memcpy(source: &Source, destination: &Destination) -> Vec<u8> { 103 | let mut destination_bytes = destination.bytes.clone(); 104 | 105 | destination_bytes.view_bits_mut::<Msb0>()[destination.bit_start..][..source.bit_len] 106 | .copy_from_bitslice( 107 | &source.bytes.view_bits::<Msb0>()[source.bit_start..][..source.bit_len], 108 | ); 109 | 110 | destination_bytes 111 | } 112 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## NOMT: Nearly Optimal Merkle Trie 2 | 3 | An implementation of a novel binary Merkle Trie and DB, written in Rust. 4 | 5 | NOMT is an embedded key-value store that maintains a Merklized representation of key-value pairs with a simple key-value API, powering high throughput authenticated commits with billions of key-value pairs on relatively inexpensive hardware. It is largely designed for use in a blockchain setting as a drop-in replacement for RocksDB, MDBX, LevelDB, or ParityDB. 6 | 7 | NOMT is optimized for fast random lookups of values, fast merkle tree updates, and fast writeout. It supports the generation of Merkle multiproofs for large batches of changes. 8 | 9 | NOMT is designed to take advantage of hardware improvements in Solid State Drives (SSDs) using NVMe and Linux's io-uring API for asynchronous I/O. NOMT adequately supports generic Unix as well as macOS for daily development and testing, but primarily targets Linux for performance. The impressive trend in performance and capacity in modern SSDs enables us to build a DB that scales along with the hardware. 10 | 11 | NOMT exposes a many-readers-one-writer API organized around batch transactions referred to as `Session`s. 
Predictable performance in a metered execution environment is a key goal of NOMT, and therefore only one `Session` may be live at a time. 12 | 13 | ## Project Structure 14 | 15 |
16 | NOMT: Project Root.
17 | ├──benchtop: A benchmarking tool for NOMT.
18 | ├──core: Core logic, primarily for verifying and updating the NOMT.
19 | ├──docs: Documentation.
20 | ├──fuzz: Fuzzing suite.
21 | ├──examples: Various examples of using NOMT.
22 | │   ├── commit_batch: Demonstration of a simple commit.
23 | │   ├── read_value: Reading a value from the NOMT.
24 | │   └── witness_verification: Demonstration of how to verify a witness in a light-client setting.
25 | ├──nomt: Implementation of the NOMT database.
26 | ├──torture: Extensive testing suite for NOMT.
27 | └──trickfs: A FUSE filesystem aiding deeper testing. Experimental.
28 |     └── trickmnt: A tool that allows mounting trickfs.
29 | 
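
For orientation before the sections below: NOMT's core distinguishes trie node kinds purely from the shape of a 32-byte node hash (see `core/src/trie.rs` later in this repository): the all-zero hash is a terminator, internal hashes have their MSB forced to 0, and leaf hashes have it forced to 1. The following is a self-contained illustrative sketch of that classification rule, not NOMT's actual implementation; in particular, treating the MSB as the top bit of the first byte is an assumption about byte order:

```rust
// Node kinds are recoverable from the 32-byte node hash alone:
// all-zero = terminator, MSB 0 = internal, MSB 1 = leaf.
type Node = [u8; 32];

#[derive(Debug, PartialEq)]
enum NodeKind {
    Terminator,
    Internal,
    Leaf,
}

fn node_kind(node: &Node) -> NodeKind {
    if node.iter().all(|b| *b == 0) {
        // The special all-zero hash stands in for an empty sub-trie.
        NodeKind::Terminator
    } else if node[0] >> 7 == 0 {
        // MSB cleared: hash of the concatenated child nodes.
        NodeKind::Internal
    } else {
        // MSB set: hash of the lookup path and value hash.
        NodeKind::Leaf
    }
}

fn main() {
    assert_eq!(node_kind(&[0u8; 32]), NodeKind::Terminator);

    let mut internal = [0u8; 32];
    internal[31] = 1; // nonzero, but MSB of byte 0 is clear
    assert_eq!(node_kind(&internal), NodeKind::Internal);

    let mut leaf = [0u8; 32];
    leaf[0] = 0x80; // MSB set
    assert_eq!(node_kind(&leaf), NodeKind::Leaf);
}
```

Because the kind is forced into the hash itself, no extra metadata is needed to interpret a node during traversal.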
30 | 31 | ## Architecture 32 | 33 | Internally, NOMT consists of two parallel stores, Beatree and Bitbox. Beatree stores raw key-value pairs and is based around a B-Tree variant optimized for stable, fast random access patterns and high-entropy keys. Bitbox stores a custom sparse binary merkle tree in an on-disk hashtable in a format amenable to SSDs. 34 | 35 | For more information on NOMT, the thesis behind it, and performance targets, see [this November 2024 presentation](https://x.com/TheKusamarian/status/1855477208762261910) by @rphmeier or [view the slides here](https://hackmd.io/@Xo-wxO7bQkKidH1LrqACsw/rkG0lmjWyg#/). 36 | 37 | We have built a benchmarking tool, `benchtop`, which is located in the `benchtop` directory as a separate subcrate. 38 | 39 | ## Contributing 40 | 41 | See [CONTRIBUTING.md](docs/CONTRIBUTING.md). 42 | 43 | If you would like to discuss the development of NOMT or follow along with contributor discussions, join the official [Telegram Channel](https://t.me/thrum_nomt). 44 | 45 | ## Acknowledgements 46 | 47 | The development of this project is supported financially by [Sovereign Labs](https://www.sovereign.xyz/), creators of the [Sovereign SDK](https://github.com/Sovereign-Labs/sovereign-sdk/). The idea for this project originated in [this post by Preston Evans](https://sovereign.mirror.xyz/jfx_cJ_15saejG9ZuQWjnGnG-NfahbazQH98i1J3NN8). 48 | -------------------------------------------------------------------------------- /core/src/trie.rs: -------------------------------------------------------------------------------- 1 | //! This module defines the types of a binary merkle trie, generalized over a 256 bit hash function. 2 | //! All lookup paths in the trie are 256 bits. 3 | //! 4 | //! All nodes are 256 bits. There are three kinds of nodes. 5 | //! 1. Internal nodes, which each have two children. The value of an internal node is 6 | //! given by hashing the concatenation of the two child nodes and setting the MSB to 0. 7 | //! 2. 
Leaf nodes, which have zero children. The value of a leaf node is given by hashing 8 | //! the concatenation of the 256-bit lookup path and the hash of the value stored at the leaf, 9 | //! and setting the MSB to 1. 10 | //! 3. [`TERMINATOR`] nodes, which have the special value of all 0s. These nodes have no children 11 | //! and serve as a stand-in for an empty sub-trie at any height. Terminator nodes enable the 12 | //! trie to be tractably represented. 13 | //! 14 | //! All node preimages are 512 bits. 15 | 16 | use crate::hasher::NodeHasher; 17 | 18 | /// A node in the binary trie. In this schema, it is always 256 bits and is the hash of either 19 | /// a [`LeafData`] or [`InternalData`], or zeroed if it's a [`TERMINATOR`]. 20 | /// 21 | /// [`Node`]s are labeled by the [`NodeHasher`] used to indicate whether they are leaves or internal 22 | /// nodes. Typically, this is done by setting the MSB. 23 | pub type Node = [u8; 32]; 24 | 25 | /// The path to a key. All paths have a 256 bit fixed length. 26 | pub type KeyPath = [u8; 32]; 27 | 28 | /// The hash of a value. In this schema, it is always 256 bits. 29 | pub type ValueHash = [u8; 32]; 30 | 31 | /// The terminator hash is a special node hash value denoting an empty sub-tree. 32 | /// Concretely, when this appears at a given location in the trie, 33 | /// it implies that no key with a path beginning with the location has a value. 34 | /// 35 | /// This value may appear at any height. 36 | pub const TERMINATOR: Node = [0u8; 32]; 37 | 38 | /// Whether the node hash indicates the node is a leaf. 39 | pub fn is_leaf<H: NodeHasher>(hash: &Node) -> bool { 40 | H::node_kind(hash) == NodeKind::Leaf 41 | } 42 | 43 | /// Whether the node hash indicates the node is an internal node. 44 | pub fn is_internal<H: NodeHasher>(hash: &Node) -> bool { 45 | H::node_kind(hash) == NodeKind::Internal 46 | } 47 | 48 | /// Whether the node holds the special [`TERMINATOR`] value. 
49 | pub fn is_terminator<H: NodeHasher>(hash: &Node) -> bool { 50 | H::node_kind(hash) == NodeKind::Terminator 51 | } 52 | 53 | /// The kind of a node. 54 | #[derive(Debug, Clone, Copy, PartialEq, Eq)] 55 | pub enum NodeKind { 56 | /// A terminator node indicates an empty sub-trie. 57 | Terminator, 58 | /// A leaf node indicates a sub-trie with a single leaf. 59 | Leaf, 60 | /// An internal node indicates at least two values. 61 | Internal, 62 | } 63 | 64 | impl NodeKind { 65 | /// Get the kind of the provided node. 66 | pub fn of<H: NodeHasher>(node: &Node) -> Self { 67 | H::node_kind(node) 68 | } 69 | } 70 | 71 | /// The data of an internal (branch) node. 72 | #[derive(Debug, Clone, PartialEq, Eq)] 73 | pub struct InternalData { 74 | /// The hash of the left child of this node. 75 | pub left: Node, 76 | /// The hash of the right child of this node. 77 | pub right: Node, 78 | } 79 | 80 | /// The data of a leaf node. 81 | #[derive(Debug, Default, Clone, PartialEq, Eq)] 82 | #[cfg_attr( 83 | feature = "borsh", 84 | derive(borsh::BorshDeserialize, borsh::BorshSerialize) 85 | )] 86 | pub struct LeafData { 87 | /// The total path to this value within the trie. 88 | /// 89 | /// The actual location of this node may be anywhere along this path, depending on the other 90 | /// data within the trie. 91 | pub key_path: KeyPath, 92 | /// The hash of the value carried in this leaf. 
93 | pub value_hash: ValueHash, 94 | } 95 | -------------------------------------------------------------------------------- /benchtop/src/main.rs: -------------------------------------------------------------------------------- 1 | mod backend; 2 | mod cli; 3 | mod custom_workload; 4 | mod nomt; 5 | 6 | #[cfg(feature = "sov-db")] 7 | mod sov_db; 8 | #[cfg(feature = "sp-trie")] 9 | mod sp_trie; 10 | 11 | mod timer; 12 | mod transfer_workload; 13 | mod workload; 14 | 15 | use anyhow::Result; 16 | use clap::Parser; 17 | use cli::{Cli, Commands, InitParams, RunParams}; 18 | use timer::Timer; 19 | 20 | pub fn main() -> Result<()> { 21 | let cli = Cli::parse(); 22 | 23 | match cli.command { 24 | Commands::Init(params) => init(params), 25 | Commands::Run(params) => run(params), 26 | } 27 | } 28 | 29 | pub fn init(params: InitParams) -> Result<()> { 30 | let workload_params = params.workload; 31 | let (mut init, _) = workload::parse(&workload_params, u64::max_value())?; 32 | 33 | let mut db = params.backend.instantiate( 34 | true, 35 | workload_params.commit_concurrency, 36 | workload_params.io_workers, 37 | workload_params.hashtable_buckets, 38 | workload_params.page_cache_size, 39 | workload_params.leaf_cache_size, 40 | workload_params.page_cache_upper_levels, 41 | workload_params.prepopulate_page_cache, 42 | 0, 43 | ); 44 | db.execute(None, &mut *init, None); 45 | 46 | Ok(()) 47 | } 48 | 49 | pub fn run(params: RunParams) -> Result<()> { 50 | let workload_params = params.workload; 51 | let (mut init, mut workloads) = workload::parse( 52 | &workload_params, 53 | params.limits.ops.unwrap_or(u64::max_value()), 54 | )?; 55 | 56 | let mut db = params.backend.instantiate( 57 | params.reset, 58 | workload_params.commit_concurrency, 59 | workload_params.io_workers, 60 | workload_params.hashtable_buckets, 61 | workload_params.page_cache_size, 62 | workload_params.leaf_cache_size, 63 | workload_params.page_cache_upper_levels, 64 | workload_params.prepopulate_page_cache, 65 | 
workload_params.overlay_window_length, 66 | ); 67 | 68 | if params.reset { 69 | db.execute(None, &mut *init, None); 70 | } 71 | 72 | let mut timer = Timer::new(format!("{}", params.backend)); 73 | let warmup_timeout = params 74 | .warm_up 75 | .map(|time_limit| std::time::Instant::now() + time_limit.into()); 76 | 77 | let thread_pool = rayon::ThreadPoolBuilder::new() 78 | .thread_name(|_| "benchtop-workload".into()) 79 | .num_threads(workload_params.workload_concurrency as usize) 80 | .build()?; 81 | 82 | if let Some(t) = warmup_timeout { 83 | if workload_params.workload_concurrency == 1 { 84 | db.execute(Some(&mut timer), &mut *workloads[0], Some(t)); 85 | } else { 86 | db.parallel_execute(Some(&mut timer), &thread_pool, &mut workloads, Some(t))?; 87 | }; 88 | 89 | timer = Timer::new(format!("{}", params.backend)); 90 | } 91 | 92 | let timeout = params 93 | .limits 94 | .time 95 | .map(|time_limit| std::time::Instant::now() + time_limit.into()); 96 | 97 | if workload_params.workload_concurrency == 1 { 98 | db.execute(Some(&mut timer), &mut *workloads[0], timeout); 99 | } else { 100 | db.parallel_execute(Some(&mut timer), &thread_pool, &mut workloads, timeout)?; 101 | }; 102 | 103 | db.print_metrics(); 104 | timer.print(workload_params.size); 105 | print_max_rss(); 106 | 107 | Ok(()) 108 | } 109 | 110 | fn print_max_rss() { 111 | let max_rss = get_max_rss().unwrap_or(0); 112 | println!("max rss: {} MiB", max_rss / 1024); 113 | fn get_max_rss() -> Option { 114 | let mut usage: libc::rusage = unsafe { std::mem::zeroed() }; 115 | let ret = unsafe { libc::getrusage(libc::RUSAGE_SELF, &mut usage) }; 116 | if ret == 0 { 117 | Some(usage.ru_maxrss as usize) 118 | } else { 119 | None 120 | } 121 | } 122 | } 123 | -------------------------------------------------------------------------------- /torture/src/supervisor/controller.rs: -------------------------------------------------------------------------------- 1 | use crate::message::{InitOutcome, OpenOutcome}; 2 | 3 | use 
super::comms; 4 | use anyhow::Result; 5 | use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering}; 6 | use tokio::{net::UnixStream, process::Child}; 7 | 8 | /// A controller is responsible for overseeing a single agent process and handle its lifecycle. 9 | pub struct SpawnedAgentController { 10 | child: Child, 11 | rr: comms::RequestResponse, 12 | torn_down: AtomicBool, 13 | agent_number: usize, 14 | } 15 | 16 | // This is a safe-guard to ensure that the [`SpawnedAgentController::teardown`] is called 17 | // properly. 18 | impl Drop for SpawnedAgentController { 19 | fn drop(&mut self) { 20 | if self.torn_down.load(Ordering::Relaxed) { 21 | // The controller was torn down properly, disarm. 22 | return; 23 | } 24 | if std::thread::panicking() { 25 | // The controller was not torn down properly, but we are panicking. 26 | eprintln!("controller was not torn down properly"); 27 | return; 28 | } 29 | panic!("controller was not torn down properly"); 30 | } 31 | } 32 | 33 | impl SpawnedAgentController { 34 | pub async fn init(&mut self, workdir: String, workload_id: u64) -> Result { 35 | let id = format!("agent-{}-{}", workload_id, self.agent_number); 36 | let response = self 37 | .rr 38 | .send_request(crate::message::ToAgent::Init(crate::message::InitPayload { 39 | id, 40 | workdir, 41 | })) 42 | .await?; 43 | match response { 44 | crate::message::ToSupervisor::InitResponse(outcome) => return Ok(outcome), 45 | _ => { 46 | panic!("expected init, unexpected response: {:?}", response); 47 | } 48 | } 49 | } 50 | 51 | pub async fn open(&self, bitbox_seed: [u8; 16], rollback: Option) -> Result { 52 | let response = self 53 | .rr 54 | .send_request(crate::message::ToAgent::Open(crate::message::OpenPayload { 55 | bitbox_seed, 56 | rollback, 57 | })) 58 | .await?; 59 | match response { 60 | crate::message::ToSupervisor::OpenResponse(outcome) => return Ok(outcome), 61 | _ => { 62 | panic!("expected open, unexpected response: {:?}", response); 63 | } 64 | } 65 | } 66 | 67 | /// 
Kills the process, shuts down the comms, and cleans up the resources. 68 | /// 69 | /// This returns only when the process is dead and the resources are cleaned up. 70 | /// 71 | /// The controller must be torn down manually. Dropping the controller is disallowed. This is 72 | /// done to control precisely when the agent process is killed. 73 | pub async fn teardown(mut self) { 74 | self.torn_down.store(true, Ordering::Relaxed); 75 | let _ = self.child.kill().await; 76 | } 77 | 78 | /// Resolves when the agent process exits. 79 | pub async fn died(&mut self) { 80 | let _ = self.child.wait().await; 81 | } 82 | 83 | pub fn rr(&self) -> &comms::RequestResponse { 84 | &self.rr 85 | } 86 | } 87 | 88 | /// Spawns an agent process creating a controller. 89 | /// 90 | /// The controller is placed in the `place` argument. `place` must be `None` when calling this 91 | /// function. 92 | pub async fn spawn_agent_into(place: &mut Option<SpawnedAgentController>) -> Result<()> { 93 | assert!(place.is_none(), "the controller must be empty"); 94 | 95 | let (child, sock) = crate::spawn::spawn_child()?; 96 | 97 | let stream = UnixStream::from_std(sock)?; 98 | 99 | let (rr, task) = comms::run(stream); 100 | let _ = tokio::spawn(task); 101 | 102 | // Assign a unique ID to the agent. 
103 | static AGENT_COUNT: AtomicUsize = AtomicUsize::new(0); 104 | let agent_number = AGENT_COUNT.fetch_add(1, Ordering::Relaxed); 105 | 106 | *place = Some(SpawnedAgentController { 107 | agent_number, 108 | child, 109 | rr, 110 | torn_down: AtomicBool::new(false), 111 | }); 112 | Ok(()) 113 | } 114 | -------------------------------------------------------------------------------- /nomt/src/io/fsyncer.rs: -------------------------------------------------------------------------------- 1 | use parking_lot::{Condvar, Mutex}; 2 | use std::{fs::File, sync::Arc}; 3 | 4 | #[derive(Debug)] 5 | enum State { 6 | Idle, 7 | Started, 8 | Done(Result<(), std::io::Error>), 9 | HandleDead, 10 | } 11 | 12 | impl State { 13 | fn force_take_done(&mut self) -> Result<(), std::io::Error> { 14 | let s = std::mem::replace(self, State::Idle); 15 | if let State::Done(res) = s { 16 | res 17 | } else { 18 | panic!("force_take_done called on non-done state"); 19 | } 20 | } 21 | } 22 | 23 | struct Shared { 24 | cv: Condvar, 25 | s: Mutex, 26 | } 27 | 28 | /// Fsyncer is a helper that allows to fsync a file in a non-blocking manner. 29 | /// 30 | /// It spawns a thread that will fsync the file in the background. 31 | /// 32 | /// The expected usage is from two threads: the one that calls [`Self::fsync`] and the one that calls 33 | /// [`Self::wait`]. 34 | pub struct Fsyncer { 35 | shared: Arc, 36 | } 37 | 38 | impl Fsyncer { 39 | /// Creates a new fsyncer with the given file descriptor and identifier. 40 | pub fn new(name: &'static str, fd: Arc) -> Self { 41 | let name = format!("nomt-fsyncer-{}", name); 42 | let shared = Arc::new(Shared { 43 | cv: Condvar::new(), 44 | s: Mutex::new(State::Idle), 45 | }); 46 | let _thread = std::thread::Builder::new() 47 | .name(name) 48 | .spawn({ 49 | let shared = shared.clone(); 50 | move || { 51 | worker(fd, shared); 52 | } 53 | }) 54 | .expect("failed to spawn fsyncer thread"); 55 | Fsyncer { shared } 56 | } 57 | 58 | /// Issues a fsync request. 
59 | /// 60 | /// # Panics 61 | /// 62 | /// Panics if there is an outstanding fsync operation that hasn't been consumed by 63 | /// [`Self::wait()`] yet. 64 | /// 65 | /// Make sure to call [`Self::wait()`] to consume any previous fsync result before issuing a new 66 | /// request. 67 | pub fn fsync(&self) { 68 | let mut s_guard = self.shared.s.lock(); 69 | assert!(matches!(&*s_guard, State::Idle)); 70 | *s_guard = State::Started; 71 | self.shared.cv.notify_all(); 72 | } 73 | 74 | /// Waits for the fsync to complete and consumes the result. 75 | /// 76 | /// This blocks until a synchronization initiated by [`Self::fsync`] completes. If no fsync has been 77 | /// initiated yet, this will block until one is both started and completed. After consuming the result, 78 | /// subsequent calls will block until the next `fsync()` operation finishes. 79 | pub fn wait(&self) -> Result<(), std::io::Error> { 80 | let mut s_guard = self.shared.s.lock(); 81 | self.shared 82 | .cv 83 | .wait_while(&mut s_guard, |s| !matches!(s, State::Done(_))); 84 | s_guard.force_take_done() 85 | } 86 | } 87 | 88 | impl Drop for Fsyncer { 89 | fn drop(&mut self) { 90 | let mut s_guard = self.shared.s.lock(); 91 | *s_guard = State::HandleDead; 92 | self.shared.cv.notify_all(); 93 | } 94 | } 95 | 96 | fn worker(fd: Arc, shared: Arc) { 97 | let bomb = Bomb; 98 | 'outer: loop { 99 | let mut s_guard = shared.s.lock(); 100 | shared.cv.wait_while(&mut s_guard, |state| { 101 | !matches!(state, State::Started | State::HandleDead) 102 | }); 103 | if matches!(&*s_guard, State::HandleDead) { 104 | break 'outer; 105 | } 106 | assert!(matches!(&*s_guard, State::Started | State::Done(_))); 107 | drop(s_guard); 108 | 109 | let sync_result = fd.sync_all(); 110 | 111 | let mut s_guard = shared.s.lock(); 112 | if matches!(&*s_guard, State::HandleDead) { 113 | break 'outer; 114 | } 115 | *s_guard = State::Done(sync_result); 116 | shared.cv.notify_all(); 117 | } 118 | bomb.defuse(); 119 | 120 | struct Bomb; 121 | 
impl Bomb { 122 | fn defuse(self) { 123 | std::mem::forget(self); 124 | } 125 | } 126 | impl Drop for Bomb { 127 | fn drop(&mut self) { 128 | panic!("worker panicked"); 129 | } 130 | } 131 | } 132 | -------------------------------------------------------------------------------- /torture/src/spawn.rs: -------------------------------------------------------------------------------- 1 | // A low-level module for spawning a child process and figuring out if we are the parent or the 2 | // child using the same binary. 3 | // 4 | // The parent spawns a child process and passes a socket to it. The socket is passed to the child 5 | // via a predefined file descriptor. The child then uses this file descriptor to communicate with 6 | // the parent. 7 | // 8 | // For a process launched using the common binary, it can check if it is a child by checking if the 9 | // [`CANARY_SOCKET_FD`] is valid. 10 | // 11 | // The main goal of this module is to tuck away the low-level machinery like working with libc and 12 | // nix into a single place. 13 | 14 | use anyhow::Result; 15 | use cfg_if::cfg_if; 16 | use std::{ 17 | os::{ 18 | fd::{AsRawFd as _, FromRawFd as _, RawFd}, 19 | unix::net::UnixStream, 20 | }, 21 | sync::atomic::{AtomicBool, Ordering}, 22 | }; 23 | use tokio::process::{Child, Command}; 24 | use tracing::trace; 25 | 26 | /// A special file descriptor that is used to pass a socket to the child process. 27 | /// 28 | /// We pick a high number to avoid conflicts with other file descriptors. 29 | const CANARY_SOCKET_FD: RawFd = 1000; 30 | 31 | /// Checks for evidence that this process is a child of a parent process that spawned it. 32 | /// 33 | /// Returns a UnixStream if the process is a child, otherwise returns None. 
34 | pub fn am_spawned() -> Option<UnixStream> { 35 | static CALLED: AtomicBool = AtomicBool::new(false); 36 | 37 | // Only take ownership of the fd if we haven't already 38 | if CALLED.swap(true, Ordering::SeqCst) { 39 | return None; 40 | } 41 | 42 | let is_valid_fd = unsafe { libc::fcntl(CANARY_SOCKET_FD, libc::F_GETFD) != -1 }; 43 | if !is_valid_fd { 44 | return None; 45 | } 46 | 47 | // Check if it's actually a Unix domain socket 48 | let mut type_: libc::c_int = 0; 49 | let mut type_len = std::mem::size_of::<libc::c_int>() as libc::socklen_t; 50 | 51 | let is_unix_socket = unsafe { 52 | libc::getsockopt( 53 | CANARY_SOCKET_FD, 54 | libc::SOL_SOCKET, 55 | libc::SO_TYPE, 56 | &mut type_ as *mut _ as *mut _, 57 | &mut type_len, 58 | ) == 0 59 | && type_ == libc::SOCK_STREAM 60 | }; 61 | 62 | if !is_unix_socket { 63 | return None; 64 | } 65 | 66 | let stream = unsafe { 67 | // SAFETY: 68 | // - The file descriptor is valid (checked above with fcntl) 69 | // - We verified it's actually a Unix domain socket (checked with getsockopt) 70 | // - This code can only run once due to the CALLED atomic bool, ensuring we have exclusive 71 | // ownership, passing it down into the UnixStream instance. 72 | // - No other code could have taken ownership as this is the first access (CALLED was false) 73 | UnixStream::from_raw_fd(CANARY_SOCKET_FD) 74 | }; 75 | Some(stream) 76 | } 77 | 78 | pub fn spawn_child() -> Result<(Child, UnixStream)> { 79 | let (sock1, sock2) = UnixStream::pair()?; 80 | 81 | let child = spawn_child_with_sock(sock2.as_raw_fd())?; 82 | drop(sock2); // Close parent's end in child 83 | 84 | Ok((child, sock1)) 85 | } 86 | 87 | fn spawn_child_with_sock(socket_fd: RawFd) -> Result<Child> { 88 | trace!(?socket_fd, "Spawning child process"); 89 | 90 | // Prepare argv for the child process. 91 | // 92 | // Contains only the program binary path and a null terminator. 93 | cfg_if! { 94 | if #[cfg(target_os = "linux")] { 95 | // Nothing beats the simplicity of /proc/self/exe on Linux. 
96 | let program = std::ffi::OsString::from("/proc/self/exe"); 97 | } else { 98 | let program = std::env::current_exe()?; 99 | } 100 | } 101 | 102 | let mut cmd = Command::new(program); 103 | // Override the PGID of the spawned process. The motivation for this is ^C handling. To handle 104 | // ^C the shell will send the SIGINT to all processes in the process group. We are handling 105 | // SIGINT manually in the supervisor process. 106 | cmd.process_group(0); 107 | unsafe { 108 | cmd.pre_exec(move || { 109 | // Duplicate the socket_fd to the CANARY_SOCKET_FD. 110 | // Close the original socket_fd in the child process. 111 | libc::dup2(socket_fd, CANARY_SOCKET_FD); 112 | libc::close(socket_fd); 113 | Ok(()) 114 | }); 115 | } 116 | let child = cmd.spawn()?; 117 | 118 | let pid = child 119 | .id() 120 | .map(|pid| pid.to_string()) 121 | .unwrap_or_else(|| "".to_string()); 122 | trace!("spawned child process, pid={pid}"); 123 | Ok(child) 124 | } 125 | -------------------------------------------------------------------------------- /torture/src/supervisor/cli.rs: -------------------------------------------------------------------------------- 1 | use clap::{Args, Parser}; 2 | 3 | #[derive(Parser, Debug)] 4 | pub struct Cli { 5 | /// The 8-byte seed to use for the random number generator. 6 | /// 7 | /// If not provided, a random seed will be generated. 8 | pub seed: Option, 9 | 10 | /// The maximum number of failures before the supervisor stops. 11 | /// 12 | /// If not provided, the supervisor will stop after the first failure. 13 | #[arg(short, long, default_value_t = 1)] 14 | pub flag_limit: usize, 15 | 16 | #[clap(flatten)] 17 | pub workload_params: WorkloadParams, 18 | } 19 | 20 | #[derive(Clone, Debug, Args)] 21 | pub struct WorkloadParams { 22 | /// The probability of a delete operation as opposed to an insert operation. 
23 | /// 24 | /// Accepted values are in the range of 0 to 100 25 | #[clap(default_value = "10")] 26 | #[clap(value_parser=clap::value_parser!(u8).range(0..=100))] 27 | #[arg(long = "delete-bias", short = 'd')] 28 | pub delete: u8, 29 | 30 | /// When generating a value, the probability of generating a value that will spill into the 31 | /// overflow pages. 32 | /// 33 | /// Accepted values are in the range of 0 to 100 34 | #[clap(default_value = "10")] 35 | #[clap(value_parser=clap::value_parser!(u8).range(0..=100))] 36 | #[arg(long = "overflow-bias", short = 'o')] 37 | pub overflow: u8, 38 | 39 | /// When generating a key, whether it should be one that has already appeared somewhere 40 | /// or a brand new key. 41 | /// 42 | /// Accepted values are in the range of 0 to 100 43 | #[clap(default_value = "50")] 44 | #[clap(value_parser=clap::value_parser!(u8).range(0..=100))] 45 | #[arg(long = "new-key-bias", short = 'n')] 46 | pub new_key: u8, 47 | 48 | /// The number of times a workload will be executed. 49 | #[clap(default_value = "50")] 50 | #[arg(long = "iterations", short = 'i')] 51 | pub iterations: usize, 52 | 53 | /// The size of a single workload iteration, the number of changesets per commit. 54 | #[clap(default_value = "5000")] 55 | #[arg(long = "size", short = 's')] 56 | pub size: usize, 57 | 58 | /// Whether the size of each workload should be random or not. 59 | /// 60 | /// If specified, the size of each commit will be within `0..size`, 61 | /// otherwise it will always be `size`. 62 | #[clap(default_value = "false")] 63 | #[arg(long = "random-size")] 64 | pub random_size: bool, 65 | 66 | /// When executing a workload iteration, this is the probability of executing a rollback 67 | /// instead of a commit. 
68 | /// 69 | /// Accepted values are in the range of 0 to 100 70 | #[clap(default_value = "30")] 71 | #[clap(value_parser=clap::value_parser!(u8).range(0..=100))] 72 | #[arg(long = "rollback-bias")] 73 | pub rollback: u8, 74 | 75 | /// When executing a commit, this is the probability of causing it to crash. 76 | /// 77 | /// Accepted values are in the range of 0 to 100 78 | #[clap(default_value = "30")] 79 | #[clap(value_parser=clap::value_parser!(u8).range(0..=100))] 80 | #[arg(long = "commit-crash-bias")] 81 | pub commit_crash: u8, 82 | 83 | /// When executing a rollback, this is the probability of causing it to crash. 84 | /// 85 | /// Accepted values are in the range of 0 to 100 86 | #[clap(default_value = "30")] 87 | #[clap(value_parser=clap::value_parser!(u8).range(0..=100))] 88 | #[arg(long = "rollback-crash-bias")] 89 | pub rollback_crash: u8, 90 | 91 | /// The maximum number of commits involved in a rollback. 92 | /// 93 | /// The effective number of commits used for each rollback is randomly generated in the range 94 | /// 0..max_rollback_commits. 95 | #[clap(default_value = "100")] 96 | #[arg(long = "max-rollback-commits")] 97 | pub max_rollback_commits: u32, 98 | 99 | /// Whether to ensure the correct application of the changeset after every commit. 100 | #[clap(default_value = "false")] 101 | #[arg(long = "ensure-changeset")] 102 | pub ensure_changeset: bool, 103 | 104 | /// Whether to ensure the correctness of the entire state after every crash or rollback. 105 | #[clap(default_value = "false")] 106 | #[arg(long = "ensure-snapshot", conflicts_with = "sample_snapshot")] 107 | pub ensure_snapshot: bool, 108 | 109 | /// Whether to randomly sample the state after every crash or rollback. 110 | #[clap(default_value = "false")] 111 | #[arg(long = "sample-snapshot")] 112 | pub sample_snapshot: bool, 113 | 114 | /// Whether to enable testing using the trickfs. 115 | /// 116 | /// Supported on Linux only. 
117 | #[clap(default_value = "false")] 118 | #[arg(long = "trickfs")] 119 | pub trickfs: bool, 120 | } 121 | -------------------------------------------------------------------------------- /benchtop/src/timer.rs: -------------------------------------------------------------------------------- 1 | use std::{ 2 | cell::RefCell, 3 | collections::hash_map::{Entry, HashMap}, 4 | rc::Rc, 5 | }; 6 | 7 | // At least three spans are expected to be measured 8 | // + `workload` 9 | // + `read` 10 | // + `commit_and_prove` 11 | pub struct Timer { 12 | name: String, 13 | spans: HashMap<&'static str, Rc<RefCell<hdrhistogram::Histogram<u64>>>>, 14 | } 15 | 16 | impl Timer { 17 | pub fn new(name: String) -> Self { 18 | Self { 19 | name, 20 | spans: HashMap::new(), 21 | } 22 | } 23 | 24 | pub fn record_span(&mut self, span_name: &'static str) -> impl Drop { 25 | struct RecordSpan { 26 | h: Rc<RefCell<hdrhistogram::Histogram<u64>>>, 27 | start: std::time::Instant, 28 | } 29 | impl Drop for RecordSpan { 30 | fn drop(&mut self) { 31 | let elapsed = self.start.elapsed().as_nanos() as u64; 32 | self.h.borrow_mut().record(elapsed).unwrap(); 33 | } 34 | } 35 | 36 | let h = self.spans.entry(span_name).or_insert_with(|| { 37 | Rc::new(RefCell::new( 38 | hdrhistogram::Histogram::<u64>::new(3).unwrap(), 39 | )) 40 | }); 41 | 42 | RecordSpan { 43 | h: h.clone(), 44 | start: std::time::Instant::now(), 45 | } 46 | } 47 | 48 | pub fn freeze(self) -> FrozenTimer { 49 | FrozenTimer { 50 | spans: self 51 | .spans 52 | .into_iter() 53 | .map(|(name, histogram)| (name, Rc::into_inner(histogram).unwrap().into_inner())) 54 | .collect(), 55 | } 56 | } 57 | 58 | pub fn add(&mut self, other: FrozenTimer) { 59 | for (span_name, new_data) in other.spans { 60 | match self.spans.entry(span_name) { 61 | Entry::Occupied(e) => e.get().borrow_mut().add(new_data).unwrap(), 62 | Entry::Vacant(e) => { 63 | let _ = e.insert(Rc::new(RefCell::new(new_data))); 64 | } 65 | } 66 | } 67 | } 68 | 69 | pub fn get_last_workload_duration(&self) -> anyhow::Result<u64> { 70 | let h = self 71 | .spans 72 | 
.get("workload") 73 | .ok_or(anyhow::anyhow!("`workload` span not recorded"))?; 74 | 75 | Ok(h.borrow() 76 | .iter_recorded() 77 | .last() 78 | .ok_or(anyhow::anyhow!("No recorded value for `workload` span"))? 79 | .value_iterated_to()) 80 | } 81 | 82 | pub fn get_mean_workload_duration(&self) -> anyhow::Result<u64> { 83 | Ok(self 84 | .spans 85 | .get("workload") 86 | .ok_or(anyhow::anyhow!("`workload` span not recorded"))? 87 | .borrow() 88 | .mean() as u64) 89 | } 90 | 91 | pub fn print(&mut self, workload_size: u64) { 92 | println!("{}", self.name); 93 | 94 | let expected_spans = ["workload", "read", "commit_and_prove"]; 95 | 96 | // print expected spans in order 97 | for span_name in expected_spans { 98 | let h = self.spans.get(span_name); 99 | match h { 100 | Some(h) => println!( 101 | " mean {}: {}", 102 | span_name, 103 | pretty_display_ns(h.borrow().mean() as u64) 104 | ), 105 | None => println!("{} not measured", span_name), 106 | }; 107 | } 108 | 109 | if let Ok(workload_mean_ns) = self.get_mean_workload_duration() { 110 | let ops_per_second = workload_size as f64 / (workload_mean_ns as f64 / 1_000_000_000.0); 111 | println!(" mean throughput: {ops_per_second:.1} ops/s"); 112 | } 113 | 114 | // print all other measured spans 115 | for (span_name, h) in &self.spans { 116 | if expected_spans.contains(span_name) { 117 | continue; 118 | } 119 | 120 | println!( 121 | " mean {}: {}", 122 | span_name, 123 | pretty_display_ns(h.borrow().mean() as u64) 124 | ) 125 | } 126 | } 127 | } 128 | 129 | pub struct FrozenTimer { 130 | spans: HashMap<&'static str, hdrhistogram::Histogram<u64>>, 131 | } 132 | 133 | pub fn pretty_display_ns(ns: u64) -> String { 134 | // preserve 3 sig figs at minimum.
135 | let (val, unit) = if ns > 100 * 1_000_000_000 { 136 | (ns / 1_000_000_000, "s") 137 | } else if ns > 100 * 1_000_000 { 138 | (ns / 1_000_000, "ms") 139 | } else if ns > 100 * 1_000 { 140 | (ns / 1_000, "us") 141 | } else { 142 | (ns, "ns") 143 | }; 144 | 145 | format!("{val} {unit}") 146 | } 147 | -------------------------------------------------------------------------------- /torture/src/logging.rs: -------------------------------------------------------------------------------- 1 | use std::io::{self, IsTerminal as _}; 2 | use std::path::Path; 3 | 4 | use tracing::level_filters::LevelFilter; 5 | use tracing::{span, Level}; 6 | use tracing_subscriber::{fmt, EnvFilter}; 7 | use tracing_subscriber::{prelude::*, registry::Registry, Layer}; 8 | 9 | const ENV_NAME_COMMON: &str = "TORTURE_ALL_LOG"; 10 | const ENV_NAME_AGENT: &str = "TORTURE_AGENT_LOG"; 11 | const ENV_NAME_SUPERVISOR: &str = "TORTURE_SUPERVISOR_LOG"; 12 | 13 | enum Kind { 14 | Agent, 15 | Supervisor, 16 | } 17 | 18 | fn istty() -> bool { 19 | io::stdout().is_terminal() && io::stderr().is_terminal() 20 | } 21 | 22 | /// Creates env filter for the agent or supervisor (depending on the `kind` 23 | /// argument). 24 | /// 25 | /// This function tries to read the most specific environment variable first, then falls back to 26 | /// the common one ([`ENV_NAME_COMMON`]).
27 | fn env_filter(kind: Kind) -> EnvFilter { 28 | let specific_env_name = match kind { 29 | Kind::Agent => ENV_NAME_AGENT, 30 | Kind::Supervisor => ENV_NAME_SUPERVISOR, 31 | }; 32 | 33 | return try_parse_env(specific_env_name).unwrap_or_else(|| { 34 | try_parse_env(ENV_NAME_COMMON).unwrap_or_else(|| { 35 | EnvFilter::builder() 36 | .with_default_directive(LevelFilter::INFO.into()) 37 | .parse("") 38 | .unwrap() 39 | }) 40 | }); 41 | 42 | fn try_parse_env(var_name: &str) -> Option<EnvFilter> { 43 | match std::env::var(var_name) { 44 | Ok(env) => Some( 45 | EnvFilter::builder() 46 | .with_default_directive(LevelFilter::INFO.into()) 47 | .parse(env) 48 | .unwrap(), 49 | ), 50 | Err(std::env::VarError::NotPresent) => { 51 | return None; 52 | } 53 | Err(std::env::VarError::NotUnicode(_)) => { 54 | panic!("Environment variable {} is not unicode", var_name); 55 | } 56 | } 57 | } 58 | } 59 | 60 | pub fn init_supervisor() { 61 | let format = fmt::format() 62 | .with_level(true) 63 | .with_target(false) 64 | .with_thread_ids(false) 65 | .with_thread_names(false) 66 | .compact() 67 | .with_timer(fmt::time::SystemTime::default()); 68 | let subscriber = fmt::Subscriber::builder() 69 | .with_env_filter(env_filter(Kind::Supervisor)) 70 | .with_writer(io::stdout) 71 | .with_ansi(istty()) 72 | .event_format(format) 73 | .finish(); 74 | 75 | tracing::subscriber::set_global_default(subscriber) 76 | .expect("Failed to set supervisor subscriber"); 77 | } 78 | 79 | pub fn init_agent(agent_id: &str, workdir: &impl AsRef<Path>) { 80 | // Console layer with ANSI colors 81 | let console_layer = fmt::layer() 82 | .with_writer(io::stdout) 83 | .event_format( 84 | fmt::format() 85 | .with_level(true) 86 | .with_target(false) 87 | .with_thread_ids(false) 88 | .with_thread_names(false) 89 | .with_ansi(istty()) 90 | .compact() 91 | .with_timer(fmt::time::SystemTime::default()), 92 | ) 93 | .with_filter(env_filter(Kind::Agent)); 94 | 95 | // File layer with ANSI disabled 96 | // TODO: this has an issue currently.
Although ANSI is set to false, colors are not disabled 97 | // everywhere. 98 | let file = std::fs::File::options() 99 | .create(true) 100 | .append(true) 101 | .open(workdir.as_ref().join("agent.log")) 102 | .unwrap(); 103 | 104 | // TODO: this has an issue currently. Although ANSI is set to false, colors are not disabled 105 | // everywhere. 106 | let file_layer = fmt::layer() 107 | .with_writer(file) 108 | .event_format( 109 | fmt::format() 110 | .with_level(true) 111 | .with_target(false) 112 | .with_thread_ids(false) 113 | .with_thread_names(false) 114 | .with_ansi(false) 115 | .compact() 116 | .with_timer(fmt::time::SystemTime::default()), 117 | ) 118 | .with_filter(env_filter(Kind::Agent)); 119 | 120 | // Combine both layers in a single Subscriber 121 | let subscriber = Registry::default().with(console_layer).with(file_layer); 122 | 123 | // Set the global subscriber 124 | tracing::subscriber::set_global_default(subscriber).expect("Failed to set agent subscriber"); 125 | 126 | let pid = std::process::id(); 127 | let span = span!(Level::INFO, "agent", agent_id, pid); 128 | let _enter = span.enter(); 129 | // We intentionally `forget` the guard so the span remains open 130 | // for the lifetime of the entire agent process if desired.
131 | std::mem::forget(_enter); 132 | } 133 | -------------------------------------------------------------------------------- /benchtop/src/bench.rs: -------------------------------------------------------------------------------- 1 | use crate::{ 2 | backend::Backend, 3 | cli::bench::BenchType, 4 | timer::Timer, 5 | workload, 6 | workload::{Init, Workload}, 7 | }; 8 | use anyhow::Result; 9 | 10 | pub fn bench(bench_type: BenchType) -> Result<()> { 11 | let common_params = match bench_type { 12 | BenchType::Isolate(ref params) => ¶ms.common_params, 13 | BenchType::Sequential(ref params) => ¶ms.common_params, 14 | }; 15 | 16 | let (init, workload) = workload::parse( 17 | common_params.workload.name.as_str(), 18 | common_params.workload.size, 19 | common_params 20 | .workload 21 | .initial_capacity 22 | .map(|s| 1u64 << s) 23 | .unwrap_or(0), 24 | common_params.workload.percentage_cold, 25 | )?; 26 | let commit_concurrency = common_params.workload.commit_concurrency; 27 | let io_workers = common_params.workload.io_workers; 28 | 29 | let backends = if common_params.backends.is_empty() { 30 | Backend::all_backends() 31 | } else { 32 | common_params.backends.clone() 33 | }; 34 | 35 | match bench_type { 36 | BenchType::Isolate(params) => bench_isolate( 37 | init, 38 | workload, 39 | backends, 40 | params.iterations, 41 | true, 42 | commit_concurrency, 43 | io_workers, 44 | ) 45 | .map(|_| ()), 46 | BenchType::Sequential(params) => bench_sequential( 47 | init, 48 | workload, 49 | backends, 50 | params.op_limit, 51 | params.time_limit, 52 | true, 53 | commit_concurrency, 54 | io_workers, 55 | ) 56 | .map(|_| ()), 57 | } 58 | } 59 | 60 | // Benchmark the workload across multiple backends multiple times. 61 | // Each iteration will be executed on a freshly initialized database. 
62 | // 63 | // Return the mean execution time of the workloads for each backend, 64 | // in the order the backends are provided. 65 | pub fn bench_isolate( 66 | mut init: Init, 67 | mut workload: Box<dyn Workload>, 68 | backends: Vec<Backend>, 69 | iterations: u64, 70 | print: bool, 71 | commit_concurrency: usize, 72 | io_workers: usize, 73 | ) -> Result<Vec<u64>> { 74 | let mut mean_results = vec![]; 75 | for backend in backends { 76 | let mut timer = Timer::new(format!("{}", backend)); 77 | 78 | for _ in 0..iterations { 79 | let mut db = backend.instantiate(true, commit_concurrency, io_workers); 80 | db.execute(None, &mut init); 81 | db.execute(Some(&mut timer), &mut *workload); 82 | db.print_metrics(); 83 | } 84 | 85 | if print { 86 | timer.print(); 87 | } 88 | mean_results.push(timer.get_mean_workload_duration()?); 89 | } 90 | 91 | Ok(mean_results) 92 | } 93 | 94 | // Benchmark the workload across multiple backends multiple times. 95 | // Each iteration will be executed on the same db repeatedly 96 | // without clearing it until a time or operation count limit is reached.
97 | // 98 | // Return the mean execution time of the workloads for each backend, 99 | // in the order the backends are provided. 100 | pub fn bench_sequential( 101 | mut init: Init, 102 | mut workload: Box<dyn Workload>, 103 | backends: Vec<Backend>, 104 | op_limit: Option<u64>, 105 | time_limit: Option<u64>, 106 | print: bool, 107 | commit_concurrency: usize, 108 | io_workers: usize, 109 | ) -> Result<Vec<u64>> { 110 | if let (None, None) = (op_limit, time_limit) { 111 | anyhow::bail!("You need to specify at least one limit: operation count or time") 112 | } 113 | 114 | let mut mean_results = vec![]; 115 | 116 | for backend in backends { 117 | let mut timer = Timer::new(format!("{}", backend)); 118 | let mut db = backend.instantiate(true, commit_concurrency, io_workers); 119 | 120 | let mut elapsed_time = 0; 121 | let mut op_count = 0; 122 | 123 | db.execute(None, &mut init); 124 | 125 | loop { 126 | db.execute(Some(&mut timer), &mut *workload); 127 | 128 | // check if the time limit (milliseconds) was exceeded; elapsed time is in nanoseconds 129 | elapsed_time += timer.get_last_workload_duration()?; 130 | match time_limit { 131 | Some(limit) if elapsed_time >= (limit * 1_000_000) => break, 132 | _ => (), 133 | }; 134 | 135 | // check if op limit exceeded 136 | op_count += workload.size() as u64; 137 | match op_limit { 138 | Some(limit) if op_count >= limit => break, 139 | _ => (), 140 | }; 141 | } 142 | 143 | db.print_metrics(); 144 | 145 | if print { 146 | timer.print(); 147 | } 148 | mean_results.push(timer.get_mean_workload_duration()?); 149 | } 150 | Ok(mean_results) 151 | } 152 | -------------------------------------------------------------------------------- /nomt/src/seglog/segment_filename.rs: -------------------------------------------------------------------------------- 1 | use anyhow::{Context, Result}; 2 | 3 | pub fn format(prefix: &str, segment_id: u32) -> String { 4 | // The format string specifies a 10-digit number, so we pad with leading zeros from 5 | // the left.
This assumes that segment_id is a 32-bit integer, which is confirmed by 6 | // the assert below. If you came here because it failed due to changing it to u64, 7 | // you will need to update the format string as well. 8 | assert_eq!(segment_id.to_le_bytes().len(), 4); 9 | format!("{prefix}.{segment_id:0>10}.log") 10 | } 11 | 12 | pub fn parse(prefix: &str, filename: &str) -> Result<u32> { 13 | // The filename of a segment file consists of a configurable prefix, a 10-digit segment ID, 14 | // and a ".log" suffix. 15 | // 16 | // Example: "prefix.0000000001.log". 17 | // Extract the segment ID from the filename 18 | assert!(!prefix.is_empty()); 19 | let without_prefix = match filename.strip_prefix(prefix) { 20 | Some(s) => s, 21 | None => { 22 | return Err(anyhow::anyhow!( 23 | "Invalid segment filename format: missing prefix" 24 | )) 25 | } 26 | }; 27 | 28 | let without_suffix = match without_prefix.strip_suffix(".log") { 29 | Some(s) => s, 30 | None => { 31 | return Err(anyhow::anyhow!( 32 | "Invalid segment filename format: missing .log suffix" 33 | )) 34 | } 35 | }; 36 | 37 | let segment_id_str = match without_suffix.strip_prefix('.') { 38 | Some(s) => s, 39 | None => { 40 | return Err(anyhow::anyhow!( 41 | "Invalid segment filename format: missing dot separator" 42 | )) 43 | } 44 | }; 45 | 46 | // Check that the segment ID string has exactly 10 digits 47 | if segment_id_str.len() != 10 { 48 | return Err(anyhow::anyhow!( 49 | "Invalid segment filename format: segment ID must be exactly 10 digits" 50 | )); 51 | } 52 | 53 | // Parse the segment ID as a u32 54 | let segment_id = segment_id_str 55 | .parse::<u32>() 56 | .context("Failed to parse segment ID")?; 57 | 58 | Ok(segment_id) 59 | } 60 | 61 | #[cfg(test)] 62 | mod tests { 63 | use super::{format, parse}; 64 | 65 | #[test] 66 | fn test_filename_isomorphism() { 67 | let test_cases = vec![ 68 | ("prefix", 0), 69 | ("prefix", 1), 70 | ("prefix", 9999), 71 | ("prefix", u32::MAX), 72 | ("log", 42), 73 | ("segment", 1000000),
74 | ("very_long_prefix_name", 12345), 75 | ("a", 987654321), 76 | ]; 77 | 78 | for (prefix, id) in test_cases { 79 | let filename = format(prefix, id); 80 | let parsed_id = parse(prefix, &filename).unwrap(); 81 | assert_eq!( 82 | id, parsed_id, 83 | "Mismatch for prefix '{}' and id {}", 84 | prefix, id 85 | ); 86 | } 87 | } 88 | 89 | #[test] 90 | fn test_parse_segment_filename_edge_cases() { 91 | // Valid cases 92 | assert_eq!(parse("prefix", "prefix.0000000000.log").unwrap(), 0); 93 | assert_eq!(parse("prefix", "prefix.0000000001.log").unwrap(), 1); 94 | assert_eq!(parse("prefix", "prefix.4294967295.log").unwrap(), u32::MAX); 95 | assert_eq!(parse("a", "a.0000000042.log").unwrap(), 42); 96 | 97 | // Invalid cases 98 | assert!(parse("prefix", "prefix.00000000001.log").is_err()); // Too many digits 99 | assert!(parse("prefix", "prefix.000000001.log").is_err()); // Too few digits 100 | assert!(parse("prefix", "prefix.000000000a.log").is_err()); // Non-numeric ID 101 | assert!(parse("prefix", "prefix.0000000000").is_err()); // Missing .log suffix 102 | assert!(parse("prefix", "prefix0000000000.log").is_err()); // Missing dot after prefix 103 | assert!(parse("prefix", "wrongprefix.0000000000.log").is_err()); // Wrong prefix 104 | assert!(parse("prefix", ".0000000000.log").is_err()); // Missing prefix 105 | assert!(parse("prefix", "prefix..log").is_err()); // Missing ID 106 | assert!(parse("prefix", "prefix.0000000000.wrongsuffix").is_err()); // Wrong suffix 107 | 108 | // Adversarial cases 109 | assert!(parse("prefix", "prefix.0000000000.logx").is_err()); // Extra character after .log 110 | assert!(parse("prefix", "xprefix.0000000000.log").is_err()); // Extra character before prefix 111 | assert!(parse("prefix", "prefix.00000000001log").is_err()); // Missing dot before log 112 | assert!(parse("prefix", "prefix.0000000000.log.").is_err()); // Extra dot at the end 113 | assert!(parse("prefix", "prefix.4294967296.log").is_err()); // ID overflow (u32::MAX + 1) 114 | 
assert!(parse("prefix", "prefix.0x0000000A.log").is_err()); // Hexadecimal ID 115 | assert_eq!( 116 | parse("prefix.with.dots", "prefix.with.dots.0000000000.log").unwrap(), 117 | 0 118 | ); // Prefix with dots 119 | } 120 | } 121 | -------------------------------------------------------------------------------- /nomt/tests/overlay.rs: -------------------------------------------------------------------------------- 1 | mod common; 2 | 3 | use common::Test; 4 | 5 | fn expected_root(items: Vec<([u8; 32], Vec<u8>)>) -> nomt_core::trie::Node { 6 | nomt_core::update::build_trie::( 7 | 0, 8 | items 9 | .into_iter() 10 | .map(|(k, v)| (k, *blake3::hash(&v).as_bytes())), 11 | |_| {}, 12 | ) 13 | } 14 | 15 | #[test] 16 | fn overlay_multiple_forks() { 17 | let mut test = Test::new("overlay_multiple_forks"); 18 | 19 | let overlay_a = test.update().0; 20 | let overlay_b1 = { 21 | test.start_overlay_session([&overlay_a]); 22 | test.write([1; 32], Some(vec![1, 2, 3])); 23 | test.update().0 24 | }; 25 | let overlay_b2 = { 26 | test.start_overlay_session([&overlay_a]); 27 | test.write([1; 32], Some(vec![4, 5, 6])); 28 | test.update().0 29 | }; 30 | 31 | { 32 | test.start_overlay_session([&overlay_b1, &overlay_a]); 33 | assert_eq!(test.read([1; 32]), Some(vec![1, 2, 3])); 34 | } 35 | 36 | { 37 | test.start_overlay_session([&overlay_b2, &overlay_a]); 38 | assert_eq!(test.read([1; 32]), Some(vec![4, 5, 6])); 39 | } 40 | } 41 | 42 | #[test] 43 | fn overlay_root_calculation() { 44 | let mut test = Test::new("overlay_root_calculation"); 45 | test.write([1; 32], Some(vec![1, 2, 3])); 46 | let overlay_a = test.update().0; 47 | 48 | assert_eq!( 49 | overlay_a.root().into_inner(), 50 | expected_root(vec![([1; 32], vec![1, 2, 3])]), 51 | ); 52 | 53 | test.start_overlay_session([&overlay_a]); 54 | test.write([2; 32], Some(vec![4, 5, 6])); 55 | let overlay_b = test.update().0; 56 | 57 | assert_eq!( 58 | overlay_b.root().into_inner(), 59 | expected_root(vec![([1; 32], vec![1, 2, 3]), ([2; 32],
vec![4, 5, 6])]), 60 | ); 61 | 62 | test.start_overlay_session([&overlay_b, &overlay_a]); 63 | test.write([1; 32], Some(vec![7, 8, 9])); 64 | test.write([3; 32], Some(vec![0, 1, 0])); 65 | let overlay_c = test.update().0; 66 | 67 | assert_eq!( 68 | overlay_c.root().into_inner(), 69 | expected_root(vec![ 70 | ([1; 32], vec![7, 8, 9]), 71 | ([2; 32], vec![4, 5, 6]), 72 | ([3; 32], vec![0, 1, 0]) 73 | ]), 74 | ); 75 | } 76 | 77 | #[test] 78 | #[should_panic] 79 | fn overlays_must_be_committed_in_order() { 80 | let mut test = Test::new("overlays_committed_in_order"); 81 | let overlay_a = test.update().0; 82 | test.start_overlay_session([&overlay_a]); 83 | let overlay_b = test.update().0; 84 | 85 | test.commit_overlay(overlay_b); 86 | } 87 | 88 | #[test] 89 | #[should_panic] 90 | fn overlay_competing_committed() { 91 | let mut test = Test::new("overlays_competing_committed"); 92 | let overlay_a = test.update().0; 93 | test.start_overlay_session([&overlay_a]); 94 | let overlay_b1 = test.update().0; 95 | test.start_overlay_session([&overlay_a]); 96 | let overlay_b2 = test.update().0; 97 | 98 | test.commit_overlay(overlay_a); 99 | test.commit_overlay(overlay_b1); 100 | 101 | test.commit_overlay(overlay_b2); 102 | } 103 | 104 | #[test] 105 | fn overlay_commit_in_order_works() { 106 | let mut test = Test::new("overlays_commit_in_order_works"); 107 | let overlay_a = test.update().0; 108 | test.start_overlay_session([&overlay_a]); 109 | let overlay_b = test.update().0; 110 | 111 | test.commit_overlay(overlay_a); 112 | test.commit_overlay(overlay_b); 113 | } 114 | 115 | #[test] 116 | fn overlay_changes_land_on_disk_when_committed() { 117 | { 118 | let mut test = Test::new("overlay_changes_land_on_disk"); 119 | test.write([1; 32], Some(vec![1, 2, 3])); 120 | test.write([2; 32], Some(vec![4, 5, 6])); 121 | test.write([3; 32], Some(vec![7, 8, 9])); 122 | 123 | let overlay = test.update().0; 124 | test.commit_overlay(overlay); 125 | } 126 | 127 | let mut test = 
Test::new_with_params( 128 | "overlay_changes_land_on_disk", 129 | /* commit_concurrency */ 1, 130 | /* hashtable_buckets */ 1, 131 | /* panic_on_sync */ None, 132 | /* cleanup_dir */ false, 133 | ); 134 | 135 | assert_eq!(test.read([1; 32]), Some(vec![1, 2, 3])); 136 | assert_eq!(test.read([2; 32]), Some(vec![4, 5, 6])); 137 | assert_eq!(test.read([3; 32]), Some(vec![7, 8, 9])); 138 | } 139 | 140 | #[test] 141 | fn overlay_uncommitted_not_on_disk() { 142 | { 143 | let mut test = Test::new("overlay_uncommitted_not_on_disk"); 144 | test.write([1; 32], Some(vec![1, 2, 3])); 145 | test.write([2; 32], Some(vec![4, 5, 6])); 146 | test.write([3; 32], Some(vec![7, 8, 9])); 147 | 148 | let _overlay = test.update().0; 149 | } 150 | 151 | let mut test = Test::new_with_params( 152 | "overlay_uncommitted_not_on_disk", 153 | /* commit_concurrency */ 1, 154 | /* hashtable_buckets */ 1, 155 | /* panic_on_sync */ None, 156 | /* cleanup_dir */ false, 157 | ); 158 | 159 | assert_eq!(test.read([1; 32]), None); 160 | assert_eq!(test.read([2; 32]), None); 161 | assert_eq!(test.read([3; 32]), None); 162 | } 163 | -------------------------------------------------------------------------------- /benchtop/src/custom_workload.rs: -------------------------------------------------------------------------------- 1 | use crate::{ 2 | backend::Transaction, 3 | cli::StateItemDistribution, 4 | workload::{Distribution, Workload}, 5 | }; 6 | use rand::Rng; 7 | 8 | #[derive(Clone)] 9 | pub struct RwInit { 10 | cur_val: u64, 11 | num_vals: u64, 12 | } 13 | 14 | impl Workload for RwInit { 15 | fn run_step(&mut self, transaction: &mut dyn Transaction) { 16 | const MAX_INIT_PER_ITERATION: u64 = 64 * 1024 * 1024; 17 | 18 | if self.num_vals == 0 { 19 | return; 20 | } 21 | 22 | let count = std::cmp::min(self.num_vals - self.cur_val, MAX_INIT_PER_ITERATION); 23 | for _ in 0..count { 24 | transaction.write(&encode_id(self.cur_val), Some(&[64u8; 32])); 25 | self.cur_val += 1; 26 | } 27 | println!( 28 | 
"populating {:.1}%", 29 | 100.0 * (self.cur_val as f64) / (self.num_vals as f64) 30 | ); 31 | } 32 | 33 | fn is_done(&self) -> bool { 34 | self.num_vals == self.cur_val 35 | } 36 | } 37 | 38 | /// Create a workload for initializing a database with the given number of key-value pairs. 39 | pub fn init(db_size: u64) -> RwInit { 40 | RwInit { 41 | cur_val: 0, 42 | num_vals: db_size, 43 | } 44 | } 45 | 46 | fn encode_id(id: u64) -> [u8; 8] { 47 | id.to_be_bytes() 48 | } 49 | 50 | /// Build N `RwWorkload`s, one for each thread. 51 | pub fn build( 52 | reads: u8, 53 | writes: u8, 54 | workload_size: u64, 55 | fresh: u8, 56 | db_size: u64, 57 | op_limit: u64, 58 | threads: usize, 59 | distribution: StateItemDistribution, 60 | ) -> Vec<RwWorkload> { 61 | let thread_workload_size = workload_size / threads as u64; 62 | let db_step = db_size / threads as u64; 63 | 64 | (0..threads) 65 | .map(|i| { 66 | let db_start = db_step * i as u64; 67 | 68 | RwWorkload { 69 | reads, 70 | writes, 71 | fresh, 72 | workload_size: if i == threads - 1 { 73 | thread_workload_size + workload_size % threads as u64 74 | } else { 75 | thread_workload_size 76 | }, 77 | ops_remaining: op_limit / threads as u64, 78 | distribution: Distribution::new(distribution, db_start, db_start + db_step), 79 | } 80 | }) 81 | .collect() 82 | } 83 | 84 | // The read-write workload will follow these rules: 85 | // 1. Reads and writes are randomly and uniformly distributed across the key space. 86 | // 2. The DB size indicates the number of entries in the database. 87 | // 3. The workload size represents the total number of operations, where reads and writes 88 | // are numbers that need to sum to 100 and represent a percentage of the total size. 89 | // 4.
Fresh indicates the percentage of reads and writes that will be performed on 90 | // non-existing keys 91 | pub struct RwWorkload { 92 | pub reads: u8, 93 | pub writes: u8, 94 | pub workload_size: u64, 95 | pub fresh: u8, 96 | pub ops_remaining: u64, 97 | pub distribution: Distribution, 98 | } 99 | 100 | impl Workload for RwWorkload { 101 | fn run_step(&mut self, transaction: &mut dyn Transaction) { 102 | let from_percentage = |p: u8| (self.workload_size as f64 * p as f64 / 100.0) as u64; 103 | let fresh = |size: u64| (size as f64 * self.fresh as f64 / 100.0) as u64; 104 | 105 | // total reads and writes 106 | let n_reads = from_percentage(self.reads); 107 | let n_writes = from_percentage(self.writes); 108 | // fresh reads and writes 109 | let n_reads_fresh = fresh(n_reads); 110 | let n_writes_fresh = fresh(n_writes); 111 | 112 | let mut rng = rand::thread_rng(); 113 | 114 | for i in 0..n_reads { 115 | let _ = if i < n_reads_fresh { 116 | // fresh read, technically there is a chance to generate 117 | // a random key that is already present in the database, 118 | // but it is very unlikely 119 | transaction.read(&rand_key(&mut rng)) 120 | } else { 121 | // read already existing key 122 | let key = self.distribution.sample(&mut rng); 123 | transaction.read(&encode_id(key)) 124 | }; 125 | } 126 | 127 | for i in 0..n_writes { 128 | let value = rand_key(&mut rng); 129 | if i < n_writes_fresh { 130 | // fresh write 131 | transaction.write(&rand_key(&mut rng), Some(&value)); 132 | } else { 133 | // substitute key 134 | let key = self.distribution.sample(&mut rng); 135 | transaction.write(&encode_id(key), Some(&value)); 136 | }; 137 | } 138 | 139 | self.ops_remaining = self.ops_remaining.saturating_sub(self.workload_size); 140 | } 141 | 142 | fn is_done(&self) -> bool { 143 | self.ops_remaining == 0 144 | } 145 | } 146 | 147 | fn rand_key(rng: &mut impl Rng) -> [u8; 32] { 148 | // keys must be uniformly distributed 149 | let mut key = [0; 32]; 150 | rng.fill(&mut 
key[..16]); 151 | key 152 | } 153 | -------------------------------------------------------------------------------- /nomt/src/rollback/delta.rs: -------------------------------------------------------------------------------- 1 | use nomt_core::trie::KeyPath; 2 | use std::{ 3 | collections::HashMap, 4 | io::{Cursor, Read as _}, 5 | }; 6 | 7 | /// A delta that should be applied to reverse a commit. 8 | #[derive(Debug, Clone)] 9 | pub struct Delta { 10 | /// This map contains the prior value for each key that was written by the commit this delta 11 | /// reverses. `None` indicates that the key did not exist before the commit. 12 | pub(crate) priors: HashMap<KeyPath, Option<Vec<u8>>>, 13 | } 14 | 15 | impl Delta { 16 | #[cfg(test)] 17 | fn empty() -> Self { 18 | Self { 19 | priors: HashMap::new(), 20 | } 21 | } 22 | 23 | /// Encode the delta into a buffer. 24 | /// 25 | /// Returns the encoded buffer. 26 | pub(super) fn encode(&self) -> Vec<u8> { 27 | // The serialization format has the following layout. 28 | // 29 | // The keys are split into two groups and written as separate arrays. Those groups are: 30 | // 31 | // 1. erase: The keys that did not exist before the commit. 32 | // 2. reinstate: The keys that had prior values. 33 | // 34 | // The keys to erase are written first; the keys to reinstate are 35 | // written second. 36 | // 37 | // For each group, we first write out the length of the array encoded as a little-endian u32. 38 | // This is followed by the keys themselves, written contiguously as raw 32-byte values. 39 | // 40 | // Each reinstated key is followed by a little-endian u32 value length and the value bytes. 41 | 42 | // Sort the keys into two groups.
43 | let mut to_erase = Vec::with_capacity(self.priors.len()); 44 | let mut to_reinstate = Vec::with_capacity(self.priors.len()); 45 | for (key, value) in self.priors.iter() { 46 | match value { 47 | None => to_erase.push(key), 48 | Some(value) => to_reinstate.push((key, value)), 49 | } 50 | } 51 | 52 | let to_erase_len = to_erase.len() as u32; 53 | let mut buf = Vec::with_capacity(4 + 32 * to_erase.len()); 54 | buf.extend_from_slice(&to_erase_len.to_le_bytes()); 55 | for key in to_erase { 56 | buf.extend_from_slice(&key[..]); 57 | } 58 | 59 | let to_reinstate_len = to_reinstate.len() as u32; 60 | buf.extend_from_slice(&to_reinstate_len.to_le_bytes()); 61 | for (key, value) in to_reinstate { 62 | buf.extend_from_slice(&key[..]); 63 | let value_len = value.len() as u32; 64 | buf.extend_from_slice(&value_len.to_le_bytes()); 65 | buf.extend_from_slice(value); 66 | } 67 | 68 | buf 69 | } 70 | 71 | /// Decodes the delta from a buffer. 72 | pub(super) fn decode(reader: &mut Cursor<&mut Vec<u8>>) -> anyhow::Result<Self> { 73 | let mut priors = HashMap::new(); 74 | 75 | // Read the number of keys to erase. 76 | let mut buf = [0; 4]; 77 | reader.read_exact(&mut buf)?; 78 | let to_erase_len = u32::from_le_bytes(buf); 79 | // Read the keys to erase. 80 | for _ in 0..to_erase_len { 81 | let mut key_path = [0; 32]; 82 | reader.read_exact(&mut key_path)?; 83 | let preempted = priors.insert(key_path, None).is_some(); 84 | if preempted { 85 | anyhow::bail!("duplicate key path (erase): {:?}", key_path); 86 | } 87 | } 88 | 89 | // Read the number of keys to reinstate. 90 | reader.read_exact(&mut buf)?; 91 | let to_reinstate_len = u32::from_le_bytes(buf); 92 | // Read the keys to reinstate along with their values. 93 | for _ in 0..to_reinstate_len { 94 | // Read the key path. 95 | let mut key_path = [0; 32]; 96 | reader.read_exact(&mut key_path)?; 97 | // Read the value.
98 | let mut value = Vec::new(); 99 | reader.read_exact(&mut buf)?; 100 | let value_len = u32::from_le_bytes(buf); 101 | value.resize(value_len as usize, 0); 102 | reader.read_exact(&mut value)?; 103 | let preempted = priors.insert(key_path, Some(value)).is_some(); 104 | if preempted { 105 | anyhow::bail!("duplicate key path (reinstate): {:?}", key_path); 106 | } 107 | } 108 | Ok(Delta { priors }) 109 | } 110 | } 111 | 112 | #[cfg(test)] 113 | mod tests { 114 | use super::*; 115 | 116 | #[test] 117 | fn delta_roundtrip() { 118 | let mut delta = Delta::empty(); 119 | delta.priors.insert([1; 32], Some(b"value1".to_vec())); 120 | delta.priors.insert([2; 32], None); 121 | delta.priors.insert([3; 32], Some(b"value3".to_vec())); 122 | 123 | let mut buf = delta.encode(); 124 | let mut cursor = Cursor::new(&mut buf); 125 | let delta2 = Delta::decode(&mut cursor).unwrap(); 126 | assert_eq!(delta.priors, delta2.priors); 127 | } 128 | 129 | #[test] 130 | fn delta_roundtrip_empty() { 131 | let delta = Delta::empty(); 132 | let mut buf = delta.encode(); 133 | let mut cursor = Cursor::new(&mut buf); 134 | let delta2 = Delta::decode(&mut cursor).unwrap(); 135 | assert_eq!(delta.priors, delta2.priors); 136 | } 137 | } 138 | -------------------------------------------------------------------------------- /nomt/src/bitbox/ht_file.rs: -------------------------------------------------------------------------------- 1 | /// The HT file. 2 | /// 3 | /// The file that stores the hash-table buckets and the meta map. 4 | use super::meta_map::MetaMap; 5 | use crate::io::{self, PagePool, PAGE_SIZE}; 6 | use std::{ 7 | fs::{File, OpenOptions}, 8 | path::PathBuf, 9 | }; 10 | 11 | /// The offsets of the HT file. 12 | #[derive(Clone)] 13 | pub struct HTOffsets { 14 | // the number of pages to add to a page number to find its real location in the file, 15 | // taking account of the meta page and meta byte pages. 
16 | data_page_offset: u64, 17 | } 18 | 19 | impl HTOffsets { 20 | /// Returns the page number of the `ix`th item in the data section of the store. 21 | pub fn data_page_index(&self, ix: u64) -> u64 { 22 | self.data_page_offset + ix 23 | } 24 | 25 | /// Returns the page number of the `ix`th item in the meta bytes section of the store. 26 | pub fn meta_bytes_index(&self, ix: u64) -> u64 { 27 | ix 28 | } 29 | } 30 | 31 | fn expected_file_len(num_pages: u32) -> u64 { 32 | (num_meta_byte_pages(num_pages) + num_pages) as u64 * PAGE_SIZE as u64 33 | } 34 | 35 | fn num_meta_byte_pages(num_pages: u32) -> u32 { 36 | (num_pages + 4095) / PAGE_SIZE as u32 37 | } 38 | 39 | /// Opens the HT file, checks its length and reads the meta map. 40 | pub fn open( 41 | num_pages: u32, 42 | page_pool: &PagePool, 43 | ht_fd: &File, 44 | ) -> anyhow::Result<(HTOffsets, MetaMap)> { 45 | if ht_fd.metadata()?.len() != expected_file_len(num_pages) { 46 | anyhow::bail!("Store corrupted; unexpected file length"); 47 | } 48 | 49 | let num_meta_byte_pages = num_meta_byte_pages(num_pages); 50 | let mut meta_bytes = Vec::with_capacity(num_meta_byte_pages as usize * PAGE_SIZE); 51 | for pn in 0..num_meta_byte_pages { 52 | let extra_meta_page = io::read_page(page_pool, ht_fd, pn as u64)?; 53 | meta_bytes.extend_from_slice(&*extra_meta_page); 54 | } 55 | 56 | let data_page_offset = num_meta_byte_pages as u64; 57 | Ok(( 58 | HTOffsets { data_page_offset }, 59 | MetaMap::from_bytes(meta_bytes, num_pages as usize), 60 | )) 61 | } 62 | 63 | /// Creates the store file. Fails if store file already exists. 64 | /// 65 | /// Lays out the meta page. If `preallocate` is true, preallocates the blocks for the file. 
66 | pub fn create(path: PathBuf, num_pages: u32, preallocate: bool) -> std::io::Result<()> { 67 | let start = std::time::Instant::now(); 68 | let ht_path = path.join("ht"); 69 | let ht_file = OpenOptions::new().write(true).create(true).open(ht_path)?; 70 | 71 | // number of pages + pages required for meta bits. 72 | let page_count = num_pages + num_meta_byte_pages(num_pages); 73 | let len = page_count as usize * PAGE_SIZE; 74 | 75 | resize_and_prealloc(&ht_file, len as u64, preallocate)?; 76 | 77 | ht_file.sync_all()?; 78 | drop(ht_file); 79 | 80 | let wal_path = path.join("wal"); 81 | let wal_file = OpenOptions::new().write(true).create(true).open(wal_path)?; 82 | wal_file.sync_all()?; 83 | drop(wal_file); 84 | 85 | println!( 86 | "Created file with {} total pages in {}ms", 87 | page_count, 88 | start.elapsed().as_millis() 89 | ); 90 | Ok(()) 91 | } 92 | 93 | /// Sets the file size and attempts to preallocate the file if `preallocate` is true. 94 | /// 95 | /// Returns an error if setting the file size fails. File preallocation is done on a best-effort basis 96 | /// and may silently fall back to regular allocation. 97 | /// 98 | /// After this call, if successful, the file size is set to `len` bytes. 99 | fn resize_and_prealloc(ht_file: &File, len: u64, preallocate: bool) -> std::io::Result<()> { 100 | if !preallocate { 101 | // If not preallocating, just set the file size and return. 102 | ht_file.set_len(len)?; 103 | return Ok(()); 104 | } 105 | 106 | cfg_if::cfg_if! { 107 | if #[cfg(target_os = "linux")] { 108 | // To preallocate on Linux systems, try using fallocate with ZERO_RANGE first as it's more 109 | // efficient. fallocate sets the file size as well, so ftruncate (aka file.set_len()) is 110 | // not needed. 111 | if crate::sys::linux::fs_check(ht_file).map_or(false, |fsck| fsck.is_tmpfs()) { 112 | // Skip preallocation for tmpfs. It doesn't support fallocate and it's 113 | // memory-backed anyway. ftruncate and bail. 
114 | ht_file.set_len(len)?; 115 | return Ok(()); 116 | } 117 | if let Err(_) = crate::sys::linux::falloc_zero_file(ht_file, len) { 118 | // If fallocate fails, fall back to zeroing the file with write. 119 | resize_and_zero_file(ht_file, len)?; 120 | } 121 | } else { 122 | resize_and_zero_file(ht_file, len)?; 123 | } 124 | } 125 | 126 | Ok(()) 127 | } 128 | 129 | // Fallback method for allocating extents for the file: just incrementally write zeroes to the file. 130 | fn resize_and_zero_file(mut file: &File, len: u64) -> std::io::Result<()> { 131 | use std::io::Write; 132 | 133 | // Set the file size first. 134 | file.set_len(len)?; 135 | 136 | // Zero the file. 137 | let len = len as usize; 138 | let buf = [0u8; PAGE_SIZE * 4]; 139 | let mut remaining = len; 140 | while remaining > 0 { 141 | let len = std::cmp::min(remaining, buf.len()); 142 | file.write_all(&buf[..len])?; 143 | remaining -= len; 144 | } 145 | Ok(()) 146 | } 147 | -------------------------------------------------------------------------------- /benchtop/src/backend.rs: -------------------------------------------------------------------------------- 1 | use crate::{nomt::NomtDB, timer::Timer, workload::Workload}; 2 | 3 | #[cfg(feature = "sov-db")] 4 | use crate::sov_db::SovDB; 5 | 6 | #[cfg(feature = "sp-trie")] 7 | use crate::sp_trie::SpTrieDB; 8 | 9 | #[derive(Debug, Clone, clap::ValueEnum)] 10 | pub enum Backend { 11 | SovDB, 12 | Nomt, 13 | SpTrie, 14 | } 15 | 16 | impl Backend { 17 | pub fn all_backends() -> Vec<Backend> { 18 | vec![Backend::SovDB, Backend::SpTrie, Backend::Nomt] 19 | } 20 | 21 | // If reset is true, then erase any previous backend's database 22 | // and restart from an empty database. 23 | // Otherwise, use the already present database.
24 | pub fn instantiate( 25 | &self, 26 | reset: bool, 27 | commit_concurrency: usize, 28 | io_workers: usize, 29 | hashtable_buckets: Option<u32>, 30 | page_cache_size: Option<usize>, 31 | leaf_cache_size: Option<usize>, 32 | page_cache_upper_levels: usize, 33 | prepopulate_page_cache: bool, 34 | overlay_window_length: usize, 35 | ) -> DB { 36 | match self { 37 | Backend::SovDB => { 38 | #[cfg(not(feature = "sov-db"))] 39 | panic!("benchtop not compiled with feature sov-db. rebuild"); 40 | #[cfg(feature = "sov-db")] 41 | DB::Sov(SovDB::open(reset)) 42 | } 43 | Backend::Nomt => DB::Nomt(NomtDB::open( 44 | reset, 45 | commit_concurrency, 46 | io_workers, 47 | hashtable_buckets, 48 | page_cache_size, 49 | leaf_cache_size, 50 | page_cache_upper_levels, 51 | prepopulate_page_cache, 52 | overlay_window_length, 53 | )), 54 | Backend::SpTrie => { 55 | #[cfg(not(feature = "sp-trie"))] 56 | panic!("benchtop not compiled with feature sp-trie. rebuild"); 57 | #[cfg(feature = "sp-trie")] 58 | DB::SpTrie(SpTrieDB::open(reset)) 59 | } 60 | } 61 | } 62 | } 63 | 64 | /// A transaction over the database which allows reading and writing. 65 | pub trait Transaction { 66 | /// Read a value from the database. If a value was previously written, return that. 67 | fn read(&mut self, key: &[u8]) -> Option<Vec<u8>>; 68 | 69 | /// Note that a value was read from a cache, for inclusion in a storage proof. 70 | fn note_read(&mut self, key: &[u8], value: Option<Vec<u8>>); 71 | 72 | /// Write a value to the database. `None` means to delete the previous value. 73 | fn write(&mut self, key: &[u8], value: Option<&[u8]>); 74 | } 75 | 76 | /// A wrapper around all databases implemented in this tool. 77 | pub enum DB { 78 | #[cfg(feature = "sov-db")] 79 | Sov(SovDB), 80 | #[cfg(feature = "sp-trie")] 81 | SpTrie(SpTrieDB), 82 | Nomt(NomtDB), 83 | } 84 | 85 | impl DB { 86 | /// Execute a workload repeatedly until done or a time limit is reached.
87 | pub fn execute( 88 | &mut self, 89 | mut timer: Option<&mut Timer>, 90 | workload: &mut dyn Workload, 91 | timeout: Option<std::time::Instant>, 92 | ) { 93 | while !workload.is_done() { 94 | if timeout 95 | .as_ref() 96 | .map_or(false, |t| std::time::Instant::now() > *t) 97 | { 98 | break; 99 | } 100 | let timer = timer.as_deref_mut(); 101 | match self { 102 | #[cfg(feature = "sov-db")] 103 | DB::Sov(db) => db.execute(timer, workload), 104 | #[cfg(feature = "sp-trie")] 105 | DB::SpTrie(db) => db.execute(timer, workload), 106 | DB::Nomt(db) => db.execute(timer, workload), 107 | } 108 | } 109 | } 110 | 111 | /// Execute several workloads in parallel, repeatedly, until all done or a time limit is reached. 112 | /// 113 | /// Only works with the NOMT backend. 114 | pub fn parallel_execute( 115 | &mut self, 116 | mut timer: Option<&mut Timer>, 117 | thread_pool: &rayon::ThreadPool, 118 | workloads: &mut [Box<dyn Workload>], 119 | timeout: Option<std::time::Instant>, 120 | ) -> anyhow::Result<()> { 121 | while workloads.iter().any(|w| !w.is_done()) { 122 | if timeout 123 | .as_ref() 124 | .map_or(false, |t| std::time::Instant::now() > *t) 125 | { 126 | break; 127 | } 128 | let timer = timer.as_deref_mut(); 129 | match self { 130 | #[cfg(feature = "sov-db")] 131 | DB::Sov(_) => { 132 | anyhow::bail!("parallel execution is only supported with the NOMT backend.") 133 | } 134 | #[cfg(feature = "sp-trie")] 135 | DB::SpTrie(_) => { 136 | anyhow::bail!("parallel execution is only supported with the NOMT backend.") 137 | } 138 | DB::Nomt(db) => db.parallel_execute(timer, thread_pool, workloads), 139 | } 140 | } 141 | 142 | Ok(()) 143 | } 144 | 145 | /// Print metrics collected by the Backend if it supports metrics collection 146 | pub fn print_metrics(&self) { 147 | match self { 148 | DB::Nomt(db) => db.print_metrics(), 149 | #[cfg(any(feature = "sp-trie", feature = "sov-db"))] 150 | _ => (), 151 | } 152 | } 153 | } 154 | -------------------------------------------------------------------------------- /nomt/src/metrics.rs:
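The `Transaction` trait above is object-safe (it is consumed as `&mut dyn Transaction`), which makes it easy to mock. The sketch below restates the trait and implements it over a plain `HashMap`; `MemTx` is a hypothetical in-memory stand-in for illustration, not part of benchtop. It shows the delete-on-`None` convention of `write`:

```rust
use std::collections::HashMap;

// Restatement of benchtop's `Transaction` trait for illustration.
trait Transaction {
    fn read(&mut self, key: &[u8]) -> Option<Vec<u8>>;
    fn note_read(&mut self, key: &[u8], value: Option<Vec<u8>>);
    fn write(&mut self, key: &[u8], value: Option<&[u8]>);
}

// Hypothetical in-memory backend: a HashMap standing in for a real database.
#[derive(Default)]
struct MemTx {
    data: HashMap<Vec<u8>, Vec<u8>>,
    noted: Vec<(Vec<u8>, Option<Vec<u8>>)>,
}

impl Transaction for MemTx {
    fn read(&mut self, key: &[u8]) -> Option<Vec<u8>> {
        self.data.get(key).cloned()
    }
    fn note_read(&mut self, key: &[u8], value: Option<Vec<u8>>) {
        // Record the cached read for a would-be storage proof.
        self.noted.push((key.to_vec(), value));
    }
    fn write(&mut self, key: &[u8], value: Option<&[u8]>) {
        match value {
            Some(v) => {
                self.data.insert(key.to_vec(), v.to_vec());
            }
            // `None` means delete the previous value.
            None => {
                self.data.remove(key);
            }
        }
    }
}

fn main() {
    let mut tx = MemTx::default();
    tx.write(b"alice", Some(b"1000"));
    assert_eq!(tx.read(b"alice"), Some(b"1000".to_vec()));
    tx.write(b"alice", None);
    assert_eq!(tx.read(b"alice"), None);
    tx.note_read(b"bob", None);
    assert_eq!(tx.noted.len(), 1);
}
```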
-------------------------------------------------------------------------------- 1 | use std::sync::{ 2 | atomic::{AtomicU64, Ordering}, 3 | Arc, 4 | }; 5 | 6 | /// Metrics collector; if active, it provides Counters and Timers 7 | #[derive(Clone)] 8 | pub struct Metrics { 9 | metrics: Option<Arc<ActiveMetrics>>, 10 | } 11 | 12 | /// Metrics that can be collected during execution 13 | #[derive(PartialEq, Eq, Hash)] 14 | pub enum Metric { 15 | /// Counter of total page requests 16 | PageRequests, 17 | /// Counter of page request cache misses over all page requests 18 | PageCacheMisses, 19 | /// Timer used to record average page fetch time 20 | PageFetchTime, 21 | /// Timer used to record average value fetch time during reads 22 | ValueFetchTime, 23 | } 24 | 25 | struct ActiveMetrics { 26 | page_requests: AtomicU64, 27 | page_cache_misses: AtomicU64, 28 | page_fetch_time: Timer, 29 | value_fetch_time: Timer, 30 | } 31 | 32 | impl Metrics { 33 | /// Returns the Metrics object, active or not based on the specified input 34 | pub fn new(active: bool) -> Self { 35 | Self { 36 | metrics: if active { 37 | Some(Arc::new(ActiveMetrics { 38 | page_requests: AtomicU64::new(0), 39 | page_cache_misses: AtomicU64::new(0), 40 | page_fetch_time: Timer::new(), 41 | value_fetch_time: Timer::new(), 42 | })) 43 | } else { 44 | None 45 | }, 46 | } 47 | } 48 | 49 | /// Increase the Counter specified by the input 50 | /// 51 | /// Panics if the specified [`Metric`] is not a Counter 52 | pub fn count(&self, metric: Metric) { 53 | if let Some(ref metrics) = self.metrics { 54 | let counter = match metric { 55 | Metric::PageRequests => &metrics.page_requests, 56 | Metric::PageCacheMisses => &metrics.page_cache_misses, 57 | _ => panic!("Specified metric is not a Counter"), 58 | }; 59 | 60 | counter.fetch_add(1, Ordering::Relaxed); 61 | } 62 | } 63 | 64 | /// Returns a guard that, when dropped, will record the time passed since creation 65 | /// 66 | /// Panics if the specified [`Metric`] is not a Timer 67 | pub
fn record<'a>(&'a self, metric: Metric) -> Option<impl Drop + 'a> { 68 | self.metrics.as_ref().and_then(|metrics| { 69 | let timer = match metric { 70 | Metric::PageFetchTime => &metrics.page_fetch_time, 71 | Metric::ValueFetchTime => &metrics.value_fetch_time, 72 | _ => panic!("Specified metric is not a Timer"), 73 | }; 74 | 75 | Some(timer.record()) 76 | }) 77 | } 78 | 79 | /// Print collected metrics to stdout 80 | pub fn print(&self) { 81 | if let Some(ref metrics) = self.metrics { 82 | println!("metrics"); 83 | 84 | let tot_page_requests = metrics.page_requests.load(Ordering::Relaxed); 85 | println!(" page requests {}", tot_page_requests); 86 | 87 | if tot_page_requests != 0 { 88 | let cache_misses = metrics.page_cache_misses.load(Ordering::Relaxed); 89 | let percentage_cache_misses = 90 | (cache_misses as f64 / tot_page_requests as f64) * 100.0; 91 | 92 | println!( 93 | " page cache misses {} - {:.2}% of page requests", 94 | cache_misses, percentage_cache_misses 95 | ); 96 | } 97 | 98 | if let Some(mean) = metrics.page_fetch_time.mean() { 99 | println!(" page fetch mean {}", pretty_display_ns(mean)); 100 | } 101 | 102 | if let Some(mean) = metrics.value_fetch_time.mean() { 103 | println!(" value fetch mean {}", pretty_display_ns(mean)); 104 | } 105 | } else { 106 | println!("Metrics collection was not activated") 107 | } 108 | } 109 | } 110 | 111 | fn pretty_display_ns(ns: u64) -> String { 112 | // preserve 3 sig figs at minimum.
113 | let (val, unit) = if ns > 100 * 1_000_000_000 { 114 | (ns / 1_000_000_000, "s") 115 | } else if ns > 100 * 1_000_000 { 116 | (ns / 1_000_000, "ms") 117 | } else if ns > 100 * 1_000 { 118 | (ns / 1_000, "us") 119 | } else { 120 | (ns, "ns") 121 | }; 122 | 123 | format!("{val} {unit}") 124 | } 125 | 126 | struct Timer { 127 | number_of_records: AtomicU64, 128 | sum: AtomicU64, 129 | } 130 | 131 | impl Timer { 132 | fn new() -> Self { 133 | Timer { 134 | number_of_records: AtomicU64::new(0), 135 | sum: AtomicU64::new(0), 136 | } 137 | } 138 | 139 | fn mean(&self) -> Option<u64> { 140 | let n = self.number_of_records.load(Ordering::Relaxed); 141 | let sum = self.sum.load(Ordering::Relaxed); 142 | sum.checked_div(n) 143 | } 144 | 145 | fn record<'a>(&'a self) -> impl Drop + 'a { 146 | struct TimerGuard<'a> { 147 | start: std::time::Instant, 148 | n: &'a AtomicU64, 149 | sum: &'a AtomicU64, 150 | } 151 | 152 | impl Drop for TimerGuard<'_> { 153 | fn drop(&mut self) { 154 | let elapsed = self.start.elapsed().as_nanos() as u64; 155 | self.n.fetch_add(1, Ordering::Relaxed); 156 | self.sum.fetch_add(elapsed, Ordering::Relaxed); 157 | } 158 | } 159 | 160 | TimerGuard { 161 | start: std::time::Instant::now(), 162 | n: &self.number_of_records, 163 | sum: &self.sum, 164 | } 165 | } 166 | } 167 | -------------------------------------------------------------------------------- /benchtop/src/transfer_workload.rs: -------------------------------------------------------------------------------- 1 | use crate::{ 2 | backend::Transaction, 3 | cli::StateItemDistribution, 4 | workload::{Distribution, Workload}, 5 | }; 6 | use rand::Rng; 7 | 8 | #[derive(Clone)] 9 | pub struct TransferInit { 10 | cur_account: u64, 11 | num_accounts: u64, 12 | } 13 | 14 | impl Workload for TransferInit { 15 | fn run_step(&mut self, transaction: &mut dyn Transaction) { 16 | const MAX_INIT_PER_ITERATION: u64 = 64 * 1024; 17 | 18 | if self.num_accounts == 0 { 19 | return; 20 | } 21 | 22 | let count =
std::cmp::min(self.num_accounts - self.cur_account, MAX_INIT_PER_ITERATION); 23 | for _ in 0..count { 24 | transaction.write(&encode_id(self.cur_account), Some(&encode_balance(1000))); 25 | self.cur_account += 1; 26 | } 27 | println!( 28 | "populating {:.1}%", 29 | 100.0 * (self.cur_account as f64) / (self.num_accounts as f64) 30 | ); 31 | } 32 | 33 | fn is_done(&self) -> bool { 34 | self.cur_account == self.num_accounts 35 | } 36 | } 37 | 38 | /// Create an initialization command for a transfer database. 39 | pub fn init(num_accounts: u64) -> TransferInit { 40 | TransferInit { 41 | cur_account: 0, 42 | num_accounts, 43 | } 44 | } 45 | 46 | fn encode_id(id: u64) -> [u8; 8] { 47 | id.to_be_bytes() 48 | } 49 | 50 | fn encode_balance(balance: u64) -> [u8; 8] { 51 | balance.to_be_bytes() 52 | } 53 | 54 | fn decode_balance(encoded: &[u8]) -> u64 { 55 | let mut buf = [0; 8]; 56 | buf.copy_from_slice(encoded); 57 | u64::from_be_bytes(buf) 58 | } 59 | 60 | /// Build a new workload meant to emulate transfers. 61 | /// 62 | /// `num_accounts` refers to the amount of accounts in the database. 63 | /// 64 | /// `percentage_cold_transfer` ranges from 0 to 100 and indicates the proportion of transfers 65 | /// which should be sent to a fresh account. 
66 | pub fn build( 67 | num_accounts: u64, 68 | workload_size: u64, 69 | percentage_cold_transfer: u8, 70 | op_limit: u64, 71 | threads: usize, 72 | distribution: StateItemDistribution, 73 | ) -> Vec<TransferWorkload> { 74 | let thread_workload_size = workload_size / threads as u64; 75 | let num_accounts_step = num_accounts / threads as u64; 76 | 77 | (0..threads) 78 | .map(|i| { 79 | let start_account = num_accounts_step * i as u64; 80 | let end_account = if i == threads - 1 { 81 | num_accounts 82 | } else { 83 | num_accounts_step * (i as u64 + 1) 84 | }; 85 | TransferWorkload { 86 | num_accounts, 87 | workload_size: thread_workload_size, 88 | percentage_cold_transfer, 89 | ops_remaining: op_limit / threads as u64, 90 | distribution: Distribution::new(distribution, start_account, end_account), 91 | } 92 | }) 93 | .collect() 94 | } 95 | 96 | /// A transfer-like workload. 97 | pub struct TransferWorkload { 98 | /// The number of accounts in the system. 99 | pub num_accounts: u64, 100 | /// The size of the workload. 101 | pub workload_size: u64, 102 | /// The percentage of transfers to make to fresh accounts. 103 | pub percentage_cold_transfer: u8, 104 | /// The number of remaining operations before being considered 'done'. 105 | pub ops_remaining: u64, 106 | /// The random distribution to use to sample state items.
107 | pub distribution: Distribution, 108 | } 109 | 110 | impl Workload for TransferWorkload { 111 | fn run_step(&mut self, transaction: &mut dyn Transaction) { 112 | let cold_sends = 113 | (self.workload_size as f64 * (self.percentage_cold_transfer as f64 / 100.0)) as u64; 114 | let warm_sends = self.workload_size - cold_sends; 115 | 116 | let mut rng = rand::thread_rng(); 117 | for i in 0..self.workload_size { 118 | let send_account = self.distribution.sample(&mut rng); 119 | let recv_account = if i < warm_sends { 120 | let mut r = self.distribution.sample(&mut rng); 121 | while r == send_account { 122 | r = self.distribution.sample(&mut rng); 123 | } 124 | r 125 | } else { 126 | // odds of two threads generating the same random account here are 127 | // incredibly low. 128 | rng.gen_range(self.num_accounts..u64::max_value()) 129 | }; 130 | 131 | let send_balance = decode_balance( 132 | &transaction 133 | .read(&encode_id(send_account)) 134 | .expect("account exists"), 135 | ); 136 | let recv_balance = transaction 137 | .read(&encode_id(recv_account)) 138 | .map_or(0, |v| decode_balance(&v)); 139 | 140 | let new_send_balance = if send_balance == 0 { 141 | 1000 // yay, free money. 142 | } else { 143 | send_balance - 1 144 | }; 145 | let new_recv_balance = recv_balance + 1; 146 | 147 | transaction.write( 148 | &encode_id(send_account), 149 | Some(&encode_balance(new_send_balance)), 150 | ); 151 | transaction.write( 152 | &encode_id(recv_account), 153 | Some(&encode_balance(new_recv_balance)), 154 | ); 155 | } 156 | 157 | self.ops_remaining = self.ops_remaining.saturating_sub(self.workload_size); 158 | } 159 | 160 | fn is_done(&self) -> bool { 161 | self.ops_remaining == 0 162 | } 163 | } 164 | -------------------------------------------------------------------------------- /core/src/hasher.rs: -------------------------------------------------------------------------------- 1 | //! Hashers (feature-gated) and utilities for implementing them. 
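The cold/warm split and the balance rules in `TransferWorkload::run_step` above can be isolated into a small standalone sketch. `split_sends` and `new_send_balance` are illustrative names introduced here, not benchtop functions; they restate the arithmetic from `run_step`:

```rust
// Split a workload into cold sends (to fresh accounts) and warm sends,
// mirroring the arithmetic in `TransferWorkload::run_step`.
fn split_sends(workload_size: u64, percentage_cold: u8) -> (u64, u64) {
    let cold = (workload_size as f64 * (percentage_cold as f64 / 100.0)) as u64;
    (cold, workload_size - cold)
}

// The sender balance rule: a zero balance is topped back up to 1000
// ("free money") instead of underflowing.
fn new_send_balance(balance: u64) -> u64 {
    if balance == 0 {
        1000
    } else {
        balance - 1
    }
}

fn main() {
    // 50% of 64 sends are cold, the rest warm.
    assert_eq!(split_sends(64, 50), (32, 32));
    assert_eq!(new_send_balance(0), 1000);
    assert_eq!(new_send_balance(500), 499);
}
```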
2 | 3 | use crate::trie::{InternalData, LeafData, Node, NodeKind, TERMINATOR}; 4 | 5 | /// A trie node hash function specialized for 64 bytes of data. 6 | /// 7 | /// Note that it is illegal for the produced hash to equal [0; 32], as this value is reserved 8 | /// for the terminator node. 9 | /// 10 | /// A node hasher should domain-separate internal and leaf nodes in some specific way. The 11 | /// recommended approach for binary hashes is to set the MSB to 0 or 1 depending on the node kind. 12 | /// However, for other kinds of hashes (e.g. Poseidon2 or other algebraic hashes), other labeling 13 | /// schemes may be required. 14 | pub trait NodeHasher { 15 | /// Hash a leaf. This should domain-separate the hash 16 | /// according to the node kind. 17 | fn hash_leaf(data: &LeafData) -> [u8; 32]; 18 | 19 | /// Hash an internal node. This should domain-separate 20 | /// the hash according to the node kind. 21 | fn hash_internal(data: &InternalData) -> [u8; 32]; 22 | 23 | /// Get the kind of the given node. 24 | fn node_kind(node: &Node) -> NodeKind; 25 | } 26 | 27 | /// A hasher for arbitrary-length values. 28 | pub trait ValueHasher { 29 | /// Hash an arbitrary-length value. 30 | fn hash_value(value: &[u8]) -> [u8; 32]; 31 | } 32 | 33 | /// Get the node kind, according to a most-significant bit labeling scheme. 34 | /// 35 | /// If the MSB is true, it's a leaf. If the node is empty, it's a [`TERMINATOR`]. Otherwise, it's 36 | /// an internal node. 37 | pub fn node_kind_by_msb(node: &Node) -> NodeKind { 38 | if node[0] >> 7 == 1 { 39 | NodeKind::Leaf 40 | } else if node == &TERMINATOR { 41 | NodeKind::Terminator 42 | } else { 43 | NodeKind::Internal 44 | } 45 | } 46 | 47 | /// Set the most-significant bit of the node. 48 | pub fn set_msb(node: &mut Node) { 49 | node[0] |= 0b10000000; 50 | } 51 | 52 | pub fn unset_msb(node: &mut Node) { 53 | node[0] &= 0b01111111; 54 | } 55 | 56 | /// A simple trait for representing binary hash functions. 
57 | pub trait BinaryHash { 58 | /// Given a byte string, produce a 32-byte hash. 59 | fn hash(input: &[u8]) -> [u8; 32]; 60 | 61 | /// An optional specialization of `hash` where there are two 32-byte inputs, left and right. 62 | fn hash2_32_concat(left: &[u8; 32], right: &[u8; 32]) -> [u8; 32] { 63 | let mut buf = [0u8; 64]; 64 | buf[0..32].copy_from_slice(left); 65 | buf[32..64].copy_from_slice(right); 66 | Self::hash(&buf) 67 | } 68 | } 69 | 70 | /// A node and value hasher constructed from a simple binary hasher. 71 | /// 72 | /// This implements a [`ValueHasher`] and [`NodeHasher`] where the node kind is tagged by setting 73 | /// or unsetting the MSB of the hash value. 74 | /// 75 | /// The binary hash wrapped by this structure must behave approximately like a random oracle over 76 | /// the space 2^256, i.e. all 256 bit outputs are valid and inputs are uniformly distributed. 77 | /// 78 | /// Functions like Sha2/Blake3/Keccak/Groestl all meet these criteria. 79 | pub struct BinaryHasher<H>(std::marker::PhantomData<H>); 80 | 81 | impl<H: BinaryHash> ValueHasher for BinaryHasher<H> { 82 | fn hash_value(value: &[u8]) -> [u8; 32] { 83 | H::hash(value) 84 | } 85 | } 86 | 87 | impl<H: BinaryHash> NodeHasher for BinaryHasher<H> { 88 | fn hash_leaf(data: &LeafData) -> [u8; 32] { 89 | let mut h = H::hash2_32_concat(&data.key_path, &data.value_hash); 90 | set_msb(&mut h); 91 | h 92 | } 93 | 94 | fn hash_internal(data: &InternalData) -> [u8; 32] { 95 | let mut h = H::hash2_32_concat(&data.left, &data.right); 96 | unset_msb(&mut h); 97 | h 98 | } 99 | 100 | fn node_kind(node: &Node) -> NodeKind { 101 | node_kind_by_msb(node) 102 | } 103 | } 104 | 105 | #[cfg(any(feature = "blake3-hasher", test))] 106 | pub use blake3::Blake3Hasher; 107 | 108 | /// A node hasher making use of blake3. 109 | #[cfg(any(feature = "blake3-hasher", test))] 110 | pub mod blake3 { 111 | use super::{BinaryHash, BinaryHasher}; 112 | 113 | /// A [`BinaryHash`] implementation for Blake3.
114 | pub struct Blake3BinaryHasher; 115 | 116 | /// A wrapper around Blake3 for use in NOMT. 117 | pub type Blake3Hasher = BinaryHasher<Blake3BinaryHasher>; 118 | 119 | impl BinaryHash for Blake3BinaryHasher { 120 | fn hash(value: &[u8]) -> [u8; 32] { 121 | blake3::hash(value).into() 122 | } 123 | 124 | fn hash2_32_concat(left: &[u8; 32], right: &[u8; 32]) -> [u8; 32] { 125 | let mut hasher = blake3::Hasher::new(); 126 | hasher.update(left); 127 | hasher.update(right); 128 | hasher.finalize().into() 129 | } 130 | } 131 | } 132 | 133 | #[cfg(feature = "sha2-hasher")] 134 | pub use sha2::Sha2Hasher; 135 | 136 | /// A node and value hasher making use of sha2-256. 137 | #[cfg(feature = "sha2-hasher")] 138 | pub mod sha2 { 139 | use super::{BinaryHash, BinaryHasher}; 140 | use sha2::{Digest, Sha256}; 141 | 142 | /// A [`BinaryHash`] implementation for Sha2. 143 | pub struct Sha2BinaryHasher; 144 | 145 | /// A wrapper around sha2-256 for use in NOMT. 146 | pub type Sha2Hasher = BinaryHasher<Sha2BinaryHasher>; 147 | 148 | impl BinaryHash for Sha2BinaryHasher { 149 | fn hash(value: &[u8]) -> [u8; 32] { 150 | let mut hasher = Sha256::new(); 151 | hasher.update(value); 152 | hasher.finalize().into() 153 | } 154 | 155 | fn hash2_32_concat(left: &[u8; 32], right: &[u8; 32]) -> [u8; 32] { 156 | let mut hasher = Sha256::new(); 157 | hasher.update(left); 158 | hasher.update(right); 159 | hasher.finalize().into() 160 | } 161 | } 162 | } 163 | -------------------------------------------------------------------------------- /nomt/src/bitbox/wal/read.rs: -------------------------------------------------------------------------------- 1 | //! The read-path for the WAL.
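The MSB domain-separation scheme from `core/src/hasher.rs` above can be exercised in isolation. This sketch restates the relevant types and helpers locally so it compiles on its own; it is not the crate itself:

```rust
// Restated MSB labeling helpers from `core/src/hasher.rs`.
type Node = [u8; 32];
const TERMINATOR: Node = [0; 32];

#[derive(Debug, PartialEq)]
enum NodeKind {
    Terminator,
    Leaf,
    Internal,
}

// If the MSB is set, the node is a leaf; the all-zero node is the
// terminator; anything else is an internal node.
fn node_kind_by_msb(node: &Node) -> NodeKind {
    if node[0] >> 7 == 1 {
        NodeKind::Leaf
    } else if node == &TERMINATOR {
        NodeKind::Terminator
    } else {
        NodeKind::Internal
    }
}

fn set_msb(node: &mut Node) {
    node[0] |= 0b1000_0000;
}

fn unset_msb(node: &mut Node) {
    node[0] &= 0b0111_1111;
}

fn main() {
    let mut node = [0x42u8; 32];
    set_msb(&mut node); // tag as leaf
    assert_eq!(node_kind_by_msb(&node), NodeKind::Leaf);
    unset_msb(&mut node); // tag as internal
    assert_eq!(node_kind_by_msb(&node), NodeKind::Internal);
    assert_eq!(node_kind_by_msb(&TERMINATOR), NodeKind::Terminator);
}
```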
2 | 3 | use super::{WAL_ENTRY_TAG_CLEAR, WAL_ENTRY_TAG_END, WAL_ENTRY_TAG_START, WAL_ENTRY_TAG_UPDATE}; 4 | use crate::{ 5 | io::{self, PagePool, PAGE_SIZE}, 6 | page_diff::PageDiff, 7 | }; 8 | use anyhow::bail; 9 | use std::{fs::File, io::Seek}; 10 | 11 | #[derive(Debug, PartialEq, Eq)] 12 | pub enum WalEntry { 13 | Update { 14 | /// The unique identifier of the page being updated. 15 | page_id: [u8; 32], 16 | /// A bitmap where each bit indicates whether the node at the corresponding index was 17 | /// changed by this update. 18 | page_diff: PageDiff, 19 | /// Nodes that were changed by this update. The length of this array must be consistent with 20 | /// the number of ones in `page_diff`. 21 | changed_nodes: Vec<[u8; 32]>, 22 | /// The bucket index which is being updated. 23 | bucket: u64, 24 | }, 25 | Clear { 26 | /// The bucket index which is being cleared. 27 | bucket: u64, 28 | }, 29 | } 30 | 31 | pub struct WalBlobReader { 32 | wal: Vec<u8>, 33 | offset: usize, 34 | sync_seqn: u32, 35 | } 36 | 37 | impl WalBlobReader { 38 | /// Creates a new WAL blob reader. 39 | /// 40 | /// The `wal_fd` is expected to be positioned at the start of the WAL file. The file size must be 41 | /// a multiple of the page size. 42 | pub fn new(page_pool: &PagePool, mut wal_fd: &File) -> anyhow::Result<Self> { 43 | let stat = wal_fd.metadata()?; 44 | let file_size = stat.len() as usize; 45 | if file_size % PAGE_SIZE != 0 { 46 | anyhow::bail!("WAL file size is not a multiple of the page size"); 47 | } 48 | 49 | wal_fd.seek(std::io::SeekFrom::Start(0))?; 50 | 51 | // Read the entire WAL file into memory. We do it page-by-page because the WAL fd is opened 52 | // with the O_DIRECT flag, which means we need to provide aligned buffers.
53 | let mut wal = Vec::with_capacity(file_size); 54 | let mut pn = 0; 55 | loop { 56 | let page = match io::read_page(page_pool, wal_fd, pn) { 57 | Ok(page) => page, 58 | Err(e) if e.kind() == std::io::ErrorKind::UnexpectedEof => break, 59 | Err(e) => return Err(e.into()), 60 | }; 61 | pn += 1; 62 | wal.extend_from_slice(&*page); 63 | } 64 | 65 | let mut reader = Self { 66 | wal, 67 | offset: 0, 68 | sync_seqn: 0, 69 | }; 70 | reader.read_start()?; 71 | 72 | Ok(reader) 73 | } 74 | 75 | /// Get the sync sequence number of the WAL file. 76 | pub fn sync_seqn(&self) -> u32 { 77 | self.sync_seqn 78 | } 79 | 80 | /// Reads the next entry from the WAL file. 81 | /// 82 | /// Returns `None` if the end of the file is reached. 83 | pub fn read_entry(&mut self) -> anyhow::Result<Option<WalEntry>> { 84 | let entry_tag = self.read_byte()?; 85 | match entry_tag { 86 | WAL_ENTRY_TAG_END => Ok(None), 87 | WAL_ENTRY_TAG_CLEAR => { 88 | let bucket = self.read_u64()?; 89 | Ok(Some(WalEntry::Clear { bucket })) 90 | } 91 | WAL_ENTRY_TAG_UPDATE => { 92 | let page_id: [u8; 32] = self.read_buf()?; 93 | let page_diff: [u8; 16] = self.read_buf()?; 94 | let page_diff = PageDiff::from_bytes(page_diff) 95 | .ok_or_else(|| anyhow::anyhow!("Invalid page diff"))?; 96 | 97 | let changed_count = page_diff.count(); 98 | let mut changed_nodes = Vec::with_capacity(changed_count); 99 | for _ in 0..changed_count { 100 | let node = self.read_buf::<32>()?; 101 | changed_nodes.push(node); 102 | } 103 | 104 | let bucket = self.read_u64()?; 105 | 106 | Ok(Some(WalEntry::Update { 107 | page_id, 108 | page_diff, 109 | changed_nodes, 110 | bucket, 111 | })) 112 | } 113 | _ => bail!("unknown WAL entry tag: {entry_tag}"), 114 | } 115 | } 116 | 117 | fn read_start(&mut self) -> anyhow::Result<()> { 118 | let entry_tag = self.read_byte()?; 119 | if entry_tag == WAL_ENTRY_TAG_START { 120 | self.sync_seqn = self.read_u32()?; 121 | 122 | Ok(()) 123 | } else { 124 | bail!("unexpected WAL entry tag at start: {entry_tag}"); 125 | } 126
| } 127 | 128 | /// Reads a single byte from the WAL file. 129 | fn read_byte(&mut self) -> anyhow::Result<u8> { 130 | if self.offset >= self.wal.len() { 131 | bail!("Unexpected end of WAL file"); 132 | } 133 | let byte = self.wal[self.offset]; 134 | self.offset += 1; 135 | Ok(byte) 136 | } 137 | 138 | /// Reads a [u8; N] array from the WAL file. 139 | fn read_buf<const N: usize>(&mut self) -> anyhow::Result<[u8; N]> { 140 | if self.offset + N > self.wal.len() { 141 | bail!("Unexpected end of WAL file"); 142 | } 143 | let array = self.wal[self.offset..self.offset + N] 144 | .try_into() 145 | .map_err(|_| anyhow::anyhow!("Failed to read [u8; {N}] from WAL file"))?; 146 | self.offset += N; 147 | Ok(array) 148 | } 149 | 150 | /// Reads a u64 from the WAL file in little-endian format. 151 | fn read_u64(&mut self) -> anyhow::Result<u64> { 152 | let buf = self.read_buf::<8>()?; 153 | Ok(u64::from_le_bytes(buf)) 154 | } 155 | 156 | /// Reads a u32 from the WAL file in little-endian format. 157 | fn read_u32(&mut self) -> anyhow::Result<u32> { 158 | let buf = self.read_buf::<4>()?; 159 | Ok(u32::from_le_bytes(buf)) 160 | } 161 | } 162 | -------------------------------------------------------------------------------- /nomt/src/page_diff.rs: -------------------------------------------------------------------------------- 1 | use crate::page_cache::NODES_PER_PAGE; 2 | 3 | const CLEAR_BIT: u64 = 1 << 63; 4 | 5 | /// A bitfield tracking which nodes have changed within a page. 6 | #[derive(Debug, Default, Clone, PartialEq, Eq)] 7 | pub struct PageDiff { 8 | /// Each bit indicates whether the node at the corresponding index has changed. 9 | /// 10 | /// There are only effectively [`NODES_PER_PAGE`] (126) nodes per page. The last two bits are 11 | /// reserved. 12 | /// 13 | /// See [`CLEAR_BIT`]. 14 | changed_nodes: [u64; 2], 15 | } 16 | 17 | impl PageDiff { 18 | /// Create a new page diff from bytes. 19 | /// 20 | /// Returns `None` if any of the reserved bits are set to 1.
21 | pub fn from_bytes(bytes: [u8; 16]) -> Option<Self> { 22 | let mut changed_nodes = [0u64; 2]; 23 | changed_nodes[0] = u64::from_le_bytes(bytes[0..8].try_into().unwrap()); 24 | changed_nodes[1] = u64::from_le_bytes(bytes[8..16].try_into().unwrap()); 25 | 26 | let diff = PageDiff { changed_nodes }; 27 | // Check if the two last bits are set to 1 28 | if diff.changed(126) || diff.changed(127) { 29 | return None; 30 | } 31 | Some(diff) 32 | } 33 | 34 | /// Note that some 32-byte slot in the page data has changed. 35 | /// 36 | /// The acceptable range is 0..NODES_PER_PAGE. Erases the clear bit. 37 | pub fn set_changed(&mut self, slot_index: usize) { 38 | assert!(slot_index < NODES_PER_PAGE); 39 | let word = slot_index / 64; 40 | let index = slot_index % 64; 41 | let mask = 1 << index; 42 | self.changed_nodes[word] |= mask; 43 | self.changed_nodes[1] &= !CLEAR_BIT; 44 | } 45 | 46 | /// Whether a bit is set within the page data. 47 | pub fn changed(&self, slot_index: usize) -> bool { 48 | let word = slot_index / 64; 49 | let index = slot_index % 64; 50 | let mask = 1 << index; 51 | self.changed_nodes[word] & mask == mask 52 | } 53 | 54 | /// Mark the page as having been cleared. 55 | pub fn set_cleared(&mut self) { 56 | self.changed_nodes[1] |= CLEAR_BIT; 57 | } 58 | 59 | /// Whether the page was completely cleared. 60 | pub fn cleared(&self) -> bool { 61 | self.changed_nodes[1] & CLEAR_BIT == CLEAR_BIT 62 | } 63 | 64 | /// Given the page data, collect the nodes that have changed according to this diff. 65 | /// Panics if this is a cleared page-diff. 66 | pub fn pack_changed_nodes<'a, 'b: 'a>( 67 | &'b self, 68 | page: &'a [u8], 69 | ) -> impl Iterator<Item = [u8; 32]> + 'a { 70 | self.assert_not_cleared(); 71 | self.iter_ones().map(|node_index| { 72 | let start = node_index * 32; 73 | let end = start + 32; 74 | page[start..end].try_into().unwrap() 75 | }) 76 | } 77 | 78 | /// Given the changed nodes, apply them to the given page according to the diff.
79 | /// 80 | /// Panics if the number of changed nodes doesn't equal to the number of nodes 81 | /// this diff recorded. 82 | pub fn unpack_changed_nodes(&self, nodes: &[[u8; 32]], page: &mut [u8]) { 83 | assert_eq!(self.count(), nodes.len()); 84 | for (node_index, node) in self.iter_ones().zip(nodes) { 85 | let start = node_index * 32; 86 | let end = start + 32; 87 | page[start..end].copy_from_slice(&node[..]); 88 | } 89 | } 90 | 91 | /// Returns the number of changed nodes. Capped at [NODES_PER_PAGE]. 92 | pub fn count(&self) -> usize { 93 | (self.changed_nodes[0].count_ones() + self.changed_nodes[1].count_ones()) as usize 94 | } 95 | 96 | /// Get raw bytes representing the PageDiff. 97 | /// 98 | /// Panics if this is a cleared page-diff. 99 | pub fn as_bytes(&self) -> [u8; 16] { 100 | let mut bytes = [0u8; 16]; 101 | bytes[0..8].copy_from_slice(&self.changed_nodes[0].to_le_bytes()); 102 | bytes[8..16].copy_from_slice(&self.changed_nodes[1].to_le_bytes()); 103 | bytes 104 | } 105 | 106 | fn assert_not_cleared(&self) { 107 | assert_eq!(self.changed_nodes[1] & (1 << 63), 0); 108 | } 109 | 110 | fn iter_ones(&self) -> impl Iterator { 111 | self.assert_not_cleared(); 112 | FastIterOnes(self.changed_nodes[0]) 113 | .chain(FastIterOnes(self.changed_nodes[1]).map(|i| i + 64)) 114 | } 115 | } 116 | 117 | struct FastIterOnes(u64); 118 | 119 | impl Iterator for FastIterOnes { 120 | type Item = usize; 121 | 122 | fn next(&mut self) -> Option { 123 | match self.0.trailing_zeros() { 124 | 64 => None, 125 | x => { 126 | self.0 &= !(1 << x); 127 | Some(x as usize) 128 | } 129 | } 130 | } 131 | } 132 | 133 | #[cfg(test)] 134 | mod tests { 135 | use super::PageDiff; 136 | use crate::page_cache::NODES_PER_PAGE; 137 | 138 | #[test] 139 | fn ensure_cap() { 140 | assert_eq!(NODES_PER_PAGE, 126); 141 | } 142 | 143 | #[test] 144 | fn iter_ones() { 145 | let mut diff = PageDiff::default(); 146 | 147 | let set_bits = (0..63).map(|i| i * 2).collect::>(); 148 | for bit in 
set_bits.iter().cloned() { 149 | diff.set_changed(bit); 150 | } 151 | 152 | for bit in set_bits.iter().cloned() { 153 | assert!(diff.changed(bit)); 154 | } 155 | 156 | let mut iterated_set_bits = diff.iter_ones().collect::<Vec<_>>(); 157 | iterated_set_bits.sort(); 158 | 159 | assert_eq!(iterated_set_bits, set_bits); 160 | } 161 | 162 | #[test] 163 | fn clear_bit() { 164 | let mut diff = PageDiff::default(); 165 | 166 | diff.set_cleared(); 167 | assert!(diff.cleared()); 168 | 169 | // Make sure that setting a node as changed zeros out the clear bit 170 | diff.set_changed(0); 171 | assert!(!diff.cleared()); 172 | } 173 | } 174 | -------------------------------------------------------------------------------- /benchtop/src/sp_trie.rs: -------------------------------------------------------------------------------- 1 | use crate::{backend::Transaction, timer::Timer, workload::Workload}; 2 | use hash_db::{AsHashDB, HashDB, Prefix}; 3 | use kvdb::KeyValueDB; 4 | use kvdb_rocksdb::{Database, DatabaseConfig}; 5 | use sha2::Digest; 6 | use sp_trie::trie_types::TrieDBMutBuilderV1; 7 | use sp_trie::{DBValue, LayoutV1, PrefixedMemoryDB, TrieDBMut}; 8 | use std::sync::Arc; 9 | use trie_db::TrieMut; 10 | 11 | type Hasher = sp_core::Blake2Hasher; 12 | type Hash = sp_core::H256; 13 | 14 | const SP_TRIE_DB_FOLDER: &str = "sp_trie_db"; 15 | 16 | const NUM_COLUMNS: u32 = 2; 17 | const COL_TRIE: u32 = 0; 18 | const COL_ROOT: u32 = 1; 19 | 20 | const ROOT_KEY: &[u8] = b"root"; 21 | 22 | pub struct SpTrieDB { 23 | pub kvdb: Arc<Database>, 24 | pub root: Hash, 25 | } 26 | 27 | pub struct Trie<'a> { 28 | pub db: Arc<Database>, 29 | pub overlay: &'a mut PrefixedMemoryDB<Hasher>, 30 | } 31 | 32 | impl SpTrieDB { 33 | pub fn open(reset: bool) -> Self { 34 | if reset { 35 | // Delete previously existing db 36 | let _ = std::fs::remove_dir_all(SP_TRIE_DB_FOLDER); 37 | } 38 | 39 | let db_cfg = DatabaseConfig::with_columns(NUM_COLUMNS); 40 | let kvdb = 41 | Arc::new(Database::open(&db_cfg, SP_TRIE_DB_FOLDER).expect("Database
backend error")); 42 | 43 | let root = match kvdb.get(COL_ROOT, ROOT_KEY).unwrap() { 44 | None => Hash::default(), 45 | Some(r) => Hash::from_slice(&r[..32]), 46 | }; 47 | 48 | Self { kvdb, root } 49 | } 50 | 51 | pub fn execute(&mut self, mut timer: Option<&mut Timer>, workload: &mut dyn Workload) { 52 | let _timer_guard_total = timer.as_mut().map(|t| t.record_span("workload")); 53 | 54 | let mut new_root = self.root; 55 | let mut overlay = PrefixedMemoryDB::default(); 56 | 57 | let mut trie = Trie { 58 | db: self.kvdb.clone(), 59 | overlay: &mut overlay, 60 | }; 61 | 62 | let recorder: sp_trie::recorder::Recorder<Hasher> = Default::default(); 63 | let _timer_guard_commit = { 64 | let mut trie_recorder = recorder.as_trie_recorder(new_root); 65 | 66 | let trie_db_mut = if self.root == Hash::default() { 67 | TrieDBMutBuilderV1::new(&mut trie, &mut new_root) 68 | .with_recorder(&mut trie_recorder) 69 | .build() 70 | } else { 71 | TrieDBMutBuilderV1::from_existing(&mut trie, &mut new_root) 72 | .with_recorder(&mut trie_recorder) 73 | .build() 74 | }; 75 | 76 | let mut transaction = Tx { 77 | trie: trie_db_mut, 78 | timer, 79 | }; 80 | workload.run_step(&mut transaction); 81 | let Tx { 82 | trie: mut trie_db_mut, 83 | mut timer, 84 | } = transaction; 85 | 86 | let timer_guard_commit = timer.as_mut().map(|t| t.record_span("commit_and_prove")); 87 | 88 | trie_db_mut.commit(); 89 | timer_guard_commit 90 | }; 91 | 92 | let _proof = recorder.drain_storage_proof().is_empty(); 93 | 94 | let mut transaction = self.kvdb.transaction(); 95 | for (key, (value, ref_count)) in overlay.drain() { 96 | if ref_count > 0 { 97 | transaction.put(COL_TRIE, &key[..], &value[..]) 98 | } else if ref_count < 0 { 99 | transaction.delete(COL_TRIE, &key[..]) 100 | } 101 | } 102 | transaction.put(COL_ROOT, ROOT_KEY, new_root.as_bytes()); 103 | self.kvdb 104 | .write(transaction) 105 | .expect("Failed to write transaction"); 106 | 107 | self.root = new_root; 108 | } 109 | } 110 | 111 | struct Tx<'a> { 112 |
trie: TrieDBMut<'a, LayoutV1<Hasher>>, 113 | timer: Option<&'a mut Timer>, 114 | } 115 | 116 | // sp_trie does not require hashed keys, but the comparison with other 117 | // backends would not be fair if keys were not hashed: 118 | // not applying hashing to keys would significantly speed up sp_trie. 119 | impl<'a> Transaction for Tx<'a> { 120 | fn read(&mut self, key: &[u8]) -> Option<Vec<u8>> { 121 | let key_path = sha2::Sha256::digest(key); 122 | 123 | let _timer_guard_read = self.timer.as_mut().map(|t| t.record_span("read")); 124 | self.trie 125 | .get(&key_path) 126 | .expect("Impossible fetching from sp-trie db") 127 | } 128 | 129 | fn note_read(&mut self, key: &[u8], _value: Option<Vec<u8>>) { 130 | let _ = self.read(key); 131 | } 132 | 133 | fn write(&mut self, key: &[u8], value: Option<&[u8]>) { 134 | let key_path = sha2::Sha256::digest(key); 135 | 136 | self.trie 137 | .insert(&key_path, &value.unwrap_or(&[])) 138 | .expect("Impossible writing into sp-trie db"); 139 | } 140 | } 141 | 142 | impl<'a> AsHashDB<Hasher, DBValue> for Trie<'a> { 143 | fn as_hash_db(&self) -> &dyn hash_db::HashDB<Hasher, DBValue> { 144 | self 145 | } 146 | 147 | fn as_hash_db_mut<'b>(&'b mut self) -> &'b mut (dyn HashDB<Hasher, DBValue> + 'b) { 148 | &mut *self 149 | } 150 | } 151 | 152 | impl<'a> HashDB<Hasher, DBValue> for Trie<'a> { 153 | fn get(&self, key: &Hash, prefix: Prefix) -> Option<DBValue> { 154 | if let Some(value) = self.overlay.get(key, prefix) { 155 | return Some(value); 156 | } 157 | 158 | let key = sp_trie::prefixed_key::<Hasher>(key, prefix); 159 | self.db.get(COL_TRIE, &key).expect("Database backend error") 160 | } 161 | 162 | fn contains(&self, hash: &Hash, prefix: Prefix) -> bool { 163 | self.get(hash, prefix).is_some() 164 | } 165 | 166 | fn insert(&mut self, prefix: Prefix, value: &[u8]) -> Hash { 167 | self.overlay.insert(prefix, value) 168 | } 169 | 170 | fn emplace(&mut self, key: Hash, prefix: Prefix, value: DBValue) { 171 | self.overlay.emplace(key, prefix, value); 172 | } 173 | 174 | fn remove(&mut self, key: &Hash, prefix: Prefix) { 175 | self.overlay.remove(key,
prefix) 176 | } 177 | } 178 | -------------------------------------------------------------------------------- /docs/nomt_specification.md: -------------------------------------------------------------------------------- 1 | 2 | # Nearly Optimal Merkle Trie - Specification 3 | 4 | ## Tree Structure 5 | 6 | NOMT is a Binary Merkle Patricia Trie implementing a key-value mapping from 32-byte keys to 7 | arbitrarily-sized values. 8 | 9 | All values within the trie can be committed to with a single small unique identifier. 10 | 11 | NOMT is a kind of **Merkle Tree**, where each node's value is computed based on the hashes of the 12 | nodes and values beneath it. This means that the root node accumulates all the information 13 | contained within the tree and serves as a cryptographic commitment to that information. 14 | 15 | NOMT is an **Addressable Merkle Trie**, where the path from the root node to the location where a 16 | key's value is stored is based solely upon the key. The trie has 256 levels, each corresponding to 17 | one of the 256 bits of key material, and 2^256 possible nodes at the lowest level. 18 | At the i'th level, whether the value is on the left or right branch is determined by the i'th bit 19 | of the key. 20 | 21 | NOMT gives **efficient proofs of inclusion and non-inclusion** and has a schema that is 22 | **optimized for SSDs**. 23 | 24 | ### Nodes 25 | 26 | Each node is exactly 256 bits long. NOMT is generic over the hash function and can be used with any 27 | hash function that produces 256-bit hashes. 28 | 29 | The Most-Significant Bit (MSB) of each node is used to differentiate the type of node. 30 | 31 | There are 3 kinds of nodes. 32 | 1. Internal Nodes. An internal node has 2 child nodes. Its value is given by `hash(left ++ right)` 33 | with the MSB set to 1. An internal node must have either two leaves as children or at least one 34 | internal node as a child. 35 | 2. Leaf Nodes. A leaf node encodes a value.
Its value is given by `hash(key ++ hash(value))` with 36 | the MSB set to 0. A leaf node is a stand-in for a sub-trie with a single value. 37 | 3. Terminator Nodes. A terminator node has the value `0x00....00` and can appear at any level of 38 | the trie. Terminator nodes are used to stand-in for empty sub-tries: no key whose lookup path 39 | would reach a terminator has a value within the trie. 40 | 41 | Leaf and terminator nodes can be referred to as Terminal nodes, because encountering them terminates 42 | a path lookup. 43 | 44 | To avoid ambiguity in representation, sub-tries which have no values or which have a single value 45 | must be represented by a leaf or terminator as close to the root as possible. This rule ensures that 46 | there is only a single valid representation for any key-value set. It also ensures that the trie 47 | is maximally compressed. 48 | 49 | ## How is this stored on disk? 50 | 51 | ### Pages 52 | 53 | Nodes will be split into rootless sub-binary trees of depth `d`, so each group will contain `2^(d+1) - 2` nodes. 54 | 55 | ![nomt pages](./images/nomt_pages.jpg) 56 | 57 | We will refer to each group as a *Page*. These pages form a **base-2^d page tree**, where each Page has `2^d` child pages below it. 58 | 59 | ### Node Key and PageIds 60 | 61 | A node key represents a `path` in the trie, called the **KeyPath**. The KeyPath is composed of 256 bits, where each bit defines a fork in the node's tree. When seen as an array of bytes, we traverse from the first byte to the last, with each byte being read from the MSB to the LSB. For example, if `KeyPath = [0b01001001, 0b00010000]`, the first byte will be `0b01001001` and the first bit used to traverse the tree will be `0` (MSB), followed by `1` and so on, with the last bit in the byte being `1` (LSB). 62 | 63 | Each page contains a rootless sub-binary tree; hence, the KeyPath can be divided into multiple segments that reference the path within each page's sub-tree.
These segments are called **d_tets**. For instance, with `KeyPath = [0b01001001, 0b00010000]` and `d = 4`, the d_tets will be `[0b0100, 0b1001, 0b0001, 0b0000]`. 64 | 65 | Each d_tet specifies which page to move to next from the previous one, starting from the root page. 66 | 67 | If `d` is a divisor of the key length, there will be `(256/d) - 1` d_tets, each `d` bits long; otherwise, there will be `⌊256/d⌋` d_tets. This is because the remaining bits are used to traverse the last page and are not required to access any new page. 68 | 69 | Each page is assigned a unique identifier (**PageId**) created from the d_tets necessary to reach that page. 70 | 71 | ### From KeyPath to Pages 72 | 73 | Given a Node Key, we can treat it as an array of bits and split it into the d\_tets described above. Iterating over all the d\_tets, we construct each needed PageId and fetch all the Pages that store the path of the KeyPath. 74 | 75 | Let's call `dtets` the array containing all d\_tets and `page_ids` the array of PageIds we want to fill. Each PageId is constructed like this: 76 | 77 | ```pseudo 78 | for i in dtets.len() 79 | 80 | if i > 0 81 | prev_page_id = page_ids[i - 1] 82 | else 83 | prev_page_id = 0 84 | 85 | page_ids[i] = (prev_page_id << d) + int(dtets[i]) + 1 86 | ``` 87 | 88 | where `int(dtets[i])` means treating the sequence of `d` bits as an integer, and `(prev_page_id << d)` is equal to `(prev_page_id * 2^d)`. 89 | 90 | This is how we can deterministically locate the value and all of its parent nodes in the Merkle trie from the pages identified by the PageIds extracted from the KeyPath. 91 | 92 | ### Rootless sub-binary tree in Page 93 | 94 | A Page is just an array of bytes containing a rootless sub-binary tree, i.e. an array of Nodes. 95 | 96 | #### Node Layout Within Pages 97 | 98 | Each node in the rootless sub-binary tree is assigned a position. The (absent) root is assigned position 1, and at each subsequent level, the identifiers increase from left to right.
For example: 99 | 100 | ![nomt identifier rule](./images/nomt_number_rule.png) 101 | 102 | A node at position `p` in the tree will be stored in the array at index `p - 1`. Since the tree is rootless, `p` will always be greater than 1. The children of a node at position `p` will be at positions `p * 2` and `(p * 2) + 1`, corresponding to the left and right child nodes, respectively. 103 | 104 | Viewed from a page's perspective, all nodes that are not leaves or internal nodes are empty. However, only some of them can be accessed, respecting the constraints outlined in the previous section. 105 | --------------------------------------------------------------------------------
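The KeyPath-to-PageId derivation above can be sketched in Rust. This is an illustrative stand-alone example, not NOMT's actual implementation: the helper names `d_tets` and `page_ids` are hypothetical, `d = 4` matches the worked example in the text (so each d_tet is one nibble), and `u128` is only wide enough for a short key fragment — a full 256-bit KeyPath would need an arbitrary-width integer for its deepest PageIds.

```rust
/// Assumed segment width; the spec's example uses d = 4, so each d_tet is a nibble.
const D: u32 = 4;

/// Split a key path (bytes read MSB-first) into d-bit segments ("d_tets").
/// Hardcoded for d = 4: each byte yields its high nibble, then its low nibble.
fn d_tets(key_path: &[u8]) -> Vec<u8> {
    key_path
        .iter()
        .flat_map(|byte| [byte >> 4, byte & 0x0f])
        .collect()
}

/// Build the chain of PageIds along a key path, per the pseudocode:
/// page_ids[i] = (prev_page_id << d) + int(dtets[i]) + 1, with prev starting at 0.
fn page_ids(d_tets: &[u8]) -> Vec<u128> {
    let mut ids = Vec::with_capacity(d_tets.len());
    let mut prev: u128 = 0;
    for &t in d_tets {
        let id = (prev << D) + t as u128 + 1;
        ids.push(id);
        prev = id;
    }
    ids
}

fn main() {
    // The example from the text: KeyPath = [0b01001001, 0b00010000].
    let tets = d_tets(&[0b0100_1001, 0b0001_0000]);
    assert_eq!(tets, vec![0b0100, 0b1001, 0b0001, 0b0000]);

    // First PageId: (0 << 4) + 4 + 1 = 5; second: (5 << 4) + 9 + 1 = 90; etc.
    let ids = page_ids(&tets);
    assert_eq!(ids, vec![5, 90, 1442, 23073]);
    println!("{:?}", ids);
}
```

Note how the `+ 1` offset keeps every PageId nonzero, so the root page can use the reserved identifier 0 as `prev_page_id` without colliding with any child page.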
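The in-page position rule above (absent root at position 1, children of `p` at `p * 2` and `(p * 2) + 1`, storage at index `p - 1`) can be sketched as a couple of helpers. The names `children` and `node_index` are hypothetical, and `d = 4` is assumed only to mirror the earlier example; the formulas are the ones stated in the text.

```rust
/// Assumed page depth, matching the d = 4 example used earlier in the spec.
const D: usize = 4;

/// Positions of the two children of the node at position `p`
/// (left child at `p * 2`, right child at `p * 2 + 1`).
fn children(p: usize) -> (usize, usize) {
    (p * 2, p * 2 + 1)
}

/// Array index of the node at position `p`, per the `p - 1` rule;
/// position 1 is the absent root and is never stored.
fn node_index(p: usize) -> usize {
    assert!(p > 1, "position 1 (the absent root) is never stored");
    p - 1
}

fn main() {
    // The first stored level of the rootless sub-tree holds positions 2 and 3.
    assert_eq!(children(1), (2, 3));
    assert_eq!(node_index(2), 1);

    // A page of depth d contains 2^(d+1) - 2 nodes, as stated in the Pages section.
    let nodes_per_page = (1usize << (D + 1)) - 2;
    assert_eq!(nodes_per_page, 30); // with d = 4
}
```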