├── .editorconfig ├── .github ├── actions │ └── install-fuse │ │ └── action.yml └── workflows │ ├── bench.yml │ └── ci.yml ├── .gitignore ├── Cargo.lock ├── Cargo.toml ├── LICENSE-APACHE ├── LICENSE-MIT ├── README.md ├── benchtop ├── Cargo.lock ├── Cargo.toml └── src │ ├── backend.rs │ ├── bench.rs │ ├── cli.rs │ ├── custom_workload.rs │ ├── main.rs │ ├── nomt.rs │ ├── sov_db.rs │ ├── sp_trie.rs │ ├── timer.rs │ ├── transfer_workload.rs │ └── workload.rs ├── core ├── Cargo.toml └── src │ ├── hasher.rs │ ├── lib.rs │ ├── page.rs │ ├── page_id.rs │ ├── proof │ ├── mod.rs │ ├── multi_proof.rs │ └── path_proof.rs │ ├── trie.rs │ ├── trie_pos.rs │ ├── update.rs │ └── witness.rs ├── docs ├── CONTRIBUTING.md ├── images │ ├── binary_merkle_patricia_tree.png │ ├── nomt_number_rule.png │ ├── nomt_pages.jpg │ └── nomt_put.png └── nomt_specification.md ├── examples ├── commit_batch │ ├── Cargo.toml │ └── src │ │ ├── lib.rs │ │ └── main.rs ├── read_value │ ├── Cargo.toml │ └── src │ │ └── main.rs └── witness_verification │ ├── Cargo.toml │ └── src │ └── main.rs ├── fuzz ├── .gitignore ├── Cargo.toml └── fuzz_targets │ ├── api_surface.rs │ ├── bitwise_memcpy.rs │ ├── common │ └── mod.rs │ ├── prefix_len.rs │ ├── reconstruct_key.rs │ ├── separate.rs │ └── separator_len.rs ├── nomt ├── Cargo.toml ├── benches │ └── beatree.rs ├── src │ ├── beatree │ │ ├── README.md │ │ ├── allocator │ │ │ ├── free_list.rs │ │ │ └── mod.rs │ │ ├── benches.rs │ │ ├── branch │ │ │ ├── mod.rs │ │ │ └── node.rs │ │ ├── index.rs │ │ ├── iterator.rs │ │ ├── leaf │ │ │ ├── mod.rs │ │ │ └── node.rs │ │ ├── leaf_cache.rs │ │ ├── mod.rs │ │ ├── ops │ │ │ ├── bit_ops.rs │ │ │ ├── mod.rs │ │ │ ├── overflow.rs │ │ │ ├── reconstruction.rs │ │ │ └── update │ │ │ │ ├── branch_ops.rs │ │ │ │ ├── branch_stage.rs │ │ │ │ ├── branch_updater.rs │ │ │ │ ├── extend_range_protocol.rs │ │ │ │ ├── leaf_stage.rs │ │ │ │ ├── leaf_updater.rs │ │ │ │ ├── mod.rs │ │ │ │ └── tests.rs │ │ └── writeout.rs │ ├── bitbox │ │ ├── ht_file.rs │ │ ├── meta_map.rs │ │ ├── mod.rs │ │ ├── wal │ │ │ ├── mod.rs │ │ │ ├── read.rs │ │ │ ├── tests.rs │ │ │ └── write.rs │ │ └── writeout.rs │ ├── io │ │ ├── fsyncer.rs │ │ ├── linux.rs │ │ ├── mod.rs │ │ ├── page_pool.rs │ │ └── unix.rs │ ├── lib.rs │ ├── merkle │ │ ├── cache_prepopulate.rs │ │ ├── mod.rs │ │ ├── page_set.rs │ │ ├── page_walker.rs │ │ ├── seek.rs │ │ └── worker.rs │ ├── metrics.rs │ ├── options.rs │ ├── overlay.rs │ ├── page_cache.rs │ ├── page_diff.rs │ ├── page_region.rs │ ├── rollback │ │ ├── delta.rs │ │ ├── mod.rs │ │ ├── reverse_delta_worker.rs │ │ └── tests.rs │ ├── rw_pass_cell │ │ ├── loom_tests.rs │ │ └── mod.rs │ ├── seglog │ │ ├── mod.rs │ │ ├── segment_filename.rs │ │ └── segment_rw.rs │ ├── store │ │ ├── flock.rs │ │ ├── meta.rs │ │ ├── mod.rs │ │ ├── page_loader.rs │ │ └── sync.rs │ ├── sys │ │ ├── linux.rs │ │ ├── macos.rs │ │ ├── mod.rs │ │ └── unix.rs │ └── task.rs └── tests │ ├── add_remove.rs │ ├── common │ └── mod.rs │ ├── compute_root.rs │ ├── exclusive_dir.rs │ ├── extend_range_protocol.rs │ ├── fill_and_empty.rs │ ├── large_values.rs │ ├── last_layer_trie.rs │ ├── overlay.rs │ ├── prev_root_check.rs │ ├── rollback.rs │ ├── wal.rs │ └── witness_check.rs ├── torture ├── Cargo.toml └── src │ ├── agent.rs │ ├── logging.rs │ ├── main.rs │ ├── message.rs │ ├── panic.rs │ ├── spawn.rs │ └── supervisor │ ├── cli.rs │ ├── comms.rs │ ├── config.rs │ ├── controller.rs │ ├── mod.rs │ ├── pbt.rs │ ├── resource.rs │ ├── swarm.rs │ └── workload.rs └── trickfs ├── Cargo.toml ├── README.md ├── src ├── 
latency.rs └── lib.rs └── trickmnt ├── Cargo.toml └── src └── main.rs /.editorconfig: -------------------------------------------------------------------------------- 1 | root = true 2 | [*] 3 | indent_style=space 4 | indent_size=space 5 | tab_width=4 6 | end_of_line=lf 7 | charset=utf-8 8 | trim_trailing_whitespace=true 9 | max_line_length=100 10 | insert_final_newline=true 11 | 12 | [*.yml] 13 | indent_style=space 14 | indent_size=2 15 | tab_width=8 16 | end_of_line=lf 17 | 18 | [*.sh] 19 | indent_style=space 20 | indent_size=4 21 | tab_width=8 22 | end_of_line=lf 23 | 24 | [*.json] 25 | indent_style=space 26 | indent_size=2 27 | tab_width=8 28 | end_of_line=lf 29 | 30 | -------------------------------------------------------------------------------- /.github/actions/install-fuse/action.yml: -------------------------------------------------------------------------------- 1 | name: Install Ubuntu Dependencies 2 | description: "Installs dependencies on Ubuntu" 3 | 4 | runs: 5 | using: "composite" 6 | steps: 7 | - name: Update apt-get 8 | shell: bash 9 | run: sudo apt-get update 10 | 11 | - name: Install FUSE libraries 12 | shell: bash 13 | run: sudo apt-get install -y libfuse3-dev libfuse-dev 14 | 15 | - name: Allow non-root users to mount FUSE filesystems 16 | shell: bash 17 | run: echo "user_allow_other" | sudo tee -a /etc/fuse.conf 18 | -------------------------------------------------------------------------------- /.github/workflows/bench.yml: -------------------------------------------------------------------------------- 1 | name: Benchtop 2 | 3 | on: 4 | push: 5 | branches: 6 | - master 7 | pull_request: 8 | 9 | env: 10 | CARGO_TERM_COLOR: always 11 | 12 | jobs: 13 | bench: 14 | name: NOMT - run benchtop 15 | runs-on: ubuntu-latest 16 | env: 17 | SIZE: 22 18 | BUCKETS: 4000000 19 | RUST_BACKTRACE: 1 20 | steps: 21 | - name: Free Disk Space (Ubuntu) 22 | uses: jlumbroso/free-disk-space@main 23 | with: 24 | tool-cache: false 25 | android: true 26 | dotnet: true 27 | haskell: true 28 | large-packages: true 29 | docker-images: true 30 | swap-storage: true 31 | - uses: actions/checkout@v4 32 | - run: | 33 | # Install required dependencies 34 | sudo apt-get update 35 | sudo apt-get install -y libclang-dev 36 | - run: df -h / 37 | - run: | 38 | # First build the binary 39 | cargo build --release --verbose --manifest-path=benchtop/Cargo.toml 40 | 41 | # Verify binary exists before proceeding 42 | if [ ! -f "benchtop/target/release/benchtop" ]; then 43 | echo "Binary not found at benchtop/target/release/benchtop" 44 | exit 1 45 | fi 46 | 47 | # Create directories first to avoid potential issues 48 | mkdir -p /tmp 49 | 50 | # Save our binary 51 | cp benchtop/target/release/benchtop /tmp/benchtop 52 | 53 | # Verify copy succeeded 54 | if [ ! -f "/tmp/benchtop" ]; then 55 | echo "Failed to copy binary to /tmp" 56 | exit 1 57 | fi 58 | 59 | # Now safe to clean up 60 | cargo clean 61 | rm -rf ~/.cargo/registry 62 | rm -rf ~/.cargo/git 63 | rm -rf ~/.rustup 64 | 65 | # Create target directory after cleanup 66 | mkdir -p target/release 67 | 68 | # Move binary to final location 69 | mv /tmp/benchtop target/release/benchtop 70 | 71 | # Final verification 72 | if [ ! -f "target/release/benchtop" ] || [ ! 
-x "target/release/benchtop" ]; then 73 | echo "Final binary is missing or not executable" 74 | exit 1 75 | fi 76 | 77 | # Make absolutely sure it's executable 78 | chmod +x target/release/benchtop 79 | 80 | - run: >- 81 | ./target/release/benchtop init 82 | -b nomt 83 | -c $SIZE 84 | -w transfer 85 | --buckets $BUCKETS 86 | - run: >- 87 | ./target/release/benchtop run 88 | -w transfer 89 | -b nomt 90 | -s 10000 91 | -c $SIZE 92 | --time-limit 30s 93 | --workload-concurrency 6 94 | -------------------------------------------------------------------------------- /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | name: Build and Test 2 | 3 | on: 4 | push: 5 | branches: 6 | - master 7 | pull_request: 8 | 9 | env: 10 | CARGO_TERM_COLOR: always 11 | 12 | jobs: 13 | nomt_test: 14 | name: NOMT - test 15 | runs-on: ubuntu-latest 16 | env: 17 | # Avoid shrinking the inputs when an error is found in the leaf/branch stage tests. 18 | NO_STAGES_SHRINKING: true 19 | steps: 20 | - uses: actions/checkout@v4 21 | - uses: ./.github/actions/install-fuse 22 | - uses: dtolnay/rust-toolchain@stable 23 | - run: cargo build --verbose --workspace --locked 24 | - run: cargo test --verbose --workspace 25 | benchtop_check: 26 | name: NOMT - check benchtop 27 | runs-on: ubuntu-latest 28 | steps: 29 | - uses: actions/checkout@v4 30 | - uses: ./.github/actions/install-fuse 31 | - uses: dtolnay/rust-toolchain@stable 32 | - run: cargo check --verbose --manifest-path=benchtop/Cargo.toml --locked 33 | loom_rw_pass_cell: 34 | name: NOMT - loom rw_pass_cell 35 | runs-on: ubuntu-latest 36 | steps: 37 | - uses: actions/checkout@v4 38 | - uses: dtolnay/rust-toolchain@stable 39 | - run: RUSTFLAGS="--cfg loom" cargo test -p nomt --release --lib rw_pass_cell 40 | doc: 41 | name: NOMT - doc 42 | runs-on: ubuntu-latest 43 | env: 44 | # Treat rustdoc warnings as errors. 45 | RUSTDOCFLAGS: "-D warnings" 46 | steps: 47 | - uses: actions/checkout@v4 48 | - uses: ./.github/actions/install-fuse 49 | - uses: dtolnay/rust-toolchain@stable 50 | - run: cargo doc --verbose --workspace --document-private-items 51 | fmt: 52 | name: NOMT - fmt 53 | runs-on: ubuntu-latest 54 | steps: 55 | - uses: actions/checkout@v4 56 | - uses: dtolnay/rust-toolchain@stable 57 | - run: cargo fmt --all --check 58 | - run: cargo fmt --manifest-path=benchtop/Cargo.toml --check 59 | darwin_check: 60 | name: NOMT - check darwin target 61 | runs-on: ubuntu-latest 62 | env: 63 | # This is a workaround for the blake3 crate. 64 | CARGO_FEATURE_PURE: 1 65 | steps: 66 | - uses: actions/checkout@v4 67 | - uses: dtolnay/rust-toolchain@stable 68 | with: 69 | targets: x86_64-apple-darwin 70 | # Build only the NOMT crate. Not everything builds cleanly under this configuration. 
71 | - run: cargo check --verbose -p nomt --locked --target x86_64-apple-darwin 72 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Common ignores 2 | .DS_Store 3 | .idea 4 | .vscode 5 | .envrc 6 | 7 | /target 8 | 9 | # samply / benchtop 10 | profile.json 11 | /test 12 | /nomt/test 13 | 14 | # xtask 15 | /benchtop/regression.toml 16 | /benchtop/sov_db* 17 | /benchtop/nomt_db* 18 | /benchtop/sp_trie_db* 19 | /benchtop/target 20 | -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [workspace] 2 | resolver = "2" 3 | members = [ 4 | "core", 5 | "nomt", 6 | "fuzz", 7 | "torture", 8 | "examples/*", 9 | "trickfs", 10 | "trickfs/trickmnt", 11 | ] 12 | exclude = ["benchtop"] 13 | 14 | [workspace.package] 15 | authors = ["thrum"] 16 | homepage = "https://thrum.dev" 17 | repository = "https://github.com/thrumdev/nomt" 18 | edition = "2021" 19 | license = "MIT/Apache-2.0" 20 | 21 | [workspace.dependencies] 22 | borsh = { version = "1.5.7", default-features = false, features = ["derive"] } 23 | bitvec = { version = "1", default-features = false, features = ["alloc"] } 24 | hex = { version = "0.4.3", default-features = false, features = ["alloc"] } 25 | ruint = { version = "1.12.1", default-features = false } 26 | arrayvec = { version = "0.7", default-features = false } 27 | blake3 = { version = "1.5.1", default-features = false } 28 | sha2 = { version = "0.10.6", default-features = false } 29 | anyhow = { version = "1.0.81", features = ["backtrace"] } 30 | parking_lot = { version = "0.12.3", features = ["arc_lock", "send_guard"] } 31 | threadpool = "1.8.1" 32 | twox-hash = "2.1.0" 33 | fxhash = "0.2.1" 34 | dashmap = "5.5.3" 35 | crossbeam = "0.8.4" 36 | crossbeam-channel = "0.5.13" 37 | slab = "0.4.9" 38 | rand = "0.8.5" 39 | ahash = "0.8.11" 40 | imbl = "3.0.0" 41 | lru = "0.12.3" 42 | libc = "0.2.155" 43 | criterion = { version = "0.3" } 44 | thread_local = "1.1.8" 45 | cfg-if = "1.0.0" 46 | io-uring = "0.6.4" 47 | loom = { version = "0.7", features = ["checkpoint"] } 48 | rand_pcg = "0.3.1" 49 | hex-literal = "0.4" 50 | tempfile = "3.8.1" 51 | lazy_static = "1.5.0" 52 | quickcheck = "1.0.3" 53 | nix = { version = "0.29", features = ["process"] } 54 | serde = { version = "1.0.216", default-features = false, features = ["derive"] } 55 | bincode = "1.3.3" 56 | tokio = { version = "1.42.0", features = ["full"] } 57 | tokio-util = { version = "0.7.13", features = ["codec"] } 58 | tokio-stream = "0.1.17" 59 | futures = "0.3.31" 60 | tokio-serde = { version = "0.9.0", features = ["bincode"] } 61 | tracing = { version = "0.1.41", features = ["attributes"] } 62 | tracing-subscriber = { version = "0.3.19", features = ["env-filter"] } 63 | futures-util = "0.3.31" 64 | clap = { version = "4.5.23", features = ["derive"] } 65 | which = "4" 66 | fuser = { version = "0.15.1", features = ["abi-7-23"] } 67 | log = "0.4.22" 68 | rand_distr = "0.4.3" 69 | env_logger = "0.11.6" 70 | digest = { version = "0.10.7" } 71 | 72 | [profile.release] 73 | debug = 1 74 | debug-assertions = true 75 | -------------------------------------------------------------------------------- /LICENSE-MIT: -------------------------------------------------------------------------------- 1 | Permission is hereby granted, free of charge, to any 2 | person obtaining a copy of this software 
and associated 3 | documentation files (the "Software"), to deal in the 4 | Software without restriction, including without 5 | limitation the rights to use, copy, modify, merge, 6 | publish, distribute, sublicense, and/or sell copies of 7 | the Software, and to permit persons to whom the Software 8 | is furnished to do so, subject to the following 9 | conditions: 10 | 11 | The above copyright notice and this permission notice 12 | shall be included in all copies or substantial portions 13 | of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF 16 | ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED 17 | TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A 18 | PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT 19 | SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 20 | CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 21 | OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR 22 | IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 23 | DEALINGS IN THE SOFTWARE. 24 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## NOMT: Nearly Optimal Merkle Trie 2 | 3 | An implementation of a novel binary Merkle Trie and DB, written in Rust. 4 | 5 | NOMT is an embedded key-value store that maintains a Merklized representation of key-value pairs with a simple key-value API, powering high throughput authenticated commits with billions of key-value pairs on relatively inexpensive hardware. It is largely designed for use in a blockchain setting as a drop-in replacement for RocksDB, MDBX, LevelDB, or ParityDB. 6 | 7 | NOMT is optimized for fast random lookups of values, fast merkle tree updates, and fast writeout. It supports the generation of Merkle multiproofs for large batches of changes. 8 | 9 | NOMT is designed to take advantage of hardware improvements in Solid State Drives (SSDs) using NVMe and Linux's io-uring API for asynchronous I/O. NOMT adequately supports generic Unix as well as macOS for daily development and testing, but primarily targets Linux for performance. The impressive trend in performance and capacity in modern SSDs enables us to build a DB that scales along with the hardware. 10 | 11 | NOMT exposes a many-readers-one-writer API organized around batch transactions referred to as `Session`s. Predictable performance in a metered execution environment is a key goal of NOMT, and therefore only one `Session` may be live at a time. 12 | 13 | ## Project Structure 14 | 15 |
16 | NOMT: Project Root.
17 | ├── benchtop: A benchmarking tool for NOMT.
18 | ├── core: Core logic, primarily for verifying and updating the NOMT.
19 | ├── docs: Documentation.
20 | ├── fuzz: Fuzzing suite.
21 | ├── examples: Various examples of using NOMT.
22 | │   ├── commit_batch: Demonstration of a simple commit.
23 | │   ├── read_value: Reading a value from the NOMT.
24 | │   └── witness_verification: Demonstration of how to verify a witness in a light-client setting.
25 | ├── nomt: Implementation of the NOMT database.
26 | ├── torture: Extensive testing suite for NOMT.
27 | └── trickfs: A FUSE filesystem aiding deeper testing. Experimental.
28 |     └── trickmnt: A tool that allows mounting trickfs.
29 | 
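For a quick feel of the API, the sketch below shows roughly what `examples/commit_batch` demonstrates: opening a database, batching reads and writes inside a `Session`, and committing the batch to advance the merkle root. Names and signatures here are approximations rather than a verbatim copy of the current API; consult the `examples` crates for authoritative, up-to-date usage.

```rust
// Illustrative sketch only: names and signatures approximate the Session-based
// flow and are not guaranteed to match the current API. See the
// `examples/commit_batch` and `examples/read_value` crates for real usage.
use nomt::{hasher::Blake3Hasher, KeyReadWrite, Nomt, Options, SessionParams};

fn main() -> anyhow::Result<()> {
    // Open (or create) the database at the given path.
    let mut opts = Options::new();
    opts.path("example_nomt_db");
    let nomt = Nomt::<Blake3Hasher>::open(opts)?;

    // Only one `Session` may be live at a time; it batches all reads and writes.
    let session = nomt.begin_session(SessionParams::default());

    // Read the prior value of a key (key paths are 32 bytes).
    let key = [1u8; 32];
    let prior = session.read(key)?;

    // Declare every access performed in this batch (sorted by key path),
    // then commit to compute and persist the new merkle root.
    let actuals = vec![(key, KeyReadWrite::ReadThenWrite(prior, Some(vec![0x42])))];
    session.finish(actuals)?.commit(&nomt)?;
    Ok(())
}
```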
30 | 31 | ## Architecture 32 | 33 | Internally, NOMT consists of two parallel stores, Beatree and Bitbox. Beatree stores raw key-value pairs and is based around a B-Tree variant optimized for stable, fast random access patterns and high-entropy keys. Bitbox stores a custom sparse binary merkle tree in an on-disk hashtable in a format amenable to SSDs. 34 | 35 | For more information on NOMT, the thesis behind it, and performance targets, see [this November 2024 presentation](https://x.com/TheKusamarian/status/1855477208762261910) by @rphmeier or [view the slides here](https://hackmd.io/@Xo-wxO7bQkKidH1LrqACsw/rkG0lmjWyg#/). 36 | 37 | We have built a benchmarking tool, `benchtop`, which is located in the `benchtop` directory as a separate subcrate. 38 | 39 | ## Contributing 40 | 41 | See [CONTRIBUTING.md](docs/CONTRIBUTING.md). 42 | 43 | If you would like to discuss the development of NOMT or follow along with contributor discussions, join the official [Telegram Channel](https://t.me/thrum_nomt). 44 | 45 | ## Acknowledgements 46 | 47 | The development of this project is supported financially by [Sovereign Labs](https://www.sovereign.xyz/), creators of the [Sovereign SDK](https://github.com/Sovereign-Labs/sovereign-sdk/). The idea for this project originated in [this post by Preston Evans](https://sovereign.mirror.xyz/jfx_cJ_15saejG9ZuQWjnGnG-NfahbazQH98i1J3NN8). 48 | -------------------------------------------------------------------------------- /benchtop/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "benchtop" 3 | version = "0.1.0" 4 | authors = ["thrum"] 5 | homepage = "https://thrum.dev" 6 | repository = "https://github.com/thrumdev/nomt" 7 | edition = "2021" 8 | license = "MIT/Apache-2.0" 9 | 10 | [dependencies] 11 | 12 | # benchmarking 13 | clap = { version = "4.4.8", features = ["derive"] } 14 | anyhow = { version = "1.0.75" } 15 | hdrhistogram = "7.5.4" 16 | fxhash = "0.2.1" 17 | rand = "0.8.5" 18 | rand_distr = "0.4.3" 19 | sha2 = { version = "0.10.6" } 20 | ruint = { version = "1.12.1" } 21 | toml = "0.8.12" 22 | serde = "1.0.199" 23 | humantime = "2.1.0" 24 | rayon = "1.10" 25 | lru = "0.12.5" 26 | libc = "0.2.155" 27 | 28 | # sov-db 29 | sov-db = { git = "https://github.com/Sovereign-Labs/sovereign-sdk", optional = true } 30 | sov-schema-db = { git = "https://github.com/Sovereign-Labs/sovereign-sdk", optional = true } 31 | sov-prover-storage-manager = { git = "https://github.com/Sovereign-Labs/sovereign-sdk", optional = true } 32 | jmt = { git = "https://github.com/penumbra-zone/jmt.git", rev = "1d007e11cb68aa5ca13e9a5af4a12e6439d5f7b6", optional = true } 33 | 34 | # sp-trie 35 | sp-trie = { version = "32.0.0", optional = true } 36 | sp-state-machine = { version = "0.35.0", optional = true } 37 | trie-db = { version = "0.28.0", optional = true } 38 | hash-db = { version = "0.16.0", optional = true } 39 | sp-core = { version = "31.0.0", optional = true } 40 | kvdb = { version = "0.13.0", optional = true } 41 | kvdb-rocksdb = { version = "0.19.0", optional = true } 42 | array-bytes = { version = "6.1", optional = true } 43 | 44 | # nomt 45 | nomt = { path = "../nomt" } 46 | 47 | [profile.release] 48 | debug = true 49 | 50 | [features] 51 | sov-db=["dep:sov-db", "sov-schema-db", "sov-prover-storage-manager", "jmt" ] 52 | sp-trie=["dep:sp-trie", "sp-state-machine", "trie-db", "hash-db", "sp-core", "kvdb", "kvdb-rocksdb", "array-bytes" ] 53 | 
-------------------------------------------------------------------------------- /benchtop/src/backend.rs: -------------------------------------------------------------------------------- 1 | use crate::{nomt::NomtDB, timer::Timer, workload::Workload}; 2 | 3 | #[cfg(feature = "sov-db")] 4 | use crate::sov_db::SovDB; 5 | 6 | #[cfg(feature = "sp-trie")] 7 | use crate::sp_trie::SpTrieDB; 8 | 9 | #[derive(Debug, Clone, clap::ValueEnum)] 10 | pub enum Backend { 11 | SovDB, 12 | Nomt, 13 | SpTrie, 14 | } 15 | 16 | impl Backend { 17 | pub fn all_backends() -> Vec { 18 | vec![Backend::SovDB, Backend::SpTrie, Backend::Nomt] 19 | } 20 | 21 | // If reset is true, then erase any previous backend's database 22 | // and restart from an empty database. 23 | // Otherwise, use the already present database. 24 | pub fn instantiate( 25 | &self, 26 | reset: bool, 27 | commit_concurrency: usize, 28 | io_workers: usize, 29 | hashtable_buckets: Option, 30 | page_cache_size: Option, 31 | leaf_cache_size: Option, 32 | page_cache_upper_levels: usize, 33 | prepopulate_page_cache: bool, 34 | overlay_window_length: usize, 35 | ) -> DB { 36 | match self { 37 | Backend::SovDB => { 38 | #[cfg(not(feature = "sov-db"))] 39 | panic!("benchtop not compiled with feature sov-db. rebuild"); 40 | #[cfg(feature = "sov-db")] 41 | DB::Sov(SovDB::open(reset)) 42 | } 43 | Backend::Nomt => DB::Nomt(NomtDB::open( 44 | reset, 45 | commit_concurrency, 46 | io_workers, 47 | hashtable_buckets, 48 | page_cache_size, 49 | leaf_cache_size, 50 | page_cache_upper_levels, 51 | prepopulate_page_cache, 52 | overlay_window_length, 53 | )), 54 | Backend::SpTrie => { 55 | #[cfg(not(feature = "sp-trie"))] 56 | panic!("benchtop not compiled with feature sp-trie. rebuild"); 57 | #[cfg(feature = "sp-trie")] 58 | DB::SpTrie(SpTrieDB::open(reset)) 59 | } 60 | } 61 | } 62 | } 63 | 64 | /// A transaction over the database which allows reading and writing. 65 | pub trait Transaction { 66 | /// Read a value from the database. If a value was previously written, return that. 67 | fn read(&mut self, key: &[u8]) -> Option>; 68 | 69 | /// Note that a value was read from a cache, for inclusion in a storage proof. 70 | fn note_read(&mut self, key: &[u8], value: Option>); 71 | 72 | /// Write a value to the database. `None` means to delete the previous value. 73 | fn write(&mut self, key: &[u8], value: Option<&[u8]>); 74 | } 75 | 76 | /// A wrapper around all databases implemented in this tool. 77 | pub enum DB { 78 | #[cfg(feature = "sov-db")] 79 | Sov(SovDB), 80 | #[cfg(feature = "sp-trie")] 81 | SpTrie(SpTrieDB), 82 | Nomt(NomtDB), 83 | } 84 | 85 | impl DB { 86 | /// Execute a workload repeatedly until done or a time limit is reached. 87 | pub fn execute( 88 | &mut self, 89 | mut timer: Option<&mut Timer>, 90 | workload: &mut dyn Workload, 91 | timeout: Option, 92 | ) { 93 | while !workload.is_done() { 94 | if timeout 95 | .as_ref() 96 | .map_or(false, |t| std::time::Instant::now() > *t) 97 | { 98 | break; 99 | } 100 | let timer = timer.as_deref_mut(); 101 | match self { 102 | #[cfg(feature = "sov-db")] 103 | DB::Sov(db) => db.execute(timer, workload), 104 | #[cfg(feature = "sp-trie")] 105 | DB::SpTrie(db) => db.execute(timer, workload), 106 | DB::Nomt(db) => db.execute(timer, workload), 107 | } 108 | } 109 | } 110 | 111 | /// Execute several workloads in parallel, repeatedly, until all done or a time limit is reached. 112 | /// 113 | /// Only works with the NOMT backend. 
114 | pub fn parallel_execute( 115 | &mut self, 116 | mut timer: Option<&mut Timer>, 117 | thread_pool: &rayon::ThreadPool, 118 | workloads: &mut [Box], 119 | timeout: Option, 120 | ) -> anyhow::Result<()> { 121 | while workloads.iter().any(|w| !w.is_done()) { 122 | if timeout 123 | .as_ref() 124 | .map_or(false, |t| std::time::Instant::now() > *t) 125 | { 126 | break; 127 | } 128 | let timer = timer.as_deref_mut(); 129 | match self { 130 | #[cfg(feature = "sov-db")] 131 | DB::Sov(_) => { 132 | anyhow::bail!("parallel execution is only supported with the NOMT backend.") 133 | } 134 | #[cfg(feature = "sp-trie")] 135 | DB::SpTrie(_) => { 136 | anyhow::bail!("parallel execution is only supported with the NOMT backend.") 137 | } 138 | DB::Nomt(db) => db.parallel_execute(timer, thread_pool, workloads), 139 | } 140 | } 141 | 142 | Ok(()) 143 | } 144 | 145 | /// Print metrics collected by the Backend if it supports metrics collection 146 | pub fn print_metrics(&self) { 147 | match self { 148 | DB::Nomt(db) => db.print_metrics(), 149 | #[cfg(any(feature = "sp-trie", feature = "sov-db"))] 150 | _ => (), 151 | } 152 | } 153 | } 154 | -------------------------------------------------------------------------------- /benchtop/src/bench.rs: -------------------------------------------------------------------------------- 1 | use crate::{ 2 | backend::Backend, 3 | cli::bench::BenchType, 4 | timer::Timer, 5 | workload, 6 | workload::{Init, Workload}, 7 | }; 8 | use anyhow::Result; 9 | 10 | pub fn bench(bench_type: BenchType) -> Result<()> { 11 | let common_params = match bench_type { 12 | BenchType::Isolate(ref params) => ¶ms.common_params, 13 | BenchType::Sequential(ref params) => ¶ms.common_params, 14 | }; 15 | 16 | let (init, workload) = workload::parse( 17 | common_params.workload.name.as_str(), 18 | common_params.workload.size, 19 | common_params 20 | .workload 21 | .initial_capacity 22 | .map(|s| 1u64 << s) 23 | .unwrap_or(0), 24 | common_params.workload.percentage_cold, 25 | )?; 26 | let commit_concurrency = common_params.workload.commit_concurrency; 27 | let io_workers = common_params.workload.io_workers; 28 | 29 | let backends = if common_params.backends.is_empty() { 30 | Backend::all_backends() 31 | } else { 32 | common_params.backends.clone() 33 | }; 34 | 35 | match bench_type { 36 | BenchType::Isolate(params) => bench_isolate( 37 | init, 38 | workload, 39 | backends, 40 | params.iterations, 41 | true, 42 | commit_concurrency, 43 | io_workers, 44 | ) 45 | .map(|_| ()), 46 | BenchType::Sequential(params) => bench_sequential( 47 | init, 48 | workload, 49 | backends, 50 | params.op_limit, 51 | params.time_limit, 52 | true, 53 | commit_concurrency, 54 | io_workers, 55 | ) 56 | .map(|_| ()), 57 | } 58 | } 59 | 60 | // Benchmark the workload across multiple backends multiple times. 61 | // Each iteration will be executed on a freshly initialized database. 
62 | // 63 | // Return the mean execution time of the workloads for each backends 64 | // in the order the backends are provided 65 | pub fn bench_isolate( 66 | mut init: Init, 67 | mut workload: Box, 68 | backends: Vec, 69 | iterations: u64, 70 | print: bool, 71 | commit_concurrency: usize, 72 | io_workers: usize, 73 | ) -> Result> { 74 | let mut mean_results = vec![]; 75 | for backend in backends { 76 | let mut timer = Timer::new(format!("{}", backend)); 77 | 78 | for _ in 0..iterations { 79 | let mut db = backend.instantiate(true, commit_concurrency, io_workers); 80 | db.execute(None, &mut init); 81 | db.execute(Some(&mut timer), &mut *workload); 82 | db.print_metrics(); 83 | } 84 | 85 | if print { 86 | timer.print(); 87 | } 88 | mean_results.push(timer.get_mean_workload_duration()?); 89 | } 90 | 91 | Ok(mean_results) 92 | } 93 | 94 | // Benchmark the workload across multiple backends multiple times. 95 | // Each iteration will be executed on the same db repeatedly 96 | // without clearing it until a time or operation count limit is reaced. 97 | // 98 | // Return the mean execution time of the workloads for each backends 99 | // in the order the backends are provided 100 | pub fn bench_sequential( 101 | mut init: Init, 102 | mut workload: Box, 103 | backends: Vec, 104 | op_limit: Option, 105 | time_limit: Option, 106 | print: bool, 107 | commit_concurrency: usize, 108 | io_workers: usize, 109 | ) -> Result> { 110 | if let (None, None) = (op_limit, time_limit) { 111 | anyhow::bail!("You need to specify at least one limiter between operations and time") 112 | } 113 | 114 | let mut mean_results = vec![]; 115 | 116 | for backend in backends { 117 | let mut timer = Timer::new(format!("{}", backend)); 118 | let mut db = backend.instantiate(true, commit_concurrency, io_workers); 119 | 120 | let mut elapsed_time = 0; 121 | let mut op_count = 0; 122 | 123 | db.execute(None, &mut init); 124 | 125 | loop { 126 | db.execute(Some(&mut timer), &mut *workload); 127 | 128 | // check if time limit exceeded 129 | elapsed_time += timer.get_last_workload_duration()?; 130 | match time_limit { 131 | Some(limit) if elapsed_time >= (limit * 1000000) => break, 132 | _ => (), 133 | }; 134 | 135 | // check if op limit exceeded 136 | op_count += workload.size() as u64; 137 | match op_limit { 138 | Some(limit) if op_count >= limit => break, 139 | _ => (), 140 | }; 141 | } 142 | 143 | db.print_metrics(); 144 | 145 | if print { 146 | timer.print(); 147 | } 148 | mean_results.push(timer.get_mean_workload_duration()?); 149 | } 150 | Ok(mean_results) 151 | } 152 | -------------------------------------------------------------------------------- /benchtop/src/custom_workload.rs: -------------------------------------------------------------------------------- 1 | use crate::{ 2 | backend::Transaction, 3 | cli::StateItemDistribution, 4 | workload::{Distribution, Workload}, 5 | }; 6 | use rand::Rng; 7 | 8 | #[derive(Clone)] 9 | pub struct RwInit { 10 | cur_val: u64, 11 | num_vals: u64, 12 | } 13 | 14 | impl Workload for RwInit { 15 | fn run_step(&mut self, transaction: &mut dyn Transaction) { 16 | const MAX_INIT_PER_ITERATION: u64 = 64 * 1024 * 1024; 17 | 18 | if self.num_vals == 0 { 19 | return; 20 | } 21 | 22 | let count = std::cmp::min(self.num_vals - self.cur_val, MAX_INIT_PER_ITERATION); 23 | for _ in 0..count { 24 | transaction.write(&encode_id(self.cur_val), Some(&[64u8; 32])); 25 | self.cur_val += 1; 26 | } 27 | println!( 28 | "populating {:.1}%", 29 | 100.0 * (self.cur_val as f64) / (self.num_vals as f64) 30 | ); 31 
| } 32 | 33 | fn is_done(&self) -> bool { 34 | self.num_vals == self.cur_val 35 | } 36 | } 37 | 38 | /// Greate a workload for initializing a database with the given amount of key-value pairs. 39 | pub fn init(db_size: u64) -> RwInit { 40 | RwInit { 41 | cur_val: 0, 42 | num_vals: db_size, 43 | } 44 | } 45 | 46 | fn encode_id(id: u64) -> [u8; 8] { 47 | id.to_be_bytes() 48 | } 49 | 50 | /// Build N `RwWorkload`s, one for each thread. 51 | pub fn build( 52 | reads: u8, 53 | writes: u8, 54 | workload_size: u64, 55 | fresh: u8, 56 | db_size: u64, 57 | op_limit: u64, 58 | threads: usize, 59 | distribution: StateItemDistribution, 60 | ) -> Vec { 61 | let thread_workload_size = workload_size / threads as u64; 62 | let db_step = db_size / threads as u64; 63 | 64 | (0..threads) 65 | .map(|i| { 66 | let db_start = db_step * i as u64; 67 | 68 | RwWorkload { 69 | reads, 70 | writes, 71 | fresh, 72 | workload_size: if i == threads - 1 { 73 | thread_workload_size + workload_size % threads as u64 74 | } else { 75 | thread_workload_size 76 | }, 77 | ops_remaining: op_limit / threads as u64, 78 | distribution: Distribution::new(distribution, db_start, db_start + db_step), 79 | } 80 | }) 81 | .collect() 82 | } 83 | 84 | // The read-write workload will follow these rules: 85 | // 1. Reads and writes are randomly and uniformly distributed across the key space. 86 | // 2. The DB size indicates the number of entries in the database. 87 | // 3. The workload size represents the total number of operations, where reads and writes 88 | // are numbers that need to sum to 100 and represent a percentage of the total size. 89 | // 4. Fresh indicates the percentage of reads and writes that will be performed on 90 | // non-existing keys 91 | pub struct RwWorkload { 92 | pub reads: u8, 93 | pub writes: u8, 94 | pub workload_size: u64, 95 | pub fresh: u8, 96 | pub ops_remaining: u64, 97 | pub distribution: Distribution, 98 | } 99 | 100 | impl Workload for RwWorkload { 101 | fn run_step(&mut self, transaction: &mut dyn Transaction) { 102 | let from_percentage = |p: u8| (self.workload_size as f64 * p as f64 / 100.0) as u64; 103 | let fresh = |size: u64| (size as f64 * self.fresh as f64 / 100.0) as u64; 104 | 105 | // total reads and writes 106 | let n_reads = from_percentage(self.reads); 107 | let n_writes = from_percentage(self.writes); 108 | // fresh reads and writes 109 | let n_reads_fresh = fresh(n_reads); 110 | let n_writes_fresh = fresh(n_writes); 111 | 112 | let mut rng = rand::thread_rng(); 113 | 114 | for i in 0..n_reads { 115 | let _ = if i < n_reads_fresh { 116 | // fresh read, technically there is a chance to generate 117 | // a random key that is already present in the database, 118 | // but it is very unlikely 119 | transaction.read(&rand_key(&mut rng)) 120 | } else { 121 | // read already existing key 122 | let key = self.distribution.sample(&mut rng); 123 | transaction.read(&encode_id(key)) 124 | }; 125 | } 126 | 127 | for i in 0..n_writes { 128 | let value = rand_key(&mut rng); 129 | if i < n_writes_fresh { 130 | // fresh write 131 | transaction.write(&rand_key(&mut rng), Some(&value)); 132 | } else { 133 | // substitute key 134 | let key = self.distribution.sample(&mut rng); 135 | transaction.write(&encode_id(key), Some(&value)); 136 | }; 137 | } 138 | 139 | self.ops_remaining = self.ops_remaining.saturating_sub(self.workload_size); 140 | } 141 | 142 | fn is_done(&self) -> bool { 143 | self.ops_remaining == 0 144 | } 145 | } 146 | 147 | fn rand_key(rng: &mut impl Rng) -> [u8; 32] { 148 | // keys must be 
uniformly distributed 149 | let mut key = [0; 32]; 150 | rng.fill(&mut key[..16]); 151 | key 152 | } 153 | -------------------------------------------------------------------------------- /benchtop/src/main.rs: -------------------------------------------------------------------------------- 1 | mod backend; 2 | mod cli; 3 | mod custom_workload; 4 | mod nomt; 5 | 6 | #[cfg(feature = "sov-db")] 7 | mod sov_db; 8 | #[cfg(feature = "sp-trie")] 9 | mod sp_trie; 10 | 11 | mod timer; 12 | mod transfer_workload; 13 | mod workload; 14 | 15 | use anyhow::Result; 16 | use clap::Parser; 17 | use cli::{Cli, Commands, InitParams, RunParams}; 18 | use timer::Timer; 19 | 20 | pub fn main() -> Result<()> { 21 | let cli = Cli::parse(); 22 | 23 | match cli.command { 24 | Commands::Init(params) => init(params), 25 | Commands::Run(params) => run(params), 26 | } 27 | } 28 | 29 | pub fn init(params: InitParams) -> Result<()> { 30 | let workload_params = params.workload; 31 | let (mut init, _) = workload::parse(&workload_params, u64::max_value())?; 32 | 33 | let mut db = params.backend.instantiate( 34 | true, 35 | workload_params.commit_concurrency, 36 | workload_params.io_workers, 37 | workload_params.hashtable_buckets, 38 | workload_params.page_cache_size, 39 | workload_params.leaf_cache_size, 40 | workload_params.page_cache_upper_levels, 41 | workload_params.prepopulate_page_cache, 42 | 0, 43 | ); 44 | db.execute(None, &mut *init, None); 45 | 46 | Ok(()) 47 | } 48 | 49 | pub fn run(params: RunParams) -> Result<()> { 50 | let workload_params = params.workload; 51 | let (mut init, mut workloads) = workload::parse( 52 | &workload_params, 53 | params.limits.ops.unwrap_or(u64::max_value()), 54 | )?; 55 | 56 | let mut db = params.backend.instantiate( 57 | params.reset, 58 | workload_params.commit_concurrency, 59 | workload_params.io_workers, 60 | workload_params.hashtable_buckets, 61 | workload_params.page_cache_size, 62 | workload_params.leaf_cache_size, 63 | workload_params.page_cache_upper_levels, 64 | workload_params.prepopulate_page_cache, 65 | workload_params.overlay_window_length, 66 | ); 67 | 68 | if params.reset { 69 | db.execute(None, &mut *init, None); 70 | } 71 | 72 | let mut timer = Timer::new(format!("{}", params.backend)); 73 | let warmup_timeout = params 74 | .warm_up 75 | .map(|time_limit| std::time::Instant::now() + time_limit.into()); 76 | 77 | let thread_pool = rayon::ThreadPoolBuilder::new() 78 | .thread_name(|_| "benchtop-workload".into()) 79 | .num_threads(workload_params.workload_concurrency as usize) 80 | .build()?; 81 | 82 | if let Some(t) = warmup_timeout { 83 | if workload_params.workload_concurrency == 1 { 84 | db.execute(Some(&mut timer), &mut *workloads[0], Some(t)); 85 | } else { 86 | db.parallel_execute(Some(&mut timer), &thread_pool, &mut workloads, Some(t))?; 87 | }; 88 | 89 | timer = Timer::new(format!("{}", params.backend)); 90 | } 91 | 92 | let timeout = params 93 | .limits 94 | .time 95 | .map(|time_limit| std::time::Instant::now() + time_limit.into()); 96 | 97 | if workload_params.workload_concurrency == 1 { 98 | db.execute(Some(&mut timer), &mut *workloads[0], timeout); 99 | } else { 100 | db.parallel_execute(Some(&mut timer), &thread_pool, &mut workloads, timeout)?; 101 | }; 102 | 103 | db.print_metrics(); 104 | timer.print(workload_params.size); 105 | print_max_rss(); 106 | 107 | Ok(()) 108 | } 109 | 110 | fn print_max_rss() { 111 | let max_rss = get_max_rss().unwrap_or(0); 112 | println!("max rss: {} MiB", max_rss / 1024); 113 | fn get_max_rss() -> Option { 114 | let mut 
usage: libc::rusage = unsafe { std::mem::zeroed() }; 115 | let ret = unsafe { libc::getrusage(libc::RUSAGE_SELF, &mut usage) }; 116 | if ret == 0 { 117 | Some(usage.ru_maxrss as usize) 118 | } else { 119 | None 120 | } 121 | } 122 | } 123 | -------------------------------------------------------------------------------- /benchtop/src/sp_trie.rs: -------------------------------------------------------------------------------- 1 | use crate::{backend::Transaction, timer::Timer, workload::Workload}; 2 | use hash_db::{AsHashDB, HashDB, Prefix}; 3 | use kvdb::KeyValueDB; 4 | use kvdb_rocksdb::{Database, DatabaseConfig}; 5 | use sha2::Digest; 6 | use sp_trie::trie_types::TrieDBMutBuilderV1; 7 | use sp_trie::{DBValue, LayoutV1, PrefixedMemoryDB, TrieDBMut}; 8 | use std::sync::Arc; 9 | use trie_db::TrieMut; 10 | 11 | type Hasher = sp_core::Blake2Hasher; 12 | type Hash = sp_core::H256; 13 | 14 | const SP_TRIE_DB_FOLDER: &str = "sp_trie_db"; 15 | 16 | const NUM_COLUMNS: u32 = 2; 17 | const COL_TRIE: u32 = 0; 18 | const COL_ROOT: u32 = 1; 19 | 20 | const ROOT_KEY: &[u8] = b"root"; 21 | 22 | pub struct SpTrieDB { 23 | pub kvdb: Arc, 24 | pub root: Hash, 25 | } 26 | 27 | pub struct Trie<'a> { 28 | pub db: Arc, 29 | pub overlay: &'a mut PrefixedMemoryDB, 30 | } 31 | 32 | impl SpTrieDB { 33 | pub fn open(reset: bool) -> Self { 34 | if reset { 35 | // Delete previously existing db 36 | let _ = std::fs::remove_dir_all(SP_TRIE_DB_FOLDER); 37 | } 38 | 39 | let db_cfg = DatabaseConfig::with_columns(NUM_COLUMNS); 40 | let kvdb = 41 | Arc::new(Database::open(&db_cfg, SP_TRIE_DB_FOLDER).expect("Database backend error")); 42 | 43 | let root = match kvdb.get(COL_ROOT, ROOT_KEY).unwrap() { 44 | None => Hash::default(), 45 | Some(r) => Hash::from_slice(&r[..32]), 46 | }; 47 | 48 | Self { kvdb, root } 49 | } 50 | 51 | pub fn execute(&mut self, mut timer: Option<&mut Timer>, workload: &mut dyn Workload) { 52 | let _timer_guard_total = timer.as_mut().map(|t| t.record_span("workload")); 53 | 54 | let mut new_root = self.root; 55 | let mut overlay = PrefixedMemoryDB::default(); 56 | 57 | let mut trie = Trie { 58 | db: self.kvdb.clone(), 59 | overlay: &mut overlay, 60 | }; 61 | 62 | let recorder: sp_trie::recorder::Recorder = Default::default(); 63 | let _timer_guard_commit = { 64 | let mut trie_recorder = recorder.as_trie_recorder(new_root); 65 | 66 | let trie_db_mut = if self.root == Hash::default() { 67 | TrieDBMutBuilderV1::new(&mut trie, &mut new_root) 68 | .with_recorder(&mut trie_recorder) 69 | .build() 70 | } else { 71 | TrieDBMutBuilderV1::from_existing(&mut trie, &mut new_root) 72 | .with_recorder(&mut trie_recorder) 73 | .build() 74 | }; 75 | 76 | let mut transaction = Tx { 77 | trie: trie_db_mut, 78 | timer, 79 | }; 80 | workload.run_step(&mut transaction); 81 | let Tx { 82 | trie: mut trie_db_mut, 83 | mut timer, 84 | } = transaction; 85 | 86 | let timer_guard_commit = timer.as_mut().map(|t| t.record_span("commit_and_prove")); 87 | 88 | trie_db_mut.commit(); 89 | timer_guard_commit 90 | }; 91 | 92 | let _proof = recorder.drain_storage_proof().is_empty(); 93 | 94 | let mut transaction = self.kvdb.transaction(); 95 | for (key, (value, ref_count)) in overlay.drain() { 96 | if ref_count > 0 { 97 | transaction.put(COL_TRIE, &key[..], &value[..]) 98 | } else if ref_count < 0 { 99 | transaction.delete(COL_TRIE, &key[..]) 100 | } 101 | } 102 | transaction.put(COL_ROOT, ROOT_KEY, new_root.as_bytes()); 103 | self.kvdb 104 | .write(transaction) 105 | .expect("Failed to write transaction"); 106 | 107 | self.root = 
new_root; 108 | } 109 | } 110 | 111 | struct Tx<'a> { 112 | trie: TrieDBMut<'a, LayoutV1>, 113 | timer: Option<&'a mut Timer>, 114 | } 115 | 116 | // sp_trie does not require hashed keys, 117 | // but if keys are not hashed, the comparison does not seem to be efficient. 118 | // Not applying hashing to keys would significantly speed up sp_trie. 119 | impl<'a> Transaction for Tx<'a> { 120 | fn read(&mut self, key: &[u8]) -> Option> { 121 | let key_path = sha2::Sha256::digest(key); 122 | 123 | let _timer_guard_read = self.timer.as_mut().map(|t| t.record_span("read")); 124 | self.trie 125 | .get(&key_path) 126 | .expect("Impossible fetching from sp-trie db") 127 | } 128 | 129 | fn note_read(&mut self, key: &[u8], _value: Option>) { 130 | let _ = self.read(key); 131 | } 132 | 133 | fn write(&mut self, key: &[u8], value: Option<&[u8]>) { 134 | let key_path = sha2::Sha256::digest(key); 135 | 136 | self.trie 137 | .insert(&key_path, &value.unwrap_or(&[])) 138 | .expect("Impossible writing into sp-trie db"); 139 | } 140 | } 141 | 142 | impl<'a> AsHashDB for Trie<'a> { 143 | fn as_hash_db(&self) -> &dyn hash_db::HashDB { 144 | self 145 | } 146 | 147 | fn as_hash_db_mut<'b>(&'b mut self) -> &'b mut (dyn HashDB + 'b) { 148 | &mut *self 149 | } 150 | } 151 | 152 | impl<'a> HashDB for Trie<'a> { 153 | fn get(&self, key: &Hash, prefix: Prefix) -> Option { 154 | if let Some(value) = self.overlay.get(key, prefix) { 155 | return Some(value); 156 | } 157 | 158 | let key = sp_trie::prefixed_key::(key, prefix); 159 | self.db.get(0, &key).expect("Database backend error") 160 | } 161 | 162 | fn contains(&self, hash: &Hash, prefix: Prefix) -> bool { 163 | self.get(hash, prefix).is_some() 164 | } 165 | 166 | fn insert(&mut self, prefix: Prefix, value: &[u8]) -> Hash { 167 | self.overlay.insert(prefix, value) 168 | } 169 | 170 | fn emplace(&mut self, key: Hash, prefix: Prefix, value: DBValue) { 171 | self.overlay.emplace(key, prefix, value); 172 | } 173 | 174 | fn remove(&mut self, key: &Hash, prefix: Prefix) { 175 | self.overlay.remove(key, prefix) 176 | } 177 | } 178 | -------------------------------------------------------------------------------- /benchtop/src/timer.rs: -------------------------------------------------------------------------------- 1 | use std::{ 2 | cell::RefCell, 3 | collections::hash_map::{Entry, HashMap}, 4 | rc::Rc, 5 | }; 6 | 7 | // At least three spans are expected to be measured 8 | // + `workload` 9 | // + `read` 10 | // + `commit_and_prove` 11 | pub struct Timer { 12 | name: String, 13 | spans: HashMap<&'static str, Rc>>>, 14 | } 15 | 16 | impl Timer { 17 | pub fn new(name: String) -> Self { 18 | Self { 19 | name, 20 | spans: HashMap::new(), 21 | } 22 | } 23 | 24 | pub fn record_span(&mut self, span_name: &'static str) -> impl Drop { 25 | struct RecordSpan { 26 | h: Rc>>, 27 | start: std::time::Instant, 28 | } 29 | impl Drop for RecordSpan { 30 | fn drop(&mut self) { 31 | let elapsed = self.start.elapsed().as_nanos() as u64; 32 | self.h.borrow_mut().record(elapsed).unwrap(); 33 | } 34 | } 35 | 36 | let h = self.spans.entry(span_name).or_insert_with(|| { 37 | Rc::new(RefCell::new( 38 | hdrhistogram::Histogram::::new(3).unwrap(), 39 | )) 40 | }); 41 | 42 | RecordSpan { 43 | h: h.clone(), 44 | start: std::time::Instant::now(), 45 | } 46 | } 47 | 48 | pub fn freeze(self) -> FrozenTimer { 49 | FrozenTimer { 50 | spans: self 51 | .spans 52 | .into_iter() 53 | .map(|(name, histogram)| (name, Rc::into_inner(histogram).unwrap().into_inner())) 54 | .collect(), 55 | } 56 | } 57 | 58 | pub fn 
add(&mut self, other: FrozenTimer) { 59 | for (span_name, new_data) in other.spans { 60 | match self.spans.entry(span_name) { 61 | Entry::Occupied(e) => e.get().borrow_mut().add(new_data).unwrap(), 62 | Entry::Vacant(e) => { 63 | let _ = e.insert(Rc::new(RefCell::new(new_data))); 64 | } 65 | } 66 | } 67 | } 68 | 69 | pub fn get_last_workload_duration(&self) -> anyhow::Result { 70 | let h = self 71 | .spans 72 | .get("workload") 73 | .ok_or(anyhow::anyhow!("`workload` span not recorded"))?; 74 | 75 | Ok(h.borrow() 76 | .iter_recorded() 77 | .last() 78 | .ok_or(anyhow::anyhow!("No recorded value for `workload` span"))? 79 | .value_iterated_to()) 80 | } 81 | 82 | pub fn get_mean_workload_duration(&self) -> anyhow::Result { 83 | Ok(self 84 | .spans 85 | .get("workload") 86 | .ok_or(anyhow::anyhow!("`workload` span not recorded"))? 87 | .borrow() 88 | .mean() as u64) 89 | } 90 | 91 | pub fn print(&mut self, workload_size: u64) { 92 | println!("{}", self.name); 93 | 94 | let expected_spans = ["workload", "read", "commit_and_prove"]; 95 | 96 | // print expectd spans in order 97 | for span_name in expected_spans { 98 | let h = self.spans.get(span_name); 99 | match h { 100 | Some(h) => println!( 101 | " mean {}: {}", 102 | span_name, 103 | pretty_display_ns(h.borrow().mean() as u64) 104 | ), 105 | None => println!("{} not measured", span_name), 106 | }; 107 | } 108 | 109 | if let Ok(workload_mean_ns) = self.get_mean_workload_duration() { 110 | let ops_per_second = workload_size as f64 / (workload_mean_ns as f64 / 1_000_000_000.0); 111 | println!(" mean throughput: {ops_per_second:.1} ops/s"); 112 | } 113 | 114 | // print all other measured spans 115 | for (span_name, h) in &self.spans { 116 | if expected_spans.contains(span_name) { 117 | continue; 118 | } 119 | 120 | println!( 121 | " mean {}: {}", 122 | span_name, 123 | pretty_display_ns(h.borrow().mean() as u64) 124 | ) 125 | } 126 | } 127 | } 128 | 129 | pub struct FrozenTimer { 130 | spans: HashMap<&'static str, hdrhistogram::Histogram>, 131 | } 132 | 133 | pub fn pretty_display_ns(ns: u64) -> String { 134 | // preserve 3 sig figs at minimum. 
135 | let (val, unit) = if ns > 100 * 1_000_000_000 { 136 | (ns / 1_000_000_000, "s") 137 | } else if ns > 100 * 1_000_000 { 138 | (ns / 1_000_000, "ms") 139 | } else if ns > 100 * 1_000 { 140 | (ns / 1_000, "us") 141 | } else { 142 | (ns, "ns") 143 | }; 144 | 145 | format!("{val} {unit}") 146 | } 147 | -------------------------------------------------------------------------------- /benchtop/src/transfer_workload.rs: -------------------------------------------------------------------------------- 1 | use crate::{ 2 | backend::Transaction, 3 | cli::StateItemDistribution, 4 | workload::{Distribution, Workload}, 5 | }; 6 | use rand::Rng; 7 | 8 | #[derive(Clone)] 9 | pub struct TransferInit { 10 | cur_account: u64, 11 | num_accounts: u64, 12 | } 13 | 14 | impl Workload for TransferInit { 15 | fn run_step(&mut self, transaction: &mut dyn Transaction) { 16 | const MAX_INIT_PER_ITERATION: u64 = 64 * 1024; 17 | 18 | if self.num_accounts == 0 { 19 | return; 20 | } 21 | 22 | let count = std::cmp::min(self.num_accounts - self.cur_account, MAX_INIT_PER_ITERATION); 23 | for _ in 0..count { 24 | transaction.write(&encode_id(self.cur_account), Some(&encode_balance(1000))); 25 | self.cur_account += 1; 26 | } 27 | println!( 28 | "populating {:.1}%", 29 | 100.0 * (self.cur_account as f64) / (self.num_accounts as f64) 30 | ); 31 | } 32 | 33 | fn is_done(&self) -> bool { 34 | self.cur_account == self.num_accounts 35 | } 36 | } 37 | 38 | /// Create an initialization command for a transfer database. 39 | pub fn init(num_accounts: u64) -> TransferInit { 40 | TransferInit { 41 | cur_account: 0, 42 | num_accounts, 43 | } 44 | } 45 | 46 | fn encode_id(id: u64) -> [u8; 8] { 47 | id.to_be_bytes() 48 | } 49 | 50 | fn encode_balance(balance: u64) -> [u8; 8] { 51 | balance.to_be_bytes() 52 | } 53 | 54 | fn decode_balance(encoded: &[u8]) -> u64 { 55 | let mut buf = [0; 8]; 56 | buf.copy_from_slice(encoded); 57 | u64::from_be_bytes(buf) 58 | } 59 | 60 | /// Build a new workload meant to emulate transfers. 61 | /// 62 | /// `num_accounts` refers to the amount of accounts in the database. 63 | /// 64 | /// `percentage_cold_transfer` ranges from 0 to 100 and indicates the proportion of transfers 65 | /// which should be sent to a fresh account. 66 | pub fn build( 67 | num_accounts: u64, 68 | workload_size: u64, 69 | percentage_cold_transfer: u8, 70 | op_limit: u64, 71 | threads: usize, 72 | distribution: StateItemDistribution, 73 | ) -> Vec { 74 | let thread_workload_size = workload_size / threads as u64; 75 | let num_accounts_step = num_accounts / threads as u64; 76 | 77 | (0..threads) 78 | .map(|i| { 79 | let start_account = num_accounts_step * i as u64; 80 | let end_account = if i == threads - 1 { 81 | num_accounts 82 | } else { 83 | num_accounts_step * (i as u64 + 1) 84 | }; 85 | TransferWorkload { 86 | num_accounts, 87 | workload_size: thread_workload_size, 88 | percentage_cold_transfer, 89 | ops_remaining: op_limit / threads as u64, 90 | distribution: Distribution::new(distribution, start_account, end_account), 91 | } 92 | }) 93 | .collect() 94 | } 95 | 96 | /// A transfer-like workload. 97 | pub struct TransferWorkload { 98 | /// The number of accounts in the system. 99 | pub num_accounts: u64, 100 | /// The size of the workload. 101 | pub workload_size: u64, 102 | /// The percentage of transfers to make to fresh accounts. 103 | pub percentage_cold_transfer: u8, 104 | /// The number of remaining operations before being considered 'done'. 
105 | pub ops_remaining: u64, 106 | /// The random distribution to use to sample state items. 107 | pub distribution: Distribution, 108 | } 109 | 110 | impl Workload for TransferWorkload { 111 | fn run_step(&mut self, transaction: &mut dyn Transaction) { 112 | let cold_sends = 113 | (self.workload_size as f64 * (self.percentage_cold_transfer as f64 / 100.0)) as u64; 114 | let warm_sends = self.workload_size - cold_sends; 115 | 116 | let mut rng = rand::thread_rng(); 117 | for i in 0..self.workload_size { 118 | let send_account = self.distribution.sample(&mut rng); 119 | let recv_account = if i < warm_sends { 120 | let mut r = self.distribution.sample(&mut rng); 121 | while r == send_account { 122 | r = self.distribution.sample(&mut rng); 123 | } 124 | r 125 | } else { 126 | // odds of two threads generating the same random account here are 127 | // incredibly low. 128 | rng.gen_range(self.num_accounts..u64::max_value()) 129 | }; 130 | 131 | let send_balance = decode_balance( 132 | &transaction 133 | .read(&encode_id(send_account)) 134 | .expect("account exists"), 135 | ); 136 | let recv_balance = transaction 137 | .read(&encode_id(recv_account)) 138 | .map_or(0, |v| decode_balance(&v)); 139 | 140 | let new_send_balance = if send_balance == 0 { 141 | 1000 // yay, free money. 142 | } else { 143 | send_balance - 1 144 | }; 145 | let new_recv_balance = recv_balance + 1; 146 | 147 | transaction.write( 148 | &encode_id(send_account), 149 | Some(&encode_balance(new_send_balance)), 150 | ); 151 | transaction.write( 152 | &encode_id(recv_account), 153 | Some(&encode_balance(new_recv_balance)), 154 | ); 155 | } 156 | 157 | self.ops_remaining = self.ops_remaining.saturating_sub(self.workload_size); 158 | } 159 | 160 | fn is_done(&self) -> bool { 161 | self.ops_remaining == 0 162 | } 163 | } 164 | -------------------------------------------------------------------------------- /core/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "nomt-core" 3 | description = "Core trie operations for NOMT" 4 | version = "1.0.0-preview" 5 | authors.workspace = true 6 | homepage.workspace = true 7 | repository.workspace = true 8 | edition.workspace = true 9 | license.workspace = true 10 | 11 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html 12 | 13 | [dependencies] 14 | bitvec.workspace = true 15 | hex.workspace = true 16 | ruint.workspace = true 17 | arrayvec.workspace = true 18 | borsh = { workspace = true, optional = true } 19 | blake3 = { workspace = true, optional = true } 20 | sha2 = { workspace = true, optional = true } 21 | serde = { workspace = true, optional = true } 22 | digest = { workspace = true } 23 | 24 | [dev-dependencies] 25 | blake3.workspace = true 26 | 27 | [features] 28 | default = ["std", "blake3-hasher", "sha2-hasher"] 29 | std = ["bitvec/std", "borsh?/std", "serde?/std"] 30 | borsh = ["dep:borsh"] 31 | blake3-hasher = ["dep:blake3"] 32 | sha2-hasher = ["dep:sha2"] 33 | serde = ["dep:serde", "serde/alloc"] 34 | -------------------------------------------------------------------------------- /core/src/hasher.rs: -------------------------------------------------------------------------------- 1 | //! Hashers (feature-gated) and utilities for implementing them. 2 | 3 | use crate::trie::{InternalData, LeafData, Node, NodeKind, TERMINATOR}; 4 | 5 | /// A trie node hash function specialized for 64 bytes of data. 
6 | /// 7 | /// Note that it is illegal for the produced hash to equal [0; 32], as this value is reserved 8 | /// for the terminator node. 9 | /// 10 | /// A node hasher should domain-separate internal and leaf nodes in some specific way. The 11 | /// recommended approach for binary hashes is to set the MSB to 0 or 1 depending on the node kind. 12 | /// However, for other kinds of hashes (e.g. Poseidon2 or other algebraic hashes), other labeling 13 | /// schemes may be required. 14 | pub trait NodeHasher { 15 | /// Hash a leaf. This should domain-separate the hash 16 | /// according to the node kind. 17 | fn hash_leaf(data: &LeafData) -> [u8; 32]; 18 | 19 | /// Hash an internal node. This should domain-separate 20 | /// the hash according to the node kind. 21 | fn hash_internal(data: &InternalData) -> [u8; 32]; 22 | 23 | /// Get the kind of the given node. 24 | fn node_kind(node: &Node) -> NodeKind; 25 | } 26 | 27 | /// A hasher for arbitrary-length values. 28 | pub trait ValueHasher { 29 | /// Hash an arbitrary-length value. 30 | fn hash_value(value: &[u8]) -> [u8; 32]; 31 | } 32 | 33 | /// Get the node kind, according to a most-significant bit labeling scheme. 34 | /// 35 | /// If the MSB is true, it's a leaf. If the node is empty, it's a [`TERMINATOR`]. Otherwise, it's 36 | /// an internal node. 37 | pub fn node_kind_by_msb(node: &Node) -> NodeKind { 38 | if node[0] >> 7 == 1 { 39 | NodeKind::Leaf 40 | } else if node == &TERMINATOR { 41 | NodeKind::Terminator 42 | } else { 43 | NodeKind::Internal 44 | } 45 | } 46 | 47 | /// Set the most-significant bit of the node. 48 | pub fn set_msb(node: &mut Node) { 49 | node[0] |= 0b10000000; 50 | } 51 | 52 | pub fn unset_msb(node: &mut Node) { 53 | node[0] &= 0b01111111; 54 | } 55 | 56 | /// A simple trait for representing binary hash functions. 57 | pub trait BinaryHash { 58 | /// Given a bit-string, produce a 32-bit hash. 59 | fn hash(input: &[u8]) -> [u8; 32]; 60 | 61 | /// An optional specialization of `hash` where there are two 32-byte inputs, left and right. 62 | fn hash2_32_concat(left: &[u8; 32], right: &[u8; 32]) -> [u8; 32] { 63 | let mut buf = [0u8; 64]; 64 | buf[0..32].copy_from_slice(left); 65 | buf[32..64].copy_from_slice(right); 66 | Self::hash(&buf) 67 | } 68 | } 69 | 70 | /// A node and value hasher constructed from a simple binary hasher. 71 | /// 72 | /// This implements a [`ValueHasher`] and [`NodeHasher`] where the node kind is tagged by setting 73 | /// or unsetting the MSB of the hash value. 74 | /// 75 | /// The binary hash wrapped by this structure must behave approximately like a random oracle over 76 | /// the space 2^256, i.e. all 256 bit outputs are valid and inputs are uniformly distributed. 77 | /// 78 | /// Functions like Sha2/Blake3/Keccak/Groestl all meet these criteria. 
79 | pub struct BinaryHasher(core::marker::PhantomData); 80 | 81 | impl ValueHasher for BinaryHasher { 82 | fn hash_value(value: &[u8]) -> [u8; 32] { 83 | H::hash(value) 84 | } 85 | } 86 | 87 | impl NodeHasher for BinaryHasher { 88 | fn hash_leaf(data: &LeafData) -> [u8; 32] { 89 | let mut h = H::hash2_32_concat(&data.key_path, &data.value_hash); 90 | set_msb(&mut h); 91 | h 92 | } 93 | 94 | fn hash_internal(data: &InternalData) -> [u8; 32] { 95 | let mut h = H::hash2_32_concat(&data.left, &data.right); 96 | unset_msb(&mut h); 97 | h 98 | } 99 | 100 | fn node_kind(node: &Node) -> NodeKind { 101 | node_kind_by_msb(node) 102 | } 103 | } 104 | 105 | /// Blanket implementation for all implementations of `Digest` 106 | impl + Send + Sync> BinaryHash for H { 107 | fn hash(input: &[u8]) -> [u8; 32] { 108 | H::digest(input).into() 109 | } 110 | } 111 | 112 | #[cfg(any(feature = "blake3-hasher", test))] 113 | pub use blake3::Blake3Hasher; 114 | 115 | /// A node hasher making use of blake3. 116 | #[cfg(any(feature = "blake3-hasher", test))] 117 | pub mod blake3 { 118 | use super::{BinaryHash, BinaryHasher}; 119 | 120 | /// A [`BinaryHash`] implementation for Blake3. 121 | pub struct Blake3BinaryHasher; 122 | 123 | /// A wrapper around Blake3 for use in NOMT. 124 | pub type Blake3Hasher = BinaryHasher; 125 | 126 | impl BinaryHash for Blake3BinaryHasher { 127 | fn hash(value: &[u8]) -> [u8; 32] { 128 | blake3::hash(value).into() 129 | } 130 | 131 | fn hash2_32_concat(left: &[u8; 32], right: &[u8; 32]) -> [u8; 32] { 132 | let mut hasher = blake3::Hasher::new(); 133 | hasher.update(left); 134 | hasher.update(right); 135 | hasher.finalize().into() 136 | } 137 | } 138 | } 139 | 140 | #[cfg(feature = "sha2-hasher")] 141 | pub use sha2::Sha2Hasher; 142 | 143 | /// A node and value hasher making use of sha2-256. 144 | #[cfg(feature = "sha2-hasher")] 145 | pub mod sha2 { 146 | use super::{BinaryHash, BinaryHasher}; 147 | use sha2::{Digest, Sha256}; 148 | 149 | /// A [`BinaryHash`] implementation for Sha2. 150 | pub struct Sha2BinaryHasher; 151 | 152 | /// A wrapper around sha2-256 for use in NOMT. 153 | pub type Sha2Hasher = BinaryHasher; 154 | 155 | impl BinaryHash for Sha2BinaryHasher { 156 | fn hash(value: &[u8]) -> [u8; 32] { 157 | let mut hasher = Sha256::new(); 158 | hasher.update(value); 159 | hasher.finalize().into() 160 | } 161 | 162 | fn hash2_32_concat(left: &[u8; 32], right: &[u8; 32]) -> [u8; 32] { 163 | let mut hasher = Sha256::new(); 164 | hasher.update(left); 165 | hasher.update(right); 166 | hasher.finalize().into() 167 | } 168 | } 169 | } 170 | -------------------------------------------------------------------------------- /core/src/lib.rs: -------------------------------------------------------------------------------- 1 | //! Core operations and types within the Nearly Optimal Merkle Trie. 2 | //! 3 | //! This crate defines the schema and basic operations over the merkle trie in a backend-agnostic 4 | //! manner. 5 | //! 6 | //! The core types and proof verification routines of this crate do not require the 7 | //! standard library, but do require Rust's alloc crate. 
8 | 9 | #![cfg_attr(all(not(feature = "std"), not(test)), no_std)] 10 | 11 | extern crate alloc; 12 | 13 | pub mod hasher; 14 | pub mod page; 15 | pub mod page_id; 16 | pub mod proof; 17 | pub mod trie; 18 | pub mod trie_pos; 19 | pub mod update; 20 | pub mod witness; 21 | -------------------------------------------------------------------------------- /core/src/page.rs: -------------------------------------------------------------------------------- 1 | //! Pages: efficient node storage. 2 | //! 3 | //! Because each node in the trie is exactly 32 bytes, we can easily pack groups of nodes into 4 | //! a predictable paged representation regardless of the information in the trie. 5 | //! 6 | //! Each page is 4096 bytes and stores up to 126 nodes plus a unique 32-byte page identifier, 7 | //! with 32 bytes left over. 8 | //! 9 | //! A page stores a rootless sub-tree with depth 6: that is, it stores up to 10 | //! 2 + 4 + 8 + 16 + 32 + 64 nodes at known positions. 11 | //! Semantically, all nodes within the page should descend from the layer above, and the 12 | //! top two nodes are expected to be siblings. Each page logically has up to 64 child pages, which 13 | //! correspond to the rootless sub-tree descending from each of the 64 child nodes on the bottom 14 | //! layer. 15 | //! 16 | //! Every page is referred to by a unique ID, given by `parent_id * 2^6 + child_index + 1`, where 17 | //! the root page has ID `0x00..00`. The child index ranges from 0 to 63 and therefore can be 18 | //! represented as a 6 bit string. This module exposes functions for manipulating page IDs. 19 | //! 20 | //! The [`RawPage`] structure wraps a borrowed slice of 32-byte data and treats it as a page. 21 | 22 | /// Depth of the rootless sub-binary tree stored in a page 23 | pub const DEPTH: usize = 6; 24 | 25 | // Total number of nodes stored in one Page. It depends on the `DEPTH` 26 | // of the rootless sub-binary tree stored in a page following this formula: 27 | // (2^(DEPTH + 1)) - 2 28 | pub const NODES_PER_PAGE: usize = (1 << DEPTH + 1) - 2; 29 | 30 | /// A raw, unsized page data slice. 31 | pub type RawPage = [[u8; 32]]; 32 | -------------------------------------------------------------------------------- /core/src/proof/mod.rs: -------------------------------------------------------------------------------- 1 | //! Trie proofs and proof verification. 2 | //! 3 | //! The Merkle Trie defined in NOMT is an authenticated data structure, which means that it permits 4 | //! efficient proving against the root. This module exposes types and functions necessary for 5 | //! handling these kinds of proofs. 6 | //! 7 | //! Using the types and functions exposed from this module, you can verify the value of a single 8 | //! key within the trie ([`PathProof`]), the values of multiple keys ([`MultiProof`]), or the result 9 | //! of updating a trie with a set of changes ([`verify_update`]). 10 | 11 | pub use multi_proof::{ 12 | verify as verify_multi_proof, verify_update as verify_multi_proof_update, MultiPathProof, 13 | MultiProof, MultiProofVerificationError, VerifiedMultiProof, 14 | }; 15 | pub use path_proof::{ 16 | verify_update, KeyOutOfScope, PathProof, PathProofTerminal, PathProofVerificationError, 17 | PathUpdate, VerifiedPathProof, VerifyUpdateError, 18 | }; 19 | 20 | mod multi_proof; 21 | mod path_proof; 22 | -------------------------------------------------------------------------------- /core/src/trie.rs: -------------------------------------------------------------------------------- 1 | //! 
This module defines the types of a binary merkle trie, generalized over a 256 bit hash function. 2 | //! All lookup paths in the trie are 256 bits. 3 | //! 4 | //! All nodes are 256 bits. There are three kinds of nodes. 5 | //! 1. Internal nodes, which each have two children. The value of an internal node is 6 | //! given by hashing the concatenation of the two child nodes and setting the MSB to 0. 7 | //! 2. Leaf nodes, which have zero children. The value of a leaf node is given by hashing 8 | //! the concatenation of the 256-bit lookup path and the hash of the value stored at the leaf, 9 | //! and setting the MSB to 1. 10 | //! 3. [`TERMINATOR`] nodes, which have the special value of all 0s. These nodes have no children 11 | //! and serve as a stand-in for an empty sub-trie at any height. Terminator nodes enable the 12 | //! trie to be tractably represented. 13 | //! 14 | //! All node preimages are 512 bits. 15 | 16 | use crate::hasher::NodeHasher; 17 | 18 | /// A node in the binary trie. In this schema, it is always 256 bits and is the hash of either 19 | /// a [`LeafData`] or [`InternalData`], or zeroed if it's a [`TERMINATOR`]. 20 | /// 21 | /// [`Node`]s are labeled by the [`NodeHasher`] used to indicate whether they are leaves or internal 22 | /// nodes. Typically, this is done by setting the MSB. 23 | pub type Node = [u8; 32]; 24 | 25 | /// The path to a key. All paths have a 256 bit fixed length. 26 | pub type KeyPath = [u8; 32]; 27 | 28 | /// The hash of a value. In this schema, it is always 256 bits. 29 | pub type ValueHash = [u8; 32]; 30 | 31 | /// The terminator hash is a special node hash value denoting an empty sub-tree. 32 | /// Concretely, when this appears at a given location in the trie, 33 | /// it implies that no key with a path beginning with the location has a value. 34 | /// 35 | /// This value may appear at any height. 36 | pub const TERMINATOR: Node = [0u8; 32]; 37 | 38 | /// Whether the node hash indicates the node is a leaf. 39 | pub fn is_leaf<H: NodeHasher>(hash: &Node) -> bool { 40 | H::node_kind(hash) == NodeKind::Leaf 41 | } 42 | 43 | /// Whether the node hash indicates the node is an internal node. 44 | pub fn is_internal<H: NodeHasher>(hash: &Node) -> bool { 45 | H::node_kind(hash) == NodeKind::Internal 46 | } 47 | 48 | /// Whether the node holds the special [`TERMINATOR`] value. 49 | pub fn is_terminator<H: NodeHasher>(hash: &Node) -> bool { 50 | H::node_kind(hash) == NodeKind::Terminator 51 | } 52 | 53 | /// The kind of a node. 54 | #[derive(Debug, Clone, Copy, PartialEq, Eq)] 55 | pub enum NodeKind { 56 | /// A terminator node indicates an empty sub-trie. 57 | Terminator, 58 | /// A leaf node indicates a sub-trie with a single leaf. 59 | Leaf, 60 | /// An internal node indicates at least two values. 61 | Internal, 62 | } 63 | 64 | impl NodeKind { 65 | /// Get the kind of the provided node. 66 | pub fn of<H: NodeHasher>(node: &Node) -> Self { 67 | H::node_kind(node) 68 | } 69 | } 70 | 71 | /// The data of an internal (branch) node. 72 | #[derive(Debug, Clone, PartialEq, Eq)] 73 | pub struct InternalData { 74 | /// The hash of the left child of this node. 75 | pub left: Node, 76 | /// The hash of the right child of this node. 77 | pub right: Node, 78 | } 79 | 80 | /// The data of a leaf node.
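// A minimal sketch (illustrative only; assumes the `blake3-hasher` feature and made-up
// key/value hashes) of how the labeling rules above surface through a `NodeHasher`,
// using the `LeafData` type defined just below:
//
//     use crate::hasher::{Blake3Hasher, NodeHasher};
//     let leaf = LeafData { key_path: [1u8; 32], value_hash: [2u8; 32] };
//     let node = Blake3Hasher::hash_leaf(&leaf);
//     assert!(is_leaf::<Blake3Hasher>(&node));
//     assert!(!is_terminator::<Blake3Hasher>(&node));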
81 | #[derive(Debug, Default, Clone, PartialEq, Eq)] 82 | #[cfg_attr( 83 | feature = "borsh", 84 | derive(borsh::BorshDeserialize, borsh::BorshSerialize) 85 | )] 86 | #[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))] 87 | pub struct LeafData { 88 | /// The total path to this value within the trie. 89 | /// 90 | /// The actual location of this node may be anywhere along this path, depending on the other 91 | /// data within the trie. 92 | pub key_path: KeyPath, 93 | /// The hash of the value carried in this leaf. 94 | pub value_hash: ValueHash, 95 | } 96 | -------------------------------------------------------------------------------- /core/src/witness.rs: -------------------------------------------------------------------------------- 1 | //! Witnesses of NOMT sessions. These types encapsulate entire sets of reads and writes. 2 | 3 | use crate::{ 4 | proof::PathProof, 5 | trie::{KeyPath, ValueHash}, 6 | trie_pos::TriePosition, 7 | }; 8 | 9 | #[cfg(not(feature = "std"))] 10 | use alloc::vec::Vec; 11 | 12 | /// A witness that can be used to prove the correctness of state trie retrievals and updates. 13 | /// 14 | /// Expected to be serializable. 15 | #[cfg_attr( 16 | feature = "borsh", 17 | derive(borsh::BorshDeserialize, borsh::BorshSerialize) 18 | )] 19 | #[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))] 20 | pub struct Witness { 21 | /// Various paths down the trie used as part of this witness. 22 | /// Note that the paths are not necessarily in lexicographic order. 23 | pub path_proofs: Vec<WitnessedPath>, 24 | /// The operations witnessed by the paths. 25 | pub operations: WitnessedOperations, 26 | } 27 | 28 | /// Operations provable by a corresponding witness. 29 | #[cfg_attr( 30 | feature = "borsh", 31 | derive(borsh::BorshDeserialize, borsh::BorshSerialize) 32 | )] 33 | #[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))] 34 | pub struct WitnessedOperations { 35 | /// Read operations. 36 | pub reads: Vec<WitnessedRead>, 37 | /// Write operations. 38 | pub writes: Vec<WitnessedWrite>, 39 | } 40 | 41 | /// A path observed in the witness. 42 | #[cfg_attr( 43 | feature = "borsh", 44 | derive(borsh::BorshDeserialize, borsh::BorshSerialize) 45 | )] 46 | #[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))] 47 | pub struct WitnessedPath { 48 | /// Proof of a query path along the trie. 49 | pub inner: PathProof, 50 | /// The query path itself. 51 | pub path: TriePosition, 52 | } 53 | 54 | /// A witness of a read value. 55 | #[cfg_attr( 56 | feature = "borsh", 57 | derive(borsh::BorshDeserialize, borsh::BorshSerialize) 58 | )] 59 | #[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))] 60 | pub struct WitnessedRead { 61 | /// The key of the read value. 62 | pub key: KeyPath, 63 | /// The hash of the value witnessed. None means no value. 64 | pub value: Option<ValueHash>, 65 | /// The index of the path in the corresponding witness. 66 | pub path_index: usize, 67 | } 68 | 69 | /// A witness of a write operation. 70 | #[cfg_attr( 71 | feature = "borsh", 72 | derive(borsh::BorshDeserialize, borsh::BorshSerialize) 73 | )] 74 | #[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))] 75 | pub struct WitnessedWrite { 76 | /// The key of the written value. 77 | pub key: KeyPath, 78 | /// The hash of the written value. `None` means "delete". 79 | pub value: Option<ValueHash>, 80 | /// The index of the path in the corresponding witness.
81 | pub path_index: usize, 82 | } 83 | -------------------------------------------------------------------------------- /docs/CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | ## Contribute to NOMT 2 | 3 | We license all code under MIT / Apache2.0 licenses. The maintainers reserve the right to refuse contributions and reject issues, even when useful. 4 | 5 | ## Formatting 6 | 7 | We use spaces for indentation and adhere to the vanilla `rustfmt` style. 8 | 9 | Format your code using `rustfmt`: 10 | 1. `cargo install cargo-fmt` 11 | 2. `cargo fmt --all` 12 | 13 | ## Documentation Policy 14 | 15 | Well-commented code is readable code. We require all `pub` and `pub(crate)` items to be annotated with doc-strings. This leads to much better auto-generated documentation pages using `rustdoc` and a better experience for library users. 16 | 17 | Public modules and crates should begin with doc-strings which explain the purpose of the module and crate and assist the reader in determining where to proceed. 18 | 19 | ## Pull Requests and Tests 20 | 21 | We require that the entire test-suite passes for every merged PR. A PR is the responsibility of the author. In submitting a PR, you are consenting to become responsible for it and continually improve, update, and request reviews for it until merged. Stale PRs are not the responsibility of the maintainers and may be closed. 22 | 23 | ## Code of Conduct 24 | 25 | We ask that all contributors maintain a respectful attitude towards each other. -------------------------------------------------------------------------------- /docs/images/binary_merkle_patricia_tree.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thrumdev/nomt/7f6db113d7bb081e27c4d5bf57df20943280b0f9/docs/images/binary_merkle_patricia_tree.png -------------------------------------------------------------------------------- /docs/images/nomt_number_rule.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thrumdev/nomt/7f6db113d7bb081e27c4d5bf57df20943280b0f9/docs/images/nomt_number_rule.png -------------------------------------------------------------------------------- /docs/images/nomt_pages.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thrumdev/nomt/7f6db113d7bb081e27c4d5bf57df20943280b0f9/docs/images/nomt_pages.jpg -------------------------------------------------------------------------------- /docs/images/nomt_put.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thrumdev/nomt/7f6db113d7bb081e27c4d5bf57df20943280b0f9/docs/images/nomt_put.png -------------------------------------------------------------------------------- /examples/commit_batch/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "commit-batch" 3 | version = "0.1.0" 4 | authors.workspace = true 5 | homepage.workspace = true 6 | repository.workspace = true 7 | edition.workspace = true 8 | license.workspace = true 9 | 10 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html 11 | 12 | [dependencies] 13 | nomt = { path = "../../nomt" } 14 | anyhow = "1.0.81" 15 | sha2 = "0.10.6" 16 | -------------------------------------------------------------------------------- 
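The witness types above (`Witness`, `WitnessedRead`, `WitnessedWrite`) derive Borsh and Serde traits behind feature flags, so a witness produced by a session can be shipped to an out-of-process verifier. A minimal round-trip sketch, assuming the `borsh` feature of `nomt-core` is enabled and a borsh 1.x dependency is available:

    use nomt_core::witness::Witness;

    fn roundtrip(witness: &Witness) -> std::io::Result<Witness> {
        // Serialize for transport to the verifier...
        let bytes = borsh::to_vec(witness)?;
        // ...and decode it again on the other side.
        borsh::from_slice::<Witness>(&bytes)
    }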
/examples/commit_batch/src/lib.rs: -------------------------------------------------------------------------------- 1 | use anyhow::Result; 2 | use nomt::{ 3 | hasher::Blake3Hasher, KeyReadWrite, Nomt, Options, Root, SessionParams, Witness, WitnessMode, 4 | }; 5 | use sha2::Digest; 6 | 7 | const NOMT_DB_FOLDER: &str = "nomt_db"; 8 | 9 | pub struct NomtDB; 10 | 11 | impl NomtDB { 12 | pub fn commit_batch() -> Result<(Root, Root, Witness)> { 13 | // Define the options used to open NOMT 14 | let mut opts = Options::new(); 15 | opts.path(NOMT_DB_FOLDER); 16 | opts.commit_concurrency(1); 17 | 18 | // Open the NOMT database; it will create the folder if it does not exist 19 | let nomt = Nomt::<Blake3Hasher>::open(opts)?; 20 | 21 | // Create a new Session object 22 | // 23 | // During a session, the backend is responsible for returning read keys 24 | // and receiving hints about future writes 25 | // 26 | // Writes do not occur immediately; instead, 27 | // they are cached and applied all at once later on 28 | let session = 29 | nomt.begin_session(SessionParams::default().witness_mode(WitnessMode::read_write())); 30 | 31 | // Here we will move the data saved under b"key1" to b"key2", deleting it from b"key1" 32 | // 33 | // NOMT expects keys to be uniformly distributed across the key space 34 | let key_path_1 = sha2::Sha256::digest(b"key1").into(); 35 | let key_path_2 = sha2::Sha256::digest(b"key2").into(); 36 | 37 | // First, read what is under key_path_1 38 | // 39 | // `read` will immediately return the value present in the database 40 | let value = session.read(key_path_1)?; 41 | 42 | // We are going to perform writes on both key-paths, so we have NOMT warm up the on-disk 43 | // data for both. 44 | session.warm_up(key_path_1); 45 | session.warm_up(key_path_2); 46 | 47 | // Retrieve the previous value of the root before committing changes 48 | let prev_root = nomt.root(); 49 | 50 | // To commit the batch to the backend we need to collect every 51 | // performed action into a vector where items are ordered by the key_path 52 | let mut actual_access: Vec<_> = vec![ 53 | (key_path_1, KeyReadWrite::ReadThenWrite(value.clone(), None)), 54 | (key_path_2, KeyReadWrite::Write(value)), 55 | ]; 56 | actual_access.sort_by_key(|(k, _)| *k); 57 | 58 | // The final step in handling a session involves committing all changes 59 | // to update the trie structure and obtaining the new root of the trie, 60 | // along with a witness and the witnessed operations. 61 | let mut finished = session.finish(actual_access).unwrap(); 62 | 63 | // This field is set because the finished session was configured with 64 | // `WitnessMode::read_write`.
65 | let witness = finished.take_witness().unwrap(); 66 | let root = finished.root(); 67 | finished.commit(&nomt)?; 68 | 69 | Ok((prev_root, root, witness)) 70 | } 71 | } 72 | -------------------------------------------------------------------------------- /examples/commit_batch/src/main.rs: -------------------------------------------------------------------------------- 1 | fn main() -> anyhow::Result<()> { 2 | commit_batch::NomtDB::commit_batch().map(|_| ()) 3 | } 4 | -------------------------------------------------------------------------------- /examples/read_value/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "read_value" 3 | version = "0.1.0" 4 | authors.workspace = true 5 | homepage.workspace = true 6 | repository.workspace = true 7 | edition.workspace = true 8 | license.workspace = true 9 | 10 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html 11 | 12 | [dependencies] 13 | nomt = { path = "../../nomt" } 14 | anyhow = "1.0.81" 15 | sha2 = "0.10.6" 16 | -------------------------------------------------------------------------------- /examples/read_value/src/main.rs: -------------------------------------------------------------------------------- 1 | use anyhow::Result; 2 | use nomt::{hasher::Blake3Hasher, KeyReadWrite, Nomt, Options, SessionParams, WitnessMode}; 3 | use sha2::Digest; 4 | 5 | const NOMT_DB_FOLDER: &str = "nomt_db"; 6 | 7 | fn main() -> Result<()> { 8 | // Define the options used to open NOMT 9 | let mut opts = Options::new(); 10 | opts.path(NOMT_DB_FOLDER); 11 | opts.commit_concurrency(1); 12 | 13 | // Open the NOMT database. This will create the folder if it does not exist 14 | let nomt = Nomt::<Blake3Hasher>::open(opts)?; 15 | 16 | // Instantiate a new Session object to handle read and write operations 17 | // and generate a Witness later on 18 | let session = 19 | nomt.begin_session(SessionParams::default().witness_mode(WitnessMode::read_write())); 20 | 21 | // Reading a key from the database 22 | let key_path = sha2::Sha256::digest(b"key").into(); 23 | let value = session.read(key_path)?; 24 | 25 | // Even though this key is only being read, we ask NOMT to warm up the on-disk data because 26 | // we will prove the read.
27 | session.warm_up(key_path); 28 | 29 | let mut finished = session 30 | .finish(vec![(key_path, KeyReadWrite::Read(value))]) 31 | .unwrap(); 32 | let _witness = finished.take_witness(); 33 | finished.commit(&nomt)?; 34 | 35 | Ok(()) 36 | } 37 | -------------------------------------------------------------------------------- /examples/witness_verification/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "witness_verification" 3 | version = "0.1.0" 4 | authors.workspace = true 5 | homepage.workspace = true 6 | repository.workspace = true 7 | edition.workspace = true 8 | license.workspace = true 9 | 10 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html 11 | 12 | [dependencies] 13 | nomt-core = { path = "../../core" } 14 | commit-batch = { path = "../commit_batch" } 15 | anyhow = "1.0.81" 16 | blake3 = "1.5.1" 17 | -------------------------------------------------------------------------------- /examples/witness_verification/src/main.rs: -------------------------------------------------------------------------------- 1 | use anyhow::Result; 2 | use nomt_core::{hasher::Blake3Hasher, proof, trie::LeafData}; 3 | 4 | fn main() -> Result<()> { 5 | // The witness produced in the example `commit_batch` will be used 6 | let (prev_root, new_root, witness) = commit_batch::NomtDB::commit_batch().unwrap(); 7 | 8 | let mut updates = Vec::new(); 9 | 10 | // A witness is composed of multiple WitnessedPath objects, 11 | // which store all the necessary information to verify the operations 12 | // performed on the same path 13 | for (i, witnessed_path) in witness.path_proofs.iter().enumerate() { 14 | // Constructing the verified operations 15 | let verified = witnessed_path 16 | .inner 17 | .verify::<Blake3Hasher>(&witnessed_path.path.path(), prev_root.into_inner()) 18 | .unwrap(); 19 | 20 | // Among all read operations performed, the ones that interact 21 | // with the current verified path are selected 22 | // 23 | // Each witnessed operation contains an index to the path it needs to be verified against 24 | // 25 | // This information could already be known if we committed the batch initially, 26 | // and thus, the witnessed field could be discarded entirely. 27 | for read in witness 28 | .operations 29 | .reads 30 | .iter() 31 | .skip_while(|r| r.path_index != i) 32 | .take_while(|r| r.path_index == i) 33 | { 34 | match read.value { 35 | // Check for non-existence if the return value was None 36 | None => assert!(verified.confirm_nonexistence(&read.key).unwrap()), 37 | // Verify the correctness of the returned value when it is Some(_) 38 | Some(value_hash) => { 39 | let leaf = LeafData { 40 | key_path: read.key, 41 | value_hash, 42 | }; 43 | assert!(verified.confirm_value(&leaf).unwrap()); 44 | } 45 | } 46 | } 47 | 48 | // The correctness of write operations cannot be verified one by one like reads. 49 | // Write operations need to be collected. 50 | // All writes that have worked on shared prefixes, 51 | // such as the witnessed_path, need to be bundled together.
52 | // Later, it needs to be verified that all these writes bring 53 | // the new trie to the expected state 54 | let mut write_ops = Vec::new(); 55 | for write in witness 56 | .operations 57 | .writes 58 | .iter() 59 | .skip_while(|r| r.path_index != i) 60 | .take_while(|r| r.path_index == i) 61 | { 62 | write_ops.push((write.key, write.value)); 63 | } 64 | 65 | if !write_ops.is_empty() { 66 | updates.push(proof::PathUpdate { 67 | inner: verified, 68 | ops: write_ops, 69 | }); 70 | } 71 | } 72 | 73 | assert_eq!( 74 | proof::verify_update::(prev_root.into_inner(), &updates).unwrap(), 75 | new_root.into_inner(), 76 | ); 77 | 78 | Ok(()) 79 | } 80 | -------------------------------------------------------------------------------- /fuzz/.gitignore: -------------------------------------------------------------------------------- 1 | target 2 | corpus 3 | artifacts 4 | coverage 5 | -------------------------------------------------------------------------------- /fuzz/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "nomt-fuzz" 3 | version = "0.0.0" 4 | publish = false 5 | edition = "2021" 6 | 7 | [package.metadata] 8 | cargo-fuzz = true 9 | 10 | [dependencies] 11 | libfuzzer-sys = "0.4" 12 | arbitrary = { version = "1.3.1", features = ["derive"] } 13 | tempfile.workspace = true 14 | bitvec.workspace = true 15 | 16 | [dependencies.nomt] 17 | path = "../nomt" 18 | features = ["fuzz"] 19 | 20 | [[bin]] 21 | name = "api_surface" 22 | path = "fuzz_targets/api_surface.rs" 23 | test = false 24 | doc = false 25 | bench = false 26 | 27 | [[bin]] 28 | name = "bitwise_memcpy" 29 | path = "fuzz_targets/bitwise_memcpy.rs" 30 | test = false 31 | doc = false 32 | bench = false 33 | 34 | [[bin]] 35 | name = "separate" 36 | path = "fuzz_targets/separate.rs" 37 | test = false 38 | doc = false 39 | bench = false 40 | 41 | [[bin]] 42 | name = "prefix_len" 43 | path = "fuzz_targets/prefix_len.rs" 44 | test = false 45 | doc = false 46 | bench = false 47 | 48 | [[bin]] 49 | name = "separator_len" 50 | path = "fuzz_targets/separator_len.rs" 51 | test = false 52 | doc = false 53 | bench = false 54 | 55 | [[bin]] 56 | name = "reconstruct_key" 57 | path = "fuzz_targets/reconstruct_key.rs" 58 | test = false 59 | doc = false 60 | bench = false 61 | -------------------------------------------------------------------------------- /fuzz/fuzz_targets/bitwise_memcpy.rs: -------------------------------------------------------------------------------- 1 | #![no_main] 2 | 3 | use arbitrary::Arbitrary; 4 | use bitvec::{order::Msb0, view::BitView}; 5 | use libfuzzer_sys::fuzz_target; 6 | use nomt::beatree::bitwise_memcpy; 7 | 8 | const MAX_BYTES_LEN: usize = 1 << 12; // 4KiB 9 | 10 | fuzz_target!(|run: Run| { 11 | let Run { 12 | source, 13 | mut destination, 14 | } = run; 15 | 16 | let expected = reference_bitwise_memcpy(&source, &destination); 17 | 18 | bitwise_memcpy( 19 | &mut destination.bytes, 20 | destination.bit_start, 21 | &source.bytes, 22 | source.bit_start, 23 | source.bit_len, 24 | ); 25 | 26 | assert_eq!(expected, destination.bytes); 27 | }); 28 | 29 | #[derive(Debug)] 30 | struct Run { 31 | source: Source, 32 | destination: Destination, 33 | } 34 | 35 | #[derive(Debug)] 36 | struct Source { 37 | bit_start: usize, 38 | bit_len: usize, 39 | bytes: Vec, 40 | } 41 | 42 | #[derive(Debug)] 43 | struct Destination { 44 | bit_start: usize, 45 | bytes: Vec, 46 | } 47 | 48 | impl<'a> Arbitrary<'a> for Run { 49 | fn arbitrary(input: &mut 
arbitrary::Unstructured<'a>) -> arbitrary::Result { 50 | let source = Source::arbitrary(input)?; 51 | 52 | // Destination must be long enough to store the source. 53 | let destination_bit_start = input.int_in_range(0..=7)?; 54 | let min_destination_len = (destination_bit_start + source.bit_len + 7) / 8; 55 | let destination_len = input.int_in_range(min_destination_len..=MAX_BYTES_LEN)?; 56 | let mut destination_bytes = vec![0; destination_len]; 57 | input.fill_buffer(&mut destination_bytes)?; 58 | 59 | let run = Run { 60 | source, 61 | destination: Destination { 62 | bit_start: destination_bit_start, 63 | bytes: destination_bytes, 64 | }, 65 | }; 66 | 67 | Ok(run) 68 | } 69 | } 70 | 71 | impl<'a> Arbitrary<'a> for Source { 72 | fn arbitrary(input: &mut arbitrary::Unstructured<'a>) -> arbitrary::Result { 73 | let bytes_len = (input.int_in_range(0..=MAX_BYTES_LEN)? as usize).next_multiple_of(8); 74 | 75 | let mut bytes: Vec = vec![0; bytes_len]; 76 | input.fill_buffer(&mut bytes)?; 77 | 78 | let bit_start = if bytes_len != 0 { 79 | input.int_in_range(0..=7)? 80 | } else { 81 | 0 82 | }; 83 | 84 | let bit_len = if bytes_len > 0 { 85 | // `bitwise_memcpy` requires to the source length to be the smallest length, 86 | // multiple of 8 bytes that the contain the source bits. 87 | let min_bit_len = ((bytes_len - 8) * 8).saturating_sub(bit_start) + 1; 88 | let max_bit_len = (bytes_len * 8) - bit_start; 89 | input.int_in_range(min_bit_len..=max_bit_len)? 90 | } else { 91 | 0 92 | }; 93 | 94 | Ok(Self { 95 | bit_start, 96 | bit_len, 97 | bytes, 98 | }) 99 | } 100 | } 101 | 102 | fn reference_bitwise_memcpy(source: &Source, destination: &Destination) -> Vec { 103 | let mut destination_bytes = destination.bytes.clone(); 104 | 105 | destination_bytes.view_bits_mut::()[destination.bit_start..][..source.bit_len] 106 | .copy_from_bitslice( 107 | &source.bytes.view_bits::()[source.bit_start..][..source.bit_len], 108 | ); 109 | 110 | destination_bytes 111 | } 112 | -------------------------------------------------------------------------------- /fuzz/fuzz_targets/common/mod.rs: -------------------------------------------------------------------------------- 1 | use arbitrary::Arbitrary; 2 | use bitvec::{order::Msb0, view::BitView}; 3 | 4 | #[derive(Debug)] 5 | pub struct Run { 6 | pub prefix_bit_len: usize, 7 | pub a: [u8; 32], 8 | pub b: [u8; 32], 9 | } 10 | 11 | impl<'a> Arbitrary<'a> for Run { 12 | fn arbitrary(input: &mut arbitrary::Unstructured<'a>) -> arbitrary::Result { 13 | let prefix_bit_len = input.int_in_range(0..=255)?; 14 | let mut a = [0; 32]; 15 | let mut b = [0; 32]; 16 | input.fill_buffer(&mut a)?; 17 | input.fill_buffer(&mut b)?; 18 | b.view_bits_mut::()[0..prefix_bit_len] 19 | .copy_from_bitslice(&a.view_bits::()[0..prefix_bit_len]); 20 | 21 | let effective_prefix_bit_len = a 22 | .view_bits::() 23 | .iter() 24 | .zip(b.view_bits::().iter()) 25 | .take_while(|(a, b)| a == b) 26 | .count(); 27 | 28 | if effective_prefix_bit_len != prefix_bit_len { 29 | Err(arbitrary::Error::IncorrectFormat) 30 | } else { 31 | Ok(Self { 32 | prefix_bit_len, 33 | a, 34 | b, 35 | }) 36 | } 37 | } 38 | } 39 | -------------------------------------------------------------------------------- /fuzz/fuzz_targets/prefix_len.rs: -------------------------------------------------------------------------------- 1 | #![no_main] 2 | 3 | mod common; 4 | 5 | use common::Run; 6 | use libfuzzer_sys::fuzz_target; 7 | use nomt::beatree::prefix_len; 8 | 9 | fuzz_target!(|run: Run| { 10 | let Run { 11 | prefix_bit_len, 12 | a, 13 
| b, 14 | } = run; 15 | 16 | assert_eq!(prefix_bit_len, prefix_len(&a, &b)); 17 | }); 18 | -------------------------------------------------------------------------------- /fuzz/fuzz_targets/reconstruct_key.rs: -------------------------------------------------------------------------------- 1 | #![no_main] 2 | 3 | use arbitrary::Arbitrary; 4 | use bitvec::{order::Msb0, view::BitView}; 5 | use libfuzzer_sys::fuzz_target; 6 | use nomt::beatree::reconstruct_key; 7 | 8 | fuzz_target!(|run: Run| { 9 | let Run { 10 | raw_separator, 11 | raw_prefix, 12 | } = run; 13 | 14 | let expected = reference_reconstruct_key(&raw_prefix, &raw_separator); 15 | 16 | let maybe_prefix = if raw_prefix.bit_len == 0 { 17 | None 18 | } else { 19 | Some((&raw_prefix.bytes[..], raw_prefix.bit_len)) 20 | }; 21 | 22 | let raw_separator = ( 23 | &raw_separator.bytes[..], 24 | raw_separator.bit_start, 25 | raw_separator.bit_len, 26 | ); 27 | 28 | assert_eq!(expected, reconstruct_key(maybe_prefix, raw_separator)); 29 | }); 30 | 31 | #[derive(Debug)] 32 | struct Run { 33 | raw_separator: RawSeparator, 34 | raw_prefix: RawPrefix, 35 | } 36 | 37 | #[derive(Debug)] 38 | struct RawSeparator { 39 | bit_start: usize, 40 | bit_len: usize, 41 | bytes: Vec, 42 | } 43 | 44 | #[derive(Debug)] 45 | struct RawPrefix { 46 | bit_len: usize, 47 | bytes: Vec, 48 | } 49 | 50 | impl<'a> Arbitrary<'a> for Run { 51 | fn arbitrary(input: &mut arbitrary::Unstructured<'a>) -> arbitrary::Result { 52 | let raw_separator = RawSeparator::arbitrary(input)?; 53 | 54 | let raw_prefix_bit_len = input.int_in_range(0..=(256 - raw_separator.bit_len))?; 55 | let raw_prefix_min_byte_len = (raw_prefix_bit_len + 7) / 8; 56 | let raw_prefix_byte_len = input.int_in_range(raw_prefix_min_byte_len..=(1 << 12))?; 57 | let mut raw_prefix_bytes = vec![0; raw_prefix_byte_len]; 58 | input.fill_buffer(&mut raw_prefix_bytes)?; 59 | 60 | let run = Run { 61 | raw_separator, 62 | raw_prefix: RawPrefix { 63 | bit_len: raw_prefix_bit_len, 64 | bytes: raw_prefix_bytes, 65 | }, 66 | }; 67 | 68 | Ok(run) 69 | } 70 | } 71 | 72 | impl<'a> Arbitrary<'a> for RawSeparator { 73 | fn arbitrary(input: &mut arbitrary::Unstructured<'a>) -> arbitrary::Result { 74 | let bit_start = input.int_in_range(0..=7)?; 75 | 76 | let bit_len = input.int_in_range(0..=(256 - bit_start))?; 77 | 78 | let bytes_len = (((bit_start + bit_len + 7) / 8) as usize).next_multiple_of(8); 79 | let mut bytes: Vec = vec![0; bytes_len]; 80 | input.fill_buffer(&mut bytes)?; 81 | 82 | Ok(Self { 83 | bit_start, 84 | bit_len, 85 | bytes, 86 | }) 87 | } 88 | } 89 | 90 | fn reference_reconstruct_key(maybe_prefix: &RawPrefix, separator: &RawSeparator) -> [u8; 32] { 91 | let mut key = [0; 32]; 92 | 93 | let mut key_start_separator = 0; 94 | let RawPrefix { bit_len, bytes } = maybe_prefix; 95 | if *bit_len != 0 { 96 | key.view_bits_mut::()[..*bit_len] 97 | .copy_from_bitslice(&bytes.view_bits::()[..*bit_len]); 98 | key_start_separator = *bit_len; 99 | } 100 | 101 | let RawSeparator { 102 | bit_start, 103 | bit_len, 104 | bytes, 105 | } = separator; 106 | 107 | key.view_bits_mut::()[key_start_separator..][..*bit_len] 108 | .copy_from_bitslice(&bytes.view_bits::()[*bit_start..][..*bit_len]); 109 | 110 | key 111 | } 112 | -------------------------------------------------------------------------------- /fuzz/fuzz_targets/separate.rs: -------------------------------------------------------------------------------- 1 | #![no_main] 2 | 3 | mod common; 4 | 5 | use bitvec::{order::Msb0, view::BitView}; 6 | use common::Run; 7 | use 
libfuzzer_sys::fuzz_target; 8 | use nomt::beatree::separate; 9 | 10 | fuzz_target!(|run: Run| { 11 | let Run { 12 | prefix_bit_len, 13 | mut a, 14 | mut b, 15 | } = run; 16 | 17 | if a > b { 18 | std::mem::swap(&mut a, &mut b); 19 | } 20 | 21 | let mut expected = [0u8; 32]; 22 | expected.view_bits_mut::()[..prefix_bit_len + 1] 23 | .copy_from_bitslice(&b.view_bits::()[..prefix_bit_len + 1]); 24 | 25 | assert_eq!(expected, separate(&a, &b)); 26 | }); 27 | -------------------------------------------------------------------------------- /fuzz/fuzz_targets/separator_len.rs: -------------------------------------------------------------------------------- 1 | #![no_main] 2 | 3 | use arbitrary::Arbitrary; 4 | use bitvec::{order::Msb0, view::BitView}; 5 | use libfuzzer_sys::fuzz_target; 6 | 7 | fuzz_target!(|run: Run| { 8 | let Run { 9 | separator_len, 10 | separator, 11 | } = run; 12 | 13 | assert_eq!(separator_len, nomt::beatree::separator_len(&separator)); 14 | }); 15 | 16 | #[derive(Debug)] 17 | struct Run { 18 | separator_len: usize, 19 | separator: [u8; 32], 20 | } 21 | 22 | impl<'a> Arbitrary<'a> for Run { 23 | fn arbitrary(input: &mut arbitrary::Unstructured<'a>) -> arbitrary::Result { 24 | let mut separator_len = input.int_in_range(0..=255)?; 25 | let mut separator = [0; 32]; 26 | input.fill_buffer(&mut separator)?; 27 | separator.view_bits_mut::()[separator_len..].fill(false); 28 | 29 | if separator == [0u8; 32] { 30 | separator_len = 1; 31 | } else { 32 | let effective_separator_len = 256 - separator.view_bits::().trailing_zeros(); 33 | if separator_len != effective_separator_len { 34 | return Err(arbitrary::Error::IncorrectFormat); 35 | } 36 | }; 37 | 38 | Ok(Self { 39 | separator_len, 40 | separator, 41 | }) 42 | } 43 | } 44 | -------------------------------------------------------------------------------- /nomt/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "nomt" 3 | description = "Nearly Optimal Merkle Trie - Schema and Database" 4 | version = "1.0.0-preview" 5 | authors.workspace = true 6 | homepage.workspace = true 7 | repository.workspace = true 8 | edition.workspace = true 9 | license.workspace = true 10 | 11 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html 12 | 13 | [dependencies] 14 | anyhow.workspace = true 15 | nomt-core = { path = "../core", default-features = false, features = ["std"] } 16 | parking_lot.workspace = true 17 | threadpool.workspace = true 18 | bitvec.workspace = true 19 | twox-hash.workspace = true 20 | fxhash.workspace = true 21 | dashmap.workspace = true 22 | crossbeam.workspace = true 23 | crossbeam-channel.workspace = true 24 | slab.workspace = true 25 | rand.workspace = true 26 | ahash.workspace = true 27 | imbl.workspace = true 28 | lru.workspace = true 29 | libc.workspace = true 30 | criterion = { workspace = true, optional = true } 31 | thread_local.workspace = true 32 | cfg-if.workspace = true 33 | borsh = { workspace = true, optional = true } 34 | serde = { workspace = true, optional = true } 35 | 36 | [target.'cfg(target_os="linux")'.dependencies] 37 | io-uring.workspace = true 38 | 39 | [target.'cfg(loom)'.dependencies] 40 | loom.workspace = true 41 | 42 | [dev-dependencies] 43 | rand_pcg.workspace = true 44 | hex-literal.workspace = true 45 | tempfile.workspace = true 46 | criterion.workspace = true 47 | lazy_static.workspace = true 48 | hex.workspace = true 49 | quickcheck.workspace = true 50 | blake3.workspace = true 51 | 52 
| [lints.rust] 53 | unexpected_cfgs = { level = "warn", check-cfg = ['cfg(loom)'] } 54 | 55 | [[bench]] 56 | name = "beatree" 57 | harness = false 58 | 59 | [features] 60 | default = ["blake3-hasher", "sha2-hasher"] 61 | benchmarks = ["dep:criterion"] 62 | fuzz = [] 63 | borsh = ["dep:borsh", "nomt-core/borsh"] 64 | blake3-hasher = ["nomt-core/blake3-hasher"] 65 | sha2-hasher = ["nomt-core/sha2-hasher"] 66 | serde = ["dep:serde", "nomt-core/serde"] 67 | -------------------------------------------------------------------------------- /nomt/benches/beatree.rs: -------------------------------------------------------------------------------- 1 | #[cfg(feature = "benchmarks")] 2 | use criterion::{criterion_group, criterion_main}; 3 | #[cfg(feature = "benchmarks")] 4 | use nomt::beatree::benches::beatree_benchmark; 5 | 6 | #[cfg(feature = "benchmarks")] 7 | criterion_group!(benches, beatree_benchmark); 8 | #[cfg(feature = "benchmarks")] 9 | criterion_main!(benches); 10 | 11 | #[cfg(not(feature = "benchmarks"))] 12 | fn main() {} 13 | -------------------------------------------------------------------------------- /nomt/src/beatree/benches.rs: -------------------------------------------------------------------------------- 1 | #![cfg(feature = "benchmarks")] 2 | 3 | use crate::beatree::{ 4 | branch::node::benches::*, leaf::node::benches::*, ops::benches::*, ops::bit_ops::benches::*, 5 | Key, 6 | }; 7 | use rand::RngCore; 8 | 9 | pub fn beatree_benchmark(c: &mut criterion::Criterion) { 10 | separate_benchmark(c); 11 | separator_len_benchmark(c); 12 | prefix_len_benchmark(c); 13 | search_branch_benchmark(c); 14 | leaf_search_benchmark(c); 15 | reconstruct_key_benchmark(c); 16 | branch_builder_benchmark(c); 17 | leaf_builder_benchmark(c); 18 | } 19 | 20 | // returns two keys a and b where b > a and b shares the first n bits with a 21 | pub fn get_key_pair(shared_bytes: usize) -> (Key, Key) { 22 | let mut rand = rand::thread_rng(); 23 | let mut a = [0; 32]; 24 | rand.fill_bytes(&mut a[0..shared_bytes]); 25 | 26 | // b > a 27 | let mut b = a.clone(); 28 | b[shared_bytes] = 1; 29 | 30 | (a, b) 31 | } 32 | 33 | // Get a vector containing `n` random keys that share the first `shared_bytes` 34 | pub fn get_keys(shared_bytes: usize, n: usize) -> Vec { 35 | let mut rand = rand::thread_rng(); 36 | let mut prefix = [0; 32]; 37 | rand.fill_bytes(&mut prefix[0..shared_bytes]); 38 | 39 | let mut keys = vec![]; 40 | for _ in 0..n { 41 | let mut key = prefix.clone(); 42 | rand.fill_bytes(&mut key[shared_bytes..]); 43 | keys.push(key); 44 | } 45 | 46 | keys 47 | } 48 | -------------------------------------------------------------------------------- /nomt/src/beatree/branch/mod.rs: -------------------------------------------------------------------------------- 1 | pub use node::{body_size, BranchNode, BranchNodeBuilder, BranchNodeView, BRANCH_NODE_BODY_SIZE}; 2 | pub mod node; 3 | 4 | pub const BRANCH_NODE_SIZE: usize = 4096; 5 | -------------------------------------------------------------------------------- /nomt/src/beatree/index.rs: -------------------------------------------------------------------------------- 1 | //! In-memory index tracking bottom level branch nodes. This is an immutable data structure, 2 | //! which is cheaply cloneable in O(1) and performs COW operations. 
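// A minimal usage sketch (illustrative only; `branch` stands for an already-built
// `Arc<BranchNode>`):
//
//     let mut index = Index::default();
//     index.insert([0u8; 32], branch.clone());
//     // `lookup` returns the entry with the greatest separator <= the queried key.
//     let (separator, _node) = index.lookup([5u8; 32]).unwrap();
//     assert_eq!(separator, [0u8; 32]);
//     // Cloning is O(1) and shares structure with the original.
//     let _snapshot = index.clone();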
3 | 4 | use std::ops::{Bound, RangeBounds}; 5 | use std::sync::Arc; 6 | 7 | use imbl::OrdMap; 8 | 9 | use super::Key; 10 | use crate::beatree::branch::BranchNode; 11 | 12 | #[derive(Default, Clone)] 13 | pub struct Index { 14 | first_key_map: OrdMap<Key, Arc<BranchNode>>, 15 | } 16 | 17 | impl Index { 18 | /// Look up the branch that would store the given key. 19 | /// 20 | /// This is either a branch whose separator is exactly equal to this key or the branch with the 21 | /// highest separator less than the key. 22 | pub fn lookup(&self, key: Key) -> Option<(Key, Arc<BranchNode>)> { 23 | self.first_key_map 24 | .get_prev(&key) 25 | .map(|(sep, b)| (sep.clone(), b.clone())) 26 | } 27 | 28 | /// Get the first separator greater than the given key. 29 | pub fn next_key(&self, key: Key) -> Option<Key> { 30 | self.first_key_map 31 | .range(RangeFromExclusive { start: key }) 32 | .next() 33 | .map(|(k, _)| *k) 34 | } 35 | 36 | /// Remove the branch with the given separator key. 37 | pub fn remove(&mut self, separator: &Key) -> Option<Arc<BranchNode>> { 38 | self.first_key_map.remove(separator) 39 | } 40 | 41 | /// Insert a branch with the given separator key. 42 | pub fn insert(&mut self, separator: Key, branch: Arc<BranchNode>) -> Option<Arc<BranchNode>> { 43 | self.first_key_map.insert(separator, branch) 44 | } 45 | 46 | #[cfg(test)] 47 | pub fn into_iter(self) -> impl Iterator<Item = (Key, Arc<BranchNode>)> { 48 | self.first_key_map.into_iter() 49 | } 50 | } 51 | 52 | struct RangeFromExclusive { 53 | start: Key, 54 | } 55 | 56 | impl RangeBounds<Key> for RangeFromExclusive { 57 | fn start_bound(&self) -> Bound<&Key> { 58 | Bound::Excluded(&self.start) 59 | } 60 | 61 | fn end_bound(&self) -> Bound<&Key> { 62 | Bound::Unbounded 63 | } 64 | 65 | fn contains<U>(&self, item: &U) -> bool 66 | where 67 | U: PartialOrd<Key> + ?Sized, 68 | { 69 | item > &self.start 70 | } 71 | } 72 | -------------------------------------------------------------------------------- /nomt/src/beatree/leaf/mod.rs: -------------------------------------------------------------------------------- 1 | // The `LeafStore` struct manages leaves. It's responsible for management (allocation and 2 | // deallocation) and querying the LNs by their LNID. 3 | // 4 | // It maintains an in-memory copy of the freelist to facilitate the page management. The allocation 5 | // is performed in LIFO order. The allocations are performed in batches to amortize the IO for the 6 | // freelist and metadata updates (growing the file in case freelist is empty). 7 | // 8 | // The leaf store doesn't perform caching. When queried, the leaf store returns a handle to a page. 9 | // As soon as the handle is dropped, the data becomes inaccessible and another disk roundtrip would 10 | // be required to access the data again. 11 | 12 | pub mod node; 13 | -------------------------------------------------------------------------------- /nomt/src/beatree/leaf_cache.rs: -------------------------------------------------------------------------------- 1 | //! The leaf cache stores recently accessed leaf nodes. 2 | 3 | use crate::{ 4 | beatree::{allocator::PageNumber, leaf::node::LeafNode}, 5 | io::PAGE_SIZE, 6 | }; 7 | use lru::LruCache; 8 | use parking_lot::{Mutex, MutexGuard}; 9 | use std::{collections::hash_map::RandomState, hash::BuildHasher, sync::Arc}; 10 | 11 | /// A cache for leaf nodes. 12 | /// 13 | /// This is cheap to clone. 14 | #[derive(Clone)] 15 | pub struct LeafCache { 16 | inner: Arc<Shared>, 17 | } 18 | 19 | impl LeafCache { 20 | /// Create a new cache with the given number of shards and the maximum size of the cache 21 | /// in MiB. `shards` must be non-zero.
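// A minimal usage sketch (illustrative only; `pn` is a `PageNumber` and `leaf` an
// `Arc<LeafNode>` obtained elsewhere):
//
//     let cache = LeafCache::new(4, 256); // 4 shards, 256 MiB budget
//     cache.insert(pn, leaf);
//     assert!(cache.get(pn).is_some());
//     cache.evict(); // trim each shard back down to its per-shard item budget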
22 | pub fn new(shards: usize, leaf_cache_size: usize) -> Self { 23 | let max_items = (leaf_cache_size * 1024 * 1024) / PAGE_SIZE; 24 | let items_per_shard = max_items / shards; 25 | LeafCache { 26 | inner: Arc::new(Shared { 27 | shards: (0..shards) 28 | .map(|_| Shard { 29 | cache: LruCache::unbounded(), 30 | max_items: items_per_shard, 31 | }) 32 | .map(Mutex::new) 33 | .collect::<Vec<_>>(), 34 | shard_assigner: RandomState::new(), 35 | }), 36 | } 37 | } 38 | 39 | /// Get a cache entry, updating the LRU state. 40 | pub fn get(&self, page_number: PageNumber) -> Option<Arc<LeafNode>> { 41 | let mut shard = self.inner.shard_for(page_number); 42 | 43 | shard.cache.get(&page_number).map(|x| x.clone()) 44 | } 45 | 46 | /// Insert a cache entry. This does not evict anything. 47 | pub fn insert(&self, page_number: PageNumber, node: Arc<LeafNode>) { 48 | let mut shard = self.inner.shard_for(page_number); 49 | 50 | shard.cache.put(page_number, node); 51 | } 52 | 53 | /// Evict all excess items from the cache. 54 | pub fn evict(&self) { 55 | for shard in &self.inner.shards { 56 | let mut shard = shard.lock(); 57 | while shard.cache.len() > shard.max_items { 58 | let _ = shard.cache.pop_lru(); 59 | } 60 | } 61 | } 62 | } 63 | 64 | struct Shared { 65 | shards: Vec<Mutex<Shard>>, 66 | shard_assigner: RandomState, 67 | } 68 | 69 | impl Shared { 70 | fn shard_for(&self, page_number: PageNumber) -> MutexGuard<'_, Shard> { 71 | self.shards[self.shard_index_for(page_number)].lock() 72 | } 73 | 74 | fn shard_index_for(&self, page_number: PageNumber) -> usize { 75 | (self.shard_assigner.hash_one(page_number.0) as usize) % self.shards.len() 76 | } 77 | } 78 | 79 | struct Shard { 80 | cache: LruCache<PageNumber, Arc<LeafNode>>, 81 | max_items: usize, 82 | } 83 | -------------------------------------------------------------------------------- /nomt/src/beatree/writeout.rs: -------------------------------------------------------------------------------- 1 | //! The writeout logic for beatree. 2 | 3 | // As part of beatree writeout, we need to write BBN and LN files, resizing them to the correct 4 | // size beforehand. After the writes are completed (fsync'd), we wait for the MANIFEST to be 5 | // updated and then perform some cleanup. 6 | 7 | use super::allocator::{PageNumber, Store}; 8 | use crate::io::{FatPage, IoHandle}; 9 | 10 | pub fn submit_freelist_write( 11 | io_handle: &IoHandle, 12 | store: &Store, 13 | free_list_pages: Vec<(PageNumber, FatPage)>, 14 | ) { 15 | for (pn, page) in free_list_pages { 16 | io_handle 17 | .send(crate::io::IoCommand { 18 | kind: crate::io::IoKind::Write(store.store_fd(), pn.0 as u64, page), 19 | user_data: 0, 20 | }) 21 | .unwrap(); 22 | } 23 | } 24 | -------------------------------------------------------------------------------- /nomt/src/bitbox/ht_file.rs: -------------------------------------------------------------------------------- 1 | /// The HT file. 2 | /// 3 | /// The file that stores the hash-table buckets and the meta map. 4 | use super::meta_map::MetaMap; 5 | use crate::io::{self, PagePool, PAGE_SIZE}; 6 | use std::{ 7 | fs::{File, OpenOptions}, 8 | path::PathBuf, 9 | }; 10 | 11 | /// The offsets of the HT file. 12 | #[derive(Clone)] 13 | pub struct HTOffsets { 14 | // the number of pages to add to a page number to find its real location in the file, 15 | // taking account of the meta page and meta byte pages. 16 | data_page_offset: u64, 17 | } 18 | 19 | impl HTOffsets { 20 | /// Returns the page number of the `ix`th item in the data section of the store.
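// For example (illustrative numbers): with `num_pages = 8192` buckets, the meta bytes
// occupy num_meta_byte_pages(8192) = 2 pages at the start of the file, so
// `data_page_offset = 2`; bucket `ix` then lives at file page `2 + ix`, while meta byte
// page `ix` stays at file page `ix`.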
21 | pub fn data_page_index(&self, ix: u64) -> u64 { 22 | self.data_page_offset + ix 23 | } 24 | 25 | /// Returns the page number of the `ix`th item in the meta bytes section of the store. 26 | pub fn meta_bytes_index(&self, ix: u64) -> u64 { 27 | ix 28 | } 29 | } 30 | 31 | fn expected_file_len(num_pages: u32) -> u64 { 32 | (num_meta_byte_pages(num_pages) + num_pages) as u64 * PAGE_SIZE as u64 33 | } 34 | 35 | fn num_meta_byte_pages(num_pages: u32) -> u32 { 36 | (num_pages + 4095) / PAGE_SIZE as u32 37 | } 38 | 39 | /// Opens the HT file, checks its length and reads the meta map. 40 | pub fn open( 41 | num_pages: u32, 42 | page_pool: &PagePool, 43 | ht_fd: &File, 44 | ) -> anyhow::Result<(HTOffsets, MetaMap)> { 45 | if ht_fd.metadata()?.len() != expected_file_len(num_pages) { 46 | anyhow::bail!("Store corrupted; unexpected file length"); 47 | } 48 | 49 | let num_meta_byte_pages = num_meta_byte_pages(num_pages); 50 | let mut meta_bytes = Vec::with_capacity(num_meta_byte_pages as usize * PAGE_SIZE); 51 | for pn in 0..num_meta_byte_pages { 52 | let extra_meta_page = io::read_page(page_pool, ht_fd, pn as u64)?; 53 | meta_bytes.extend_from_slice(&*extra_meta_page); 54 | } 55 | 56 | let data_page_offset = num_meta_byte_pages as u64; 57 | Ok(( 58 | HTOffsets { data_page_offset }, 59 | MetaMap::from_bytes(meta_bytes, num_pages as usize), 60 | )) 61 | } 62 | 63 | /// Creates the store file. Fails if store file already exists. 64 | /// 65 | /// Lays out the meta page. If `preallocate` is true, preallocates the blocks for the file. 66 | pub fn create(path: PathBuf, num_pages: u32, preallocate: bool) -> std::io::Result<()> { 67 | let ht_path = path.join("ht"); 68 | let ht_file = OpenOptions::new().write(true).create(true).open(ht_path)?; 69 | 70 | // number of pages + pages required for meta bits. 71 | let page_count = num_pages + num_meta_byte_pages(num_pages); 72 | let len = page_count as usize * PAGE_SIZE; 73 | 74 | resize_and_prealloc(&ht_file, len as u64, preallocate)?; 75 | 76 | ht_file.sync_all()?; 77 | drop(ht_file); 78 | 79 | let wal_path = path.join("wal"); 80 | let wal_file = OpenOptions::new().write(true).create(true).open(wal_path)?; 81 | wal_file.sync_all()?; 82 | drop(wal_file); 83 | Ok(()) 84 | } 85 | 86 | /// Sets the file size and attempts to preallocate the file if `preallocate` is true. 87 | /// 88 | /// Returns an error if setting the file size fails. File preallocation is done on a best-effort basis 89 | /// and may silently fall back to regular allocation. 90 | /// 91 | /// After this call, if successful, the file size is set to `len` bytes. 92 | fn resize_and_prealloc(ht_file: &File, len: u64, preallocate: bool) -> std::io::Result<()> { 93 | if !preallocate { 94 | // If not preallocating, just set the file size and return. 95 | ht_file.set_len(len)?; 96 | return Ok(()); 97 | } 98 | 99 | cfg_if::cfg_if! { 100 | if #[cfg(target_os = "linux")] { 101 | // To preallocate on Linux systems, try using fallocate with ZERO_RANGE first as it's more 102 | // efficient. fallocate sets the file size as well, so ftruncate (aka file.set_len()) is 103 | // not needed. 104 | if crate::sys::linux::fs_check(ht_file).map_or(false, |fsck| fsck.is_tmpfs()) { 105 | // Skip preallocation for tmpfs. It doesn't support fallocate and it's 106 | // memory-backed anyway. ftruncate and bail. 107 | ht_file.set_len(len)?; 108 | return Ok(()); 109 | } 110 | if let Err(_) = crate::sys::linux::falloc_zero_file(ht_file, len) { 111 | // If fallocate fails, fall back to zeroing the file with write. 
112 | resize_and_zero_file(ht_file, len)?; 113 | } 114 | } else { 115 | resize_and_zero_file(ht_file, len)?; 116 | } 117 | } 118 | 119 | Ok(()) 120 | } 121 | 122 | // Fallback method for allocating extents for the file: just incrementally write zeroes to the file. 123 | fn resize_and_zero_file(mut file: &File, len: u64) -> std::io::Result<()> { 124 | use std::io::Write; 125 | 126 | // Set the file size first. 127 | file.set_len(len)?; 128 | 129 | // Zero the file. 130 | let len = len as usize; 131 | let buf = [0u8; PAGE_SIZE * 4]; 132 | let mut remaining = len; 133 | while remaining > 0 { 134 | let len = std::cmp::min(remaining, buf.len()); 135 | file.write_all(&buf[..len])?; 136 | remaining -= len; 137 | } 138 | Ok(()) 139 | } 140 | -------------------------------------------------------------------------------- /nomt/src/bitbox/meta_map.rs: -------------------------------------------------------------------------------- 1 | //! In-memory metadata for each bucket. This is also persisted on disk. 2 | 3 | const EMPTY: u8 = 0b0000_0000; 4 | const TOMBSTONE: u8 = 0b0111_1111; 5 | const FULL_MASK: u8 = 0b1000_0000; 6 | 7 | fn full_entry(hash: u64) -> u8 { 8 | (hash >> 57) as u8 ^ FULL_MASK 9 | } 10 | 11 | pub struct MetaMap { 12 | buckets: usize, 13 | bitvec: Vec<u8>, 14 | } 15 | 16 | impl MetaMap { 17 | // Create a new meta-map from an existing vector. 18 | pub fn from_bytes(meta_bytes: Vec<u8>, buckets: usize) -> Self { 19 | assert_eq!(meta_bytes.len() % 4096, 0); 20 | MetaMap { 21 | buckets, 22 | bitvec: meta_bytes, 23 | } 24 | } 25 | 26 | pub fn full_count(&self) -> usize { 27 | self.bitvec 28 | .iter() 29 | .filter(|&&byte| byte & FULL_MASK != 0) 30 | .count() 31 | } 32 | 33 | pub fn len(&self) -> usize { 34 | self.buckets 35 | } 36 | 37 | pub fn set_full(&mut self, bucket: usize, hash: u64) { 38 | self.bitvec[bucket] = full_entry(hash); 39 | } 40 | 41 | pub fn set_tombstone(&mut self, bucket: usize) { 42 | self.bitvec[bucket] = TOMBSTONE; 43 | } 44 | 45 | // true means definitely empty. 46 | pub fn hint_empty(&self, bucket: usize) -> bool { 47 | self.bitvec[bucket] == EMPTY 48 | } 49 | 50 | // true means definitely a tombstone. 51 | pub fn hint_tombstone(&self, bucket: usize) -> bool { 52 | self.bitvec[bucket] == TOMBSTONE 53 | } 54 | 55 | // returns true if it's definitely not a match. 56 | pub fn hint_not_match(&self, bucket: usize, raw_hash: u64) -> bool { 57 | self.bitvec[bucket] != full_entry(raw_hash) 58 | } 59 | 60 | // get the page index of a bucket in the meta-map. 61 | pub fn page_index(&self, bucket: usize) -> usize { 62 | bucket / 4096 63 | } 64 | 65 | // get a page-sized slice of the metamap.
This is guaranteed to have len 4096 66 | pub fn page_slice(&self, page_index: usize) -> &[u8] { 67 | let start = page_index * 4096; 68 | let end = start + 4096; 69 | &self.bitvec[start..end] 70 | } 71 | } 72 | -------------------------------------------------------------------------------- /nomt/src/bitbox/wal/mod.rs: -------------------------------------------------------------------------------- 1 | const WAL_ENTRY_TAG_START: u8 = 1; 2 | const WAL_ENTRY_TAG_END: u8 = 2; 3 | const WAL_ENTRY_TAG_CLEAR: u8 = 3; 4 | const WAL_ENTRY_TAG_UPDATE: u8 = 4; 5 | 6 | pub use read::{WalBlobReader, WalEntry}; 7 | pub use write::WalBlobBuilder; 8 | 9 | mod read; 10 | mod write; 11 | 12 | #[cfg(test)] 13 | mod tests; 14 | -------------------------------------------------------------------------------- /nomt/src/bitbox/wal/tests.rs: -------------------------------------------------------------------------------- 1 | use super::{WalBlobBuilder, WalBlobReader, WalEntry}; 2 | use crate::{io::page_pool::PagePool, merkle::ElidedChildren, page_diff::PageDiff}; 3 | use std::{fs::OpenOptions, io::Write as _}; 4 | 5 | #[test] 6 | fn test_write_read() { 7 | let tempdir = tempfile::tempdir().unwrap(); 8 | let wal_filename = tempdir.path().join("wal"); 9 | std::fs::create_dir_all(tempdir.path()).unwrap(); 10 | let mut wal_fd = { 11 | let mut options = OpenOptions::new(); 12 | options.read(true).write(true).create(true); 13 | options.open(&wal_filename).unwrap() 14 | }; 15 | 16 | let mut builder = WalBlobBuilder::new().unwrap(); 17 | builder.reset(69); 18 | builder.write_clear(0); 19 | builder.write_update( 20 | [0; 32], 21 | &PageDiff::from_bytes(hex_literal::hex!( 22 | "00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00" 23 | )) 24 | .unwrap(), 25 | vec![].into_iter(), 26 | ElidedChildren::new(), 27 | 0, 28 | ); 29 | builder.write_clear(1); 30 | builder.write_update( 31 | [1; 32], 32 | &PageDiff::from_bytes(hex_literal::hex!( 33 | "01 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00" 34 | )) 35 | .unwrap(), 36 | vec![[1; 32]].into_iter(), 37 | ElidedChildren::from_bytes([1, 0, 0, 0, 0, 0, 0, 0]), 38 | 1, 39 | ); 40 | builder.write_update( 41 | [2; 32], 42 | &{ 43 | let mut diff = PageDiff::default(); 44 | for i in 0..126 { 45 | diff.set_changed(i); 46 | } 47 | diff 48 | }, 49 | (0..126).map(|x| [x; 32]), 50 | ElidedChildren::from_bytes([2, 0, 0, 0, 0, 0, 0, 0]), 51 | 2, 52 | ); 53 | builder.finalize(); 54 | wal_fd.write_all(builder.as_slice()).unwrap(); 55 | wal_fd.sync_data().unwrap(); 56 | 57 | let page_pool = PagePool::new(); 58 | let mut reader = WalBlobReader::new(&page_pool, &wal_fd).unwrap(); 59 | 60 | assert_eq!(reader.sync_seqn(), 69); 61 | assert_eq!( 62 | reader.read_entry().unwrap(), 63 | Some(WalEntry::Clear { bucket: 0 }) 64 | ); 65 | assert_eq!( 66 | reader.read_entry().unwrap(), 67 | Some(WalEntry::Update { 68 | page_id: [0; 32], 69 | page_diff: PageDiff::default(), 70 | changed_nodes: vec![], 71 | elided_children: ElidedChildren::new(), 72 | bucket: 0, 73 | }) 74 | ); 75 | assert_eq!( 76 | reader.read_entry().unwrap(), 77 | Some(WalEntry::Clear { bucket: 1 }) 78 | ); 79 | assert_eq!( 80 | reader.read_entry().unwrap(), 81 | Some(WalEntry::Update { 82 | page_id: [1; 32], 83 | page_diff: { 84 | let mut diff = PageDiff::default(); 85 | diff.set_changed(0); 86 | diff 87 | }, 88 | changed_nodes: vec![[1; 32]], 89 | elided_children: ElidedChildren::from_bytes([1, 0, 0, 0, 0, 0, 0, 0]), 90 | bucket: 1, 91 | }) 92 | ); 93 | assert_eq!( 94 | reader.read_entry().unwrap(), 95 | Some(WalEntry::Update { 96 | page_id: 
[2; 32], 97 | page_diff: { 98 | let mut diff = PageDiff::default(); 99 | for i in 0..126 { 100 | diff.set_changed(i); 101 | } 102 | diff 103 | }, 104 | changed_nodes: (0..126).map(|x| [x; 32]).collect(), 105 | elided_children: ElidedChildren::from_bytes([2, 0, 0, 0, 0, 0, 0, 0]), 106 | bucket: 2, 107 | }) 108 | ); 109 | assert_eq!(reader.read_entry().unwrap(), None); 110 | } 111 | -------------------------------------------------------------------------------- /nomt/src/bitbox/writeout.rs: -------------------------------------------------------------------------------- 1 | //! The writeout logic for bitbox. 2 | 3 | // The logic for writeout is split into three parts: 4 | // - first we write out the wal blob to the WAL file and wait for the MANIFEST to be synced. 5 | // - then we write out the metabits and bucket pages to the HT file. 6 | // - finally, we truncate the WAL file. 7 | 8 | use std::{ 9 | fs::File, 10 | io::{Seek as _, SeekFrom, Write}, 11 | os::fd::AsRawFd as _, 12 | sync::Arc, 13 | }; 14 | 15 | use crate::io::{FatPage, IoCommand, IoHandle, IoKind}; 16 | 17 | pub(super) fn write_wal(mut wal_fd: &File, wal_blob: &[u8]) -> std::io::Result<()> { 18 | wal_fd.set_len(0)?; 19 | wal_fd.seek(SeekFrom::Start(0))?; 20 | wal_fd.write_all(wal_blob)?; 21 | wal_fd.sync_all()?; 22 | Ok(()) 23 | } 24 | 25 | /// Truncates the WAL file to zero length. 26 | /// 27 | /// Conditionally syncs the file to disk. 28 | pub(super) fn truncate_wal(mut wal_fd: &File, do_sync: bool) -> std::io::Result<()> { 29 | wal_fd.set_len(0)?; 30 | wal_fd.seek(SeekFrom::Start(0))?; 31 | if do_sync { 32 | wal_fd.sync_all()?; 33 | } 34 | Ok(()) 35 | } 36 | 37 | pub(super) fn write_ht( 38 | io_handle: IoHandle, 39 | ht_fd: &File, 40 | mut ht: Vec<(u64, Arc<FatPage>)>, 41 | ) -> std::io::Result<()> { 42 | let mut sent = 0; 43 | 44 | ht.sort_unstable_by_key(|item| item.0); 45 | for (pn, page) in ht { 46 | io_handle 47 | .send(IoCommand { 48 | kind: IoKind::WriteArc(ht_fd.as_raw_fd(), pn, page), 49 | user_data: 0, 50 | }) 51 | .unwrap(); 52 | sent += 1; 53 | } 54 | 55 | while sent > 0 { 56 | io_handle.recv().unwrap(); 57 | sent -= 1; 58 | } 59 | 60 | ht_fd.sync_all()?; 61 | 62 | Ok(()) 63 | } 64 | -------------------------------------------------------------------------------- /nomt/src/io/fsyncer.rs: -------------------------------------------------------------------------------- 1 | use parking_lot::{Condvar, Mutex}; 2 | use std::{fs::File, sync::Arc}; 3 | 4 | #[derive(Debug)] 5 | enum State { 6 | Idle, 7 | Started, 8 | Done(Result<(), std::io::Error>), 9 | HandleDead, 10 | } 11 | 12 | impl State { 13 | fn force_take_done(&mut self) -> Result<(), std::io::Error> { 14 | let s = std::mem::replace(self, State::Idle); 15 | if let State::Done(res) = s { 16 | res 17 | } else { 18 | panic!("force_take_done called on non-done state"); 19 | } 20 | } 21 | } 22 | 23 | struct Shared { 24 | cv: Condvar, 25 | s: Mutex<State>, 26 | } 27 | 28 | /// Fsyncer is a helper that allows one to fsync a file in a non-blocking manner. 29 | /// 30 | /// It spawns a thread that will fsync the file in the background. 31 | /// 32 | /// The expected usage is from two threads: the one that calls [`Self::fsync`] and the one that calls 33 | /// [`Self::wait`]. 34 | pub struct Fsyncer { 35 | shared: Arc<Shared>, 36 | } 37 | 38 | impl Fsyncer { 39 | /// Creates a new fsyncer with the given file descriptor and identifier.
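// A minimal usage sketch (illustrative only; the file name is made up):
//
//     let fd = Arc::new(File::create("some_file")?);
//     let fsyncer = Fsyncer::new("some_file", fd);
//     fsyncer.fsync();  // non-blocking: wakes the background thread
//     /* ... do other work ... */
//     fsyncer.wait()?;  // blocks until the fsync completes and yields its result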
40 | pub fn new(name: &'static str, fd: Arc) -> Self { 41 | let name = format!("nomt-fsyncer-{}", name); 42 | let shared = Arc::new(Shared { 43 | cv: Condvar::new(), 44 | s: Mutex::new(State::Idle), 45 | }); 46 | let _thread = std::thread::Builder::new() 47 | .name(name) 48 | .spawn({ 49 | let shared = shared.clone(); 50 | move || { 51 | worker(fd, shared); 52 | } 53 | }) 54 | .expect("failed to spawn fsyncer thread"); 55 | Fsyncer { shared } 56 | } 57 | 58 | /// Issues a fsync request. 59 | /// 60 | /// # Panics 61 | /// 62 | /// Panics if there is an outstanding fsync operation that hasn't been consumed by 63 | /// [`Self::wait()`] yet. 64 | /// 65 | /// Make sure to call [`Self::wait()`] to consume any previous fsync result before issuing a new 66 | /// request. 67 | pub fn fsync(&self) { 68 | let mut s_guard = self.shared.s.lock(); 69 | assert!(matches!(&*s_guard, State::Idle)); 70 | *s_guard = State::Started; 71 | self.shared.cv.notify_all(); 72 | } 73 | 74 | /// Waits for the fsync to complete and consumes the result. 75 | /// 76 | /// This blocks until a synchronization initiated by [`Self::fsync`] completes. If no fsync has been 77 | /// initiated yet, this will block until one is both started and completed. After consuming the result, 78 | /// subsequent calls will block until the next `fsync()` operation finishes. 79 | pub fn wait(&self) -> Result<(), std::io::Error> { 80 | let mut s_guard = self.shared.s.lock(); 81 | self.shared 82 | .cv 83 | .wait_while(&mut s_guard, |s| !matches!(s, State::Done(_))); 84 | s_guard.force_take_done() 85 | } 86 | } 87 | 88 | impl Drop for Fsyncer { 89 | fn drop(&mut self) { 90 | let mut s_guard = self.shared.s.lock(); 91 | *s_guard = State::HandleDead; 92 | self.shared.cv.notify_all(); 93 | } 94 | } 95 | 96 | fn worker(fd: Arc, shared: Arc) { 97 | let bomb = Bomb; 98 | 'outer: loop { 99 | let mut s_guard = shared.s.lock(); 100 | shared.cv.wait_while(&mut s_guard, |state| { 101 | !matches!(state, State::Started | State::HandleDead) 102 | }); 103 | if matches!(&*s_guard, State::HandleDead) { 104 | break 'outer; 105 | } 106 | assert!(matches!(&*s_guard, State::Started | State::Done(_))); 107 | drop(s_guard); 108 | 109 | let sync_result = fd.sync_all(); 110 | 111 | let mut s_guard = shared.s.lock(); 112 | if matches!(&*s_guard, State::HandleDead) { 113 | break 'outer; 114 | } 115 | *s_guard = State::Done(sync_result); 116 | shared.cv.notify_all(); 117 | } 118 | bomb.defuse(); 119 | 120 | struct Bomb; 121 | impl Bomb { 122 | fn defuse(self) { 123 | std::mem::forget(self); 124 | } 125 | } 126 | impl Drop for Bomb { 127 | fn drop(&mut self) { 128 | panic!("worker panicked"); 129 | } 130 | } 131 | } 132 | -------------------------------------------------------------------------------- /nomt/src/io/unix.rs: -------------------------------------------------------------------------------- 1 | use super::{CompleteIo, IoCommand, IoKind, IoKindResult, IoPacket, PagePool, PAGE_SIZE}; 2 | use crossbeam_channel::{Receiver, Sender}; 3 | use threadpool::ThreadPool; 4 | 5 | pub fn start_io_worker( 6 | page_pool: PagePool, 7 | io_workers_tp: &ThreadPool, 8 | io_workers: usize, 9 | ) -> Sender { 10 | let (command_tx, command_rx) = crossbeam_channel::unbounded(); 11 | 12 | for _ in 0..io_workers { 13 | spawn_worker_thread(page_pool.clone(), io_workers_tp, command_rx.clone()); 14 | } 15 | 16 | command_tx 17 | } 18 | 19 | fn spawn_worker_thread( 20 | page_pool: PagePool, 21 | io_workers_tp: &ThreadPool, 22 | command_rx: Receiver, 23 | ) { 24 | let work = move || loop { 25 
| let Ok(packet) = command_rx.recv() else { 26 | // Why the `drop` here? 27 | // 28 | // `command_rx` receives the IoPacket's which are ultimately parameterized by buffers. 29 | // Those buffers are allocated in the `page_pool`. If the `page_pool` is deallocated 30 | // before this worker thread is done, that's a use-after-free. 31 | // 32 | // So in other words, we plumb `page_pool` all the way here and drop it here only to 33 | // ensure safety. 34 | drop(page_pool); 35 | return; 36 | }; 37 | let complete = execute(packet.command); 38 | let _ = packet.completion_sender.send(complete); 39 | }; 40 | 41 | io_workers_tp.execute(work); 42 | } 43 | 44 | fn execute(mut command: IoCommand) -> CompleteIo { 45 | let result = loop { 46 | let res = match command.kind { 47 | IoKind::Read(fd, page_index, ref mut page) => unsafe { 48 | libc::pread( 49 | fd, 50 | page.as_mut_ptr() as *mut libc::c_void, 51 | PAGE_SIZE as libc::size_t, 52 | (page_index * PAGE_SIZE as u64) as libc::off_t, 53 | ) 54 | }, 55 | IoKind::Write(fd, page_index, ref page) => unsafe { 56 | libc::pwrite( 57 | fd, 58 | page.as_ptr() as *const libc::c_void, 59 | PAGE_SIZE as libc::size_t, 60 | (page_index * PAGE_SIZE as u64) as libc::off_t, 61 | ) 62 | }, 63 | IoKind::WriteArc(fd, page_index, ref page) => unsafe { 64 | let page: &[u8] = &*page; 65 | libc::pwrite( 66 | fd, 67 | page.as_ptr() as *const libc::c_void, 68 | PAGE_SIZE as libc::size_t, 69 | (page_index * PAGE_SIZE as u64) as libc::off_t, 70 | ) 71 | }, 72 | IoKind::WriteRaw(fd, page_index, ref mut page) => unsafe { 73 | libc::pwrite( 74 | fd, 75 | page.as_ptr() as *const libc::c_void, 76 | PAGE_SIZE as libc::size_t, 77 | (page_index * PAGE_SIZE as u64) as libc::off_t, 78 | ) 79 | }, 80 | }; 81 | match command.kind.get_result(res) { 82 | IoKindResult::Ok => break Ok(()), 83 | IoKindResult::Err => break Err(std::io::Error::last_os_error()), 84 | IoKindResult::Retry => (), 85 | } 86 | }; 87 | 88 | CompleteIo { command, result } 89 | } 90 | -------------------------------------------------------------------------------- /nomt/src/merkle/cache_prepopulate.rs: -------------------------------------------------------------------------------- 1 | //! Utility for prepopulating the first N layers of the cache. 2 | 3 | use std::io; 4 | 5 | use crate::{ 6 | io::IoHandle, 7 | page_cache::{PageCache, PageMut}, 8 | store::{PageLoad, PageLoader, Store}, 9 | }; 10 | 11 | use nomt_core::page_id::{ChildPageIndex, PageId, MAX_PAGE_DEPTH, NUM_CHILDREN, ROOT_PAGE_ID}; 12 | 13 | /// Prepopulate the given number of levels of the page tree into the page cache. 14 | /// 15 | /// This function blocks until the prepopulation has finished. 16 | pub fn prepopulate( 17 | io_handle: IoHandle, 18 | page_cache: &PageCache, 19 | store: &Store, 20 | levels: usize, 21 | ) -> io::Result<()> { 22 | let page_loader = store.page_loader(); 23 | let mut loads = Vec::new(); 24 | 25 | let levels = std::cmp::min(levels, MAX_PAGE_DEPTH); 26 | 27 | // dispatch all page loads recursively. 28 | dispatch_recursive(ROOT_PAGE_ID, &page_loader, &io_handle, &mut loads, levels)?; 29 | 30 | let mut completed = 0; 31 | 32 | // wait on I/O results. 33 | while completed < loads.len() { 34 | // UNWRAP: we don't expect the I/O pool to go down. fatal error. 35 | let complete_io = io_handle.recv().expect("I/O Pool Down"); 36 | complete_io.result?; 37 | let load_index = complete_io.command.user_data as usize; 38 | let load = &mut loads[load_index]; 39 | 40 | // UNWRAP: all submitted requests are of kind Read(FatPage). 
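        // A completed read resolves in one of three ways: the probed bucket held the
        // page (insert it into the cache), the probe missed (re-probe the next
        // candidate bucket), or no candidate buckets remain (the page is known empty).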
41 | if let Some((page, bucket)) = load.try_complete(complete_io.command.kind.unwrap_buf()) { 42 | completed += 1; 43 | page_cache.insert( 44 | load.page_id().clone(), 45 | PageMut::pristine_with_data(page).freeze(), 46 | bucket, 47 | ); 48 | } else { 49 | // misprobe. try again. 50 | if !page_loader.probe(load, &io_handle, complete_io.command.user_data) { 51 | // guaranteed empty. 52 | completed += 1; 53 | } 54 | } 55 | } 56 | 57 | Ok(()) 58 | } 59 | 60 | // dispatch page loads for all the children of the given page. 61 | fn dispatch_recursive( 62 | page_id: PageId, 63 | page_loader: &PageLoader, 64 | io_handle: &IoHandle, 65 | loads: &mut Vec, 66 | levels_remaining: usize, 67 | ) -> io::Result<()> { 68 | if levels_remaining == 0 { 69 | return Ok(()); 70 | } 71 | 72 | for child_index in 0..NUM_CHILDREN { 73 | // UNWRAP: all indices up to NUM_CHILDREN are allowed. 74 | let child_index = ChildPageIndex::new(child_index as u8).unwrap(); 75 | 76 | // UNWRAP: depth is not out of bounds and child index is valid. 77 | let child_page_id = page_id.child_page_id(child_index).unwrap(); 78 | 79 | let mut page_load = page_loader.start_load(child_page_id.clone()); 80 | 81 | let next_index = loads.len() as u64; 82 | if page_loader.probe(&mut page_load, io_handle, next_index) { 83 | // probe has been dispatched. 84 | loads.push(page_load); 85 | dispatch_recursive( 86 | child_page_id, 87 | page_loader, 88 | io_handle, 89 | loads, 90 | levels_remaining - 1, 91 | )?; 92 | } 93 | } 94 | 95 | Ok(()) 96 | } 97 | -------------------------------------------------------------------------------- /nomt/src/merkle/page_set.rs: -------------------------------------------------------------------------------- 1 | //! A set of pages that the page walker draws upon and which is filled by `Seek`ing. 2 | 3 | use nomt_core::page_id::PageId; 4 | use std::{collections::HashMap, sync::Arc}; 5 | 6 | use super::BucketInfo; 7 | use crate::{ 8 | io::PagePool, 9 | page_cache::{Page, PageMut}, 10 | page_diff::PageDiff, 11 | }; 12 | 13 | /// A page in the [`PageSet`] can have two different origins. 14 | #[derive(Clone)] 15 | pub enum PageOrigin { 16 | /// It could have been fetched from the hash table, thereby having an associated `BucketInfo`. 17 | Persisted(BucketInfo), 18 | /// It could have been reconstructed on the fly without being stored anywhere. 19 | /// It keeps track of the total number of leaves in child pages and which nodes 20 | /// in the page have been reconstructed. 21 | Reconstructed(u64, PageDiff), 22 | } 23 | 24 | impl PageOrigin { 25 | /// Extract `BucketInfo` from [`PageOrigin::Persisted`] variant. 26 | pub fn bucket_info(self) -> Option { 27 | match self { 28 | PageOrigin::Persisted(bucket_info) => Some(bucket_info), 29 | PageOrigin::Reconstructed(_, _) => None, 30 | } 31 | } 32 | 33 | /// Extract the number of leaves from [`PageOrigin::Reconstructed`] variant. 34 | pub fn leaves_counter(&self) -> Option { 35 | match self { 36 | PageOrigin::Reconstructed(counter, _) => Some(*counter), 37 | PageOrigin::Persisted(_) => None, 38 | } 39 | } 40 | 41 | /// Extract the [`PageDiff`] from [`PageOrigin::Reconstructed`] variant. 
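    ///
    /// Returns `None` for pages that were loaded from the hash table
    /// ([`PageOrigin::Persisted`]), since only reconstructed pages track which
    /// nodes were rebuilt.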
42 | pub fn page_diff(&self) -> Option<&PageDiff> { 43 | match self { 44 | PageOrigin::Reconstructed(_, page_diff) => Some(page_diff), 45 | PageOrigin::Persisted(_) => None, 46 | } 47 | } 48 | } 49 | 50 | pub struct PageSet { 51 | map: HashMap, 52 | warm_up_map: Option>>, 53 | page_pool: PagePool, 54 | } 55 | 56 | impl PageSet { 57 | pub fn new(page_pool: PagePool, warmed_up: Option) -> Self { 58 | PageSet { 59 | map: HashMap::new(), 60 | page_pool, 61 | warm_up_map: warmed_up.map(|x| x.0), 62 | } 63 | } 64 | 65 | /// Freeze this page-set and make a shareable version of it. This returns a frozen page set 66 | /// containing all insertions into this map. 67 | pub fn freeze(self) -> FrozenSharedPageSet { 68 | FrozenSharedPageSet(Arc::new(self.map)) 69 | } 70 | 71 | fn get_warmed_up(&self, page_id: &PageId) -> Option<(Page, PageOrigin)> { 72 | self.warm_up_map 73 | .as_ref() 74 | .and_then(|m| m.get(page_id)) 75 | .map(|(p, b)| (p.clone(), b.clone())) 76 | } 77 | } 78 | 79 | impl super::page_walker::PageSet for PageSet { 80 | fn fresh(&self, page_id: &PageId) -> PageMut { 81 | let page = PageMut::pristine_empty(&self.page_pool, &page_id); 82 | page 83 | } 84 | 85 | fn contains(&self, page_id: &PageId) -> bool { 86 | self.map.contains_key(&page_id) 87 | } 88 | 89 | fn get(&self, page_id: &PageId) -> Option<(Page, PageOrigin)> { 90 | self.map 91 | .get(&page_id) 92 | .map(|(p, bucket_info)| (p.clone(), bucket_info.clone())) 93 | .or_else(|| self.get_warmed_up(page_id)) 94 | } 95 | 96 | fn insert(&mut self, page_id: PageId, page: Page, page_origin: PageOrigin) { 97 | self.map.insert(page_id, (page, page_origin)); 98 | } 99 | } 100 | 101 | /// A frozen, shared page set. This is cheap to clone. 102 | #[derive(Clone)] 103 | pub struct FrozenSharedPageSet(Arc>); 104 | -------------------------------------------------------------------------------- /nomt/src/metrics.rs: -------------------------------------------------------------------------------- 1 | use std::sync::{ 2 | atomic::{AtomicU64, Ordering}, 3 | Arc, 4 | }; 5 | 6 | /// Metrics collector, if active, it provides Counters and Timers 7 | #[derive(Clone)] 8 | pub struct Metrics { 9 | metrics: Option>, 10 | } 11 | 12 | /// Metrics that can be collected during execution 13 | #[derive(PartialEq, Eq, Hash)] 14 | pub enum Metric { 15 | /// Counter of total page requests 16 | PageRequests, 17 | /// Counter of page requests cache misses over all page requests 18 | PageCacheMisses, 19 | /// Timer used to record average page fetch time 20 | PageFetchTime, 21 | /// Timer used to record average value fetch time during reads 22 | ValueFetchTime, 23 | } 24 | 25 | struct ActiveMetrics { 26 | page_requests: AtomicU64, 27 | page_cache_misses: AtomicU64, 28 | page_fetch_time: Timer, 29 | value_fetch_time: Timer, 30 | } 31 | 32 | impl Metrics { 33 | /// Returns the Metrics object, active or not based on the specified input 34 | pub fn new(active: bool) -> Self { 35 | Self { 36 | metrics: if active { 37 | Some(Arc::new(ActiveMetrics { 38 | page_requests: AtomicU64::new(0), 39 | page_cache_misses: AtomicU64::new(0), 40 | page_fetch_time: Timer::new(), 41 | value_fetch_time: Timer::new(), 42 | })) 43 | } else { 44 | None 45 | }, 46 | } 47 | } 48 | 49 | /// Increase the Counter specified by the input 50 | /// 51 | /// panics if the specified [`Metric`] is not a Counter 52 | pub fn count(&self, metric: Metric) { 53 | if let Some(ref metrics) = self.metrics { 54 | let counter = match metric { 55 | Metric::PageRequests => &metrics.page_requests, 56 | 
Metric::PageCacheMisses => &metrics.page_cache_misses, 57 | _ => panic!("Specified metric is not a Counter"), 58 | }; 59 | 60 | counter.fetch_add(1, Ordering::Relaxed); 61 | } 62 | } 63 | 64 | /// Returns a guard that, when dropped, will record the time passed since creation 65 | /// 66 | /// panics if the specified [`Metric`] is not a Timer 67 | pub fn record<'a>(&'a self, metric: Metric) -> Option { 68 | self.metrics.as_ref().and_then(|metrics| { 69 | let timer = match metric { 70 | Metric::PageFetchTime => &metrics.page_fetch_time, 71 | Metric::ValueFetchTime => &metrics.value_fetch_time, 72 | _ => panic!("Specified metric is not a Timer"), 73 | }; 74 | 75 | Some(timer.record()) 76 | }) 77 | } 78 | 79 | /// Print collected metrics to stdout 80 | pub fn print(&self) { 81 | if let Some(ref metrics) = self.metrics { 82 | println!("metrics"); 83 | 84 | let tot_page_requests = metrics.page_requests.load(Ordering::Relaxed); 85 | println!(" page requests {}", tot_page_requests); 86 | 87 | if tot_page_requests != 0 { 88 | let cache_misses = metrics.page_cache_misses.load(Ordering::Relaxed); 89 | let percentage_cache_misses = 90 | (cache_misses as f64 / tot_page_requests as f64) * 100.0; 91 | 92 | println!( 93 | " page cache misses {} - {:.2}% of page requests", 94 | cache_misses, percentage_cache_misses 95 | ); 96 | } 97 | 98 | if let Some(mean) = metrics.page_fetch_time.mean() { 99 | println!(" page fetch mean {}", pretty_display_ns(mean)); 100 | } 101 | 102 | if let Some(mean) = metrics.value_fetch_time.mean() { 103 | println!(" value fetch mean {}", pretty_display_ns(mean)); 104 | } 105 | } else { 106 | println!("Metrics collection was not activated") 107 | } 108 | } 109 | } 110 | 111 | fn pretty_display_ns(ns: u64) -> String { 112 | // preserve 3 sig figs at minimum. 113 | let (val, unit) = if ns > 100 * 1_000_000_000 { 114 | (ns / 1_000_000_000, "s") 115 | } else if ns > 100 * 1_000_000 { 116 | (ns / 1_000_000, "ms") 117 | } else if ns > 100 * 1_000 { 118 | (ns / 1_000, "us") 119 | } else { 120 | (ns, "ns") 121 | }; 122 | 123 | format!("{val} {unit}") 124 | } 125 | 126 | struct Timer { 127 | number_of_records: AtomicU64, 128 | sum: AtomicU64, 129 | } 130 | 131 | impl Timer { 132 | fn new() -> Self { 133 | Timer { 134 | number_of_records: AtomicU64::new(0), 135 | sum: AtomicU64::new(0), 136 | } 137 | } 138 | 139 | fn mean(&self) -> Option { 140 | let n = self.number_of_records.load(Ordering::Relaxed); 141 | let sum = self.sum.load(Ordering::Relaxed); 142 | sum.checked_div(n) 143 | } 144 | 145 | fn record<'a>(&'a self) -> impl Drop + 'a { 146 | struct TimerGuard<'a> { 147 | start: std::time::Instant, 148 | n: &'a AtomicU64, 149 | sum: &'a AtomicU64, 150 | } 151 | 152 | impl Drop for TimerGuard<'_> { 153 | fn drop(&mut self) { 154 | let elapsed = self.start.elapsed().as_nanos() as u64; 155 | self.n.fetch_add(1, Ordering::Relaxed); 156 | self.sum.fetch_add(elapsed, Ordering::Relaxed); 157 | } 158 | } 159 | 160 | TimerGuard { 161 | start: std::time::Instant::now(), 162 | n: &self.number_of_records, 163 | sum: &self.sum, 164 | } 165 | } 166 | } 167 | -------------------------------------------------------------------------------- /nomt/src/rollback/delta.rs: -------------------------------------------------------------------------------- 1 | use nomt_core::trie::KeyPath; 2 | use std::{ 3 | collections::HashMap, 4 | io::{Cursor, Read as _}, 5 | }; 6 | 7 | /// A delta that should be applied to reverse a commit. 
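///
/// The byte layout produced by [`Delta::encode`] is, in sketch form:
///
/// ```text
/// u32 (LE)        number of keys to erase
/// [u8; 32] * N    key paths that did not exist before the commit
/// u32 (LE)        number of keys to reinstate
/// then, for each reinstated key:
///   [u8; 32]      key path
///   u32 (LE)      prior value length
///   bytes         prior value
/// ```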
8 | #[derive(Debug, Clone)] 9 | pub struct Delta { 10 | /// This map contains the prior value for each key that was written by the commit this delta 11 | /// reverses. `None` indicates that the key did not exist before the commit. 12 | pub(crate) priors: HashMap>>, 13 | } 14 | 15 | impl Delta { 16 | #[cfg(test)] 17 | fn empty() -> Self { 18 | Self { 19 | priors: HashMap::new(), 20 | } 21 | } 22 | 23 | /// Encode the delta into a buffer. 24 | /// 25 | /// Returns the number of bytes written. 26 | pub(super) fn encode(&self) -> Vec { 27 | // The serialization format has the following layout. 28 | // 29 | // The keys are split into two groups and written as separate arrays. Those groups are: 30 | // 31 | // 1. erase: The keys that did not exist before the commit. 32 | // 2. reinstateThe keys that had prior values. 33 | // 34 | // The keys that did not exist are written first. The keys that had prior values are 35 | // written second. 36 | // 37 | // For each kind of key, we first write out the length of the array encoded as a u32. 38 | // This is followed by the keys themselves, written contiguously in little-endian order. 39 | // 40 | // The keys are written as 32-byte big-endian values. 41 | 42 | // Sort the keys into two groups. 43 | let mut to_erase = Vec::with_capacity(self.priors.len()); 44 | let mut to_reinstate = Vec::with_capacity(self.priors.len()); 45 | for (key, value) in self.priors.iter() { 46 | match value { 47 | None => to_erase.push(key), 48 | Some(value) => to_reinstate.push((key, value)), 49 | } 50 | } 51 | 52 | let to_erase_len = to_erase.len() as u32; 53 | let mut buf = Vec::with_capacity(4 + 32 * to_erase.len()); 54 | buf.extend_from_slice(&to_erase_len.to_le_bytes()); 55 | for key in to_erase { 56 | buf.extend_from_slice(&key[..]); 57 | } 58 | 59 | let to_reinstate_len = to_reinstate.len() as u32; 60 | buf.extend_from_slice(&to_reinstate_len.to_le_bytes()); 61 | for (key, value) in to_reinstate { 62 | buf.extend_from_slice(&key[..]); 63 | let value_len = value.len() as u32; 64 | buf.extend_from_slice(&value_len.to_le_bytes()); 65 | buf.extend_from_slice(value); 66 | } 67 | 68 | buf 69 | } 70 | 71 | /// Decodes the delta from a buffer. 72 | pub(super) fn decode(reader: &mut Cursor>) -> anyhow::Result { 73 | let mut priors = HashMap::new(); 74 | 75 | // Read the number of keys to erase. 76 | let mut buf = [0; 4]; 77 | reader.read_exact(&mut buf)?; 78 | let to_erase_len = u32::from_le_bytes(buf); 79 | // Read the keys to erase. 80 | for _ in 0..to_erase_len { 81 | let mut key_path = [0; 32]; 82 | reader.read_exact(&mut key_path)?; 83 | let preemted = priors.insert(key_path, None).is_some(); 84 | if preemted { 85 | anyhow::bail!("duplicate key path (erase): {:?}", key_path); 86 | } 87 | } 88 | 89 | // Read the number of keys to reinstate. 90 | reader.read_exact(&mut buf)?; 91 | let to_reinsate_len = u32::from_le_bytes(buf); 92 | // Read the keys to reinstate along with their values. 93 | for _ in 0..to_reinsate_len { 94 | // Read the key path. 95 | let mut key_path = [0; 32]; 96 | reader.read_exact(&mut key_path)?; 97 | // Read the value. 
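            // (length-prefixed: a u32 little-endian byte count followed by the bytes)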
98 | let mut value = Vec::new(); 99 | reader.read_exact(&mut buf)?; 100 | let value_len = u32::from_le_bytes(buf); 101 | value.resize(value_len as usize, 0); 102 | reader.read_exact(&mut value)?; 103 | let preempted = priors.insert(key_path, Some(value)).is_some(); 104 | if preempted { 105 | anyhow::bail!("duplicate key path (reinstate): {:?}", key_path); 106 | } 107 | } 108 | Ok(Delta { priors }) 109 | } 110 | } 111 | 112 | #[cfg(test)] 113 | mod tests { 114 | use super::*; 115 | 116 | #[test] 117 | fn delta_roundtrip() { 118 | let mut delta = Delta::empty(); 119 | delta.priors.insert([1; 32], Some(b"value1".to_vec())); 120 | delta.priors.insert([2; 32], None); 121 | delta.priors.insert([3; 32], Some(b"value3".to_vec())); 122 | 123 | let mut buf = delta.encode(); 124 | let mut cursor = Cursor::new(&mut buf); 125 | let delta2 = Delta::decode(&mut cursor).unwrap(); 126 | assert_eq!(delta.priors, delta2.priors); 127 | } 128 | 129 | #[test] 130 | fn delta_roundtrip_empty() { 131 | let delta = Delta::empty(); 132 | let mut buf = delta.encode(); 133 | let mut cursor = Cursor::new(&mut buf); 134 | let delta2 = Delta::decode(&mut cursor).unwrap(); 135 | assert_eq!(delta.priors, delta2.priors); 136 | } 137 | } 138 | -------------------------------------------------------------------------------- /nomt/src/seglog/segment_filename.rs: -------------------------------------------------------------------------------- 1 | use anyhow::{Context, Result}; 2 | 3 | pub fn format(prefix: &str, segment_id: u32) -> String { 4 | // The format string specifies a 10-digit number, so we pad with leading zeros from 5 | // the left. This assumes that segment_id is a 32-bit integer, which is confirmed by 6 | // the assert below. If you came here because it failed due to changing it to u64, 7 | // you will need to update the format string as well. 8 | assert_eq!(segment_id.to_le_bytes().len(), 4); 9 | format!("{prefix}.{segment_id:0>10}.log") 10 | } 11 | 12 | pub fn parse(prefix: &str, filename: &str) -> Result { 13 | // The filename of a segment file consists of a configurable prefix, a 10-digit segment ID, 14 | // and a ".log" suffix. 15 | // 16 | // Example: "prefix.0000000001.log". 
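    //
    // The steps below mirror that shape: strip the prefix, strip the ".log"
    // suffix, strip the separating dot, require exactly 10 decimal digits, and
    // parse them as a u32. Illustratively, parse("wal", "wal.0000000042.log")
    // yields 42.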
17 | // Extract the segment ID from the filename 18 | assert!(!prefix.is_empty()); 19 | let without_prefix = match filename.strip_prefix(prefix) { 20 | Some(s) => s, 21 | None => { 22 | return Err(anyhow::anyhow!( 23 | "Invalid segment filename format: missing prefix" 24 | )) 25 | } 26 | }; 27 | 28 | let without_suffix = match without_prefix.strip_suffix(".log") { 29 | Some(s) => s, 30 | None => { 31 | return Err(anyhow::anyhow!( 32 | "Invalid segment filename format: missing .log suffix" 33 | )) 34 | } 35 | }; 36 | 37 | let segment_id_str = match without_suffix.strip_prefix('.') { 38 | Some(s) => s, 39 | None => { 40 | return Err(anyhow::anyhow!( 41 | "Invalid segment filename format: missing dot separator" 42 | )) 43 | } 44 | }; 45 | 46 | // Check that the segment ID string has exactly 10 digits 47 | if segment_id_str.len() != 10 { 48 | return Err(anyhow::anyhow!( 49 | "Invalid segment filename format: segment ID must be exactly 10 digits" 50 | )); 51 | } 52 | 53 | // Parse the segment ID as a u32 54 | let segment_id = segment_id_str 55 | .parse::() 56 | .context("Failed to parse segment ID")?; 57 | 58 | Ok(segment_id) 59 | } 60 | 61 | #[cfg(test)] 62 | mod tests { 63 | use super::{format, parse}; 64 | 65 | #[test] 66 | fn test_filename_isomorphism() { 67 | let test_cases = vec![ 68 | ("prefix", 0), 69 | ("prefix", 1), 70 | ("prefix", 9999), 71 | ("prefix", u32::MAX), 72 | ("log", 42), 73 | ("segment", 1000000), 74 | ("very_long_prefix_name", 12345), 75 | ("a", 987654321), 76 | ]; 77 | 78 | for (prefix, id) in test_cases { 79 | let filename = format(prefix, id); 80 | let parsed_id = parse(prefix, &filename).unwrap(); 81 | assert_eq!( 82 | id, parsed_id, 83 | "Mismatch for prefix '{}' and id {}", 84 | prefix, id 85 | ); 86 | } 87 | } 88 | 89 | #[test] 90 | fn test_parse_segment_filename_edge_cases() { 91 | // Valid cases 92 | assert_eq!(parse("prefix", "prefix.0000000000.log").unwrap(), 0); 93 | assert_eq!(parse("prefix", "prefix.0000000001.log").unwrap(), 1); 94 | assert_eq!(parse("prefix", "prefix.4294967295.log").unwrap(), u32::MAX); 95 | assert_eq!(parse("a", "a.0000000042.log").unwrap(), 42); 96 | 97 | // Invalid cases 98 | assert!(parse("prefix", "prefix.00000000001.log").is_err()); // Too many digits 99 | assert!(parse("prefix", "prefix.000000001.log").is_err()); // Too few digits 100 | assert!(parse("prefix", "prefix.000000000a.log").is_err()); // Non-numeric ID 101 | assert!(parse("prefix", "prefix.0000000000").is_err()); // Missing .log suffix 102 | assert!(parse("prefix", "prefix0000000000.log").is_err()); // Missing dot after prefix 103 | assert!(parse("prefix", "wrongprefix.0000000000.log").is_err()); // Wrong prefix 104 | assert!(parse("prefix", ".0000000000.log").is_err()); // Missing prefix 105 | assert!(parse("prefix", "prefix..log").is_err()); // Missing ID 106 | assert!(parse("prefix", "prefix.0000000000.wrongsuffix").is_err()); // Wrong suffix 107 | 108 | // Adversarial cases 109 | assert!(parse("prefix", "prefix.0000000000.logx").is_err()); // Extra character after .log 110 | assert!(parse("prefix", "xprefix.0000000000.log").is_err()); // Extra character before prefix 111 | assert!(parse("prefix", "prefix.00000000001log").is_err()); // Missing dot before log 112 | assert!(parse("prefix", "prefix.0000000000.log.").is_err()); // Extra dot at the end 113 | assert!(parse("prefix", "prefix.4294967296.log").is_err()); // ID overflow (u32::MAX + 1) 114 | assert!(parse("prefix", "prefix.0x0000000A.log").is_err()); // Hexadecimal ID 115 | assert_eq!( 116 | 
parse("prefix.with.dots", "prefix.with.dots.0000000000.log").unwrap(), 117 | 0 118 | ); // Prefix with dots 119 | } 120 | } 121 | -------------------------------------------------------------------------------- /nomt/src/store/flock.rs: -------------------------------------------------------------------------------- 1 | //! This module provides a cross-platform advisory lock on a directory. 2 | 3 | use std::{ 4 | fs::{File, OpenOptions}, 5 | path::Path, 6 | }; 7 | 8 | /// Represents a cross-platform advisory lock on a directory. 9 | pub struct Flock { 10 | lock_fd: File, 11 | } 12 | 13 | impl Flock { 14 | pub fn lock(db_dir: &Path, lock_filename: &str) -> anyhow::Result { 15 | let lock_path = db_dir.join(lock_filename); 16 | 17 | let lock_fd = OpenOptions::new() 18 | .read(true) 19 | .write(true) 20 | .create(true) 21 | .open(lock_path)?; 22 | 23 | match crate::sys::unix::try_lock_exclusive(&lock_fd) { 24 | Ok(_) => Ok(Self { lock_fd }), 25 | Err(e) => { 26 | anyhow::bail!("Failed to lock directory: {e}"); 27 | } 28 | } 29 | } 30 | } 31 | 32 | impl Drop for Flock { 33 | fn drop(&mut self) { 34 | if let Err(e) = crate::sys::unix::unlock(&self.lock_fd) { 35 | eprintln!("Failed to unlock directory lock: {e}"); 36 | } 37 | } 38 | } 39 | -------------------------------------------------------------------------------- /nomt/src/store/page_loader.rs: -------------------------------------------------------------------------------- 1 | use crate::{bitbox, io::IoHandle}; 2 | use nomt_core::page_id::PageId; 3 | 4 | pub use bitbox::PageLoad; 5 | 6 | pub struct PageLoader { 7 | pub(super) inner: bitbox::PageLoader, 8 | } 9 | 10 | impl PageLoader { 11 | /// Create a new page load. 12 | pub fn start_load(&self, page_id: PageId) -> PageLoad { 13 | self.inner.start_load(page_id) 14 | } 15 | 16 | /// Advance the state of the given page load, blocking the current thread. 17 | /// 18 | /// Panics if the page load needs a completion or if the I/O pool is down. 19 | /// 20 | /// This returns `true` if the page request has been submitted and a completion will be 21 | /// coming. `false` means that the page is guaranteed to be fresh. 
22 | pub fn probe(&self, load: &mut PageLoad, io_handle: &IoHandle, user_data: u64) -> bool { 23 | self.inner.probe(load, io_handle, user_data) 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /nomt/src/store/sync.rs: -------------------------------------------------------------------------------- 1 | use nomt_core::page_id::PageId; 2 | 3 | use super::{ 4 | meta::{self, Meta}, 5 | DirtyPage, Shared, 6 | }; 7 | use crate::{beatree, bitbox, options::PanicOnSyncMode, page_cache::PageCache, rollback}; 8 | 9 | pub struct Sync { 10 | pub(crate) sync_seqn: u32, 11 | pub(crate) bitbox_num_pages: u32, 12 | pub(crate) bitbox_seed: [u8; 16], 13 | pub(crate) panic_on_sync: Option, 14 | } 15 | 16 | impl Sync { 17 | pub fn new( 18 | sync_seqn: u32, 19 | bitbox_num_pages: u32, 20 | bitbox_seed: [u8; 16], 21 | panic_on_sync: Option, 22 | ) -> Self { 23 | Self { 24 | sync_seqn, 25 | bitbox_num_pages, 26 | bitbox_seed, 27 | panic_on_sync, 28 | } 29 | } 30 | 31 | pub fn sync( 32 | &mut self, 33 | shared: &Shared, 34 | value_tx: impl IntoIterator + Send + 'static, 35 | bitbox: bitbox::DB, 36 | beatree: beatree::Tree, 37 | rollback: Option, 38 | page_cache: PageCache, 39 | updated_pages: impl IntoIterator + Send + 'static, 40 | ) -> anyhow::Result<()> { 41 | let sync_seqn = self.sync_seqn + 1; 42 | 43 | let mut bitbox_sync = bitbox.sync(); 44 | let mut beatree_sync = beatree.sync(); 45 | let mut rollback_sync = rollback.map(|rollback| rollback.sync()); 46 | 47 | bitbox_sync.begin_sync(sync_seqn, page_cache, updated_pages); 48 | beatree_sync.begin_sync(value_tx); 49 | let (rollback_start_live, rollback_end_live) = match rollback_sync { 50 | Some(ref mut rollback) => rollback.begin_sync(), 51 | None => (0, 0), 52 | }; 53 | 54 | bitbox_sync.wait_pre_meta()?; 55 | let beatree_meta_wd = beatree_sync.wait_pre_meta()?; 56 | 57 | if let Some(PanicOnSyncMode::PostWal) = self.panic_on_sync { 58 | panic!("panic_on_sync is true (post-wal)") 59 | } 60 | 61 | let new_meta = Meta { 62 | magic: meta::MAGIC, 63 | version: meta::VERSION, 64 | ln_freelist_pn: beatree_meta_wd.ln_freelist_pn, 65 | ln_bump: beatree_meta_wd.ln_bump, 66 | bbn_freelist_pn: beatree_meta_wd.bbn_freelist_pn, 67 | bbn_bump: beatree_meta_wd.bbn_bump, 68 | sync_seqn, 69 | bitbox_num_pages: self.bitbox_num_pages, 70 | bitbox_seed: self.bitbox_seed, 71 | rollback_start_live, 72 | rollback_end_live, 73 | }; 74 | Meta::write(&shared.io_pool.page_pool(), &shared.meta_fd, &new_meta)?; 75 | self.sync_seqn += 1; 76 | 77 | if let Some(PanicOnSyncMode::PostMeta) = self.panic_on_sync { 78 | panic!("panic_on_sync is true (post-meta)"); 79 | } 80 | 81 | if let Some(ref mut rollback) = rollback_sync { 82 | rollback.post_meta(); 83 | } 84 | 85 | bitbox_sync.post_meta(shared.io_pool.make_handle())?; 86 | beatree_sync.post_meta(); 87 | 88 | if let Some(ref rollback) = rollback_sync { 89 | rollback.wait_post_meta()?; 90 | } 91 | Ok(()) 92 | } 93 | } 94 | -------------------------------------------------------------------------------- /nomt/src/sys/linux.rs: -------------------------------------------------------------------------------- 1 | //! Linux-specific code. 2 | 3 | use super::unix::cvt_r; 4 | use std::fs::File; 5 | use std::os::fd::AsRawFd; 6 | 7 | /// Returns an instance of `FsCheck` for the given file. 8 | pub fn fs_check(file: &File) -> std::io::Result { 9 | unsafe { 10 | // SAFETY: unsafe because ffi call. This should be IO-safe because the file is passed 11 | // by reference. 
This should be memory-safe because the `statfs` struct is 12 | // zeroed and the `f_type` field should be set by the ffi call. 13 | let mut stat: libc::statfs = std::mem::zeroed(); 14 | cvt_r(|| libc::fstatfs(file.as_raw_fd(), &mut stat))?; 15 | Ok(FsCheck { stat }) 16 | } 17 | } 18 | 19 | /// A utility struct to get filesystem information at a given path. 20 | pub struct FsCheck { 21 | stat: libc::statfs, 22 | } 23 | 24 | impl FsCheck { 25 | /// Returns true if the filesystem is tmpfs. 26 | pub fn is_tmpfs(&self) -> bool { 27 | self.stat.f_type == libc::TMPFS_MAGIC 28 | } 29 | } 30 | 31 | /// fallocate changes the size of the file to the given length if it's less than the current size. 32 | /// If the file is larger than the given length, the file is not truncated. 33 | /// 34 | /// Doesn't work on tmpfs. 35 | pub fn falloc_zero_file(file: &File, len: u64) -> std::io::Result<()> { 36 | cvt_r(|| unsafe { 37 | // SAFETY: unsafe because ffi call. This should be IO-safe because the file is passed 38 | // by reference. 39 | libc::fallocate( 40 | file.as_raw_fd(), 41 | libc::FALLOC_FL_ZERO_RANGE, 42 | 0 as _, 43 | len as _, 44 | ) 45 | }) 46 | .map(drop) 47 | } 48 | -------------------------------------------------------------------------------- /nomt/src/sys/macos.rs: -------------------------------------------------------------------------------- 1 | //! macOS-specific code. 2 | -------------------------------------------------------------------------------- /nomt/src/sys/mod.rs: -------------------------------------------------------------------------------- 1 | //! Platform-specific code. 2 | //! 3 | //! At the moment we only target Linux and macOS. 4 | 5 | cfg_if::cfg_if! { 6 | if #[cfg(target_os = "linux")] { 7 | pub mod linux; 8 | pub mod unix; 9 | } else if #[cfg(target_os = "macos")] { 10 | pub mod macos; 11 | pub mod unix; 12 | } 13 | } 14 | -------------------------------------------------------------------------------- /nomt/src/sys/unix.rs: -------------------------------------------------------------------------------- 1 | //! Common Unix definitions. 2 | 3 | use std::{fs::File, os::fd::AsRawFd as _}; 4 | 5 | pub fn try_lock_exclusive(file: &File) -> std::io::Result<()> { 6 | cvt_r(|| unsafe { libc::flock(file.as_raw_fd(), libc::LOCK_EX | libc::LOCK_NB) }).map(drop) 7 | } 8 | 9 | pub fn unlock(file: &File) -> std::io::Result<()> { 10 | unsafe { cvt_r(|| libc::flock(file.as_raw_fd(), libc::LOCK_UN)).map(drop) } 11 | } 12 | 13 | pub(super) fn cvt_r(mut f: F) -> std::io::Result 14 | where 15 | F: FnMut() -> i32, 16 | { 17 | fn cvt(res: i32) -> std::io::Result { 18 | if res == -1 { 19 | Err(std::io::Error::last_os_error()) 20 | } else { 21 | Ok(res) 22 | } 23 | } 24 | 25 | loop { 26 | match cvt(f()) { 27 | Err(ref e) if e.kind() == std::io::ErrorKind::Interrupted => (), 28 | other => break other, 29 | } 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /nomt/src/task.rs: -------------------------------------------------------------------------------- 1 | pub type TaskResult = std::thread::Result; 2 | 3 | /// Spawn the given task within the given ThreadPool. 4 | /// Use the provided Sender to send the result of the task execution. 5 | /// 6 | /// The result will contain the effective result or the payload 7 | /// of the panic that occurred. 
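///
/// A minimal pairing with [`join_task`] (illustrative; `expensive_work` is a
/// placeholder):
///
/// ```ignore
/// let (tx, rx) = crossbeam_channel::bounded(1);
/// spawn_task(&thread_pool, || expensive_work(), tx);
/// // ... other work on the current thread ...
/// let result = join_task(&rx); // re-raises the panic if the task panicked
/// ```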
8 | pub fn spawn_task( 9 | thread_pool: &threadpool::ThreadPool, 10 | task: F, 11 | tx: crossbeam_channel::Sender>, 12 | ) where 13 | R: Send + 'static, 14 | F: FnOnce() -> R + Send + 'static, 15 | { 16 | thread_pool.execute(move || { 17 | let res = std::panic::catch_unwind(std::panic::AssertUnwindSafe(|| task())); 18 | let _ = tx.send(res); 19 | }); 20 | } 21 | 22 | /// Blocks waiting for completion of the task spawned with [`spawn_task`]. 23 | /// It requires the receiver associated to the sender used to spawn the task. 24 | /// 25 | /// Panics if the sender is dropped. 26 | pub fn join_task(receiver: &crossbeam_channel::Receiver>) -> R 27 | where 28 | R: Send + 'static, 29 | { 30 | // UNWRAP: The sender is not expected to be dropped by the spawned task. 31 | let res = receiver.recv().unwrap(); 32 | match res { 33 | Ok(res) => res, 34 | Err(err_payload) => std::panic::resume_unwind(err_payload), 35 | } 36 | } 37 | -------------------------------------------------------------------------------- /nomt/tests/add_remove.rs: -------------------------------------------------------------------------------- 1 | mod common; 2 | 3 | use common::Test; 4 | use hex_literal::hex; 5 | use nomt::trie::Node; 6 | 7 | #[test] 8 | fn add_remove_1000() { 9 | let mut accounts = 0; 10 | let mut t = Test::new("add_remove"); 11 | 12 | let expected_roots = [ 13 | hex!("0000000000000000000000000000000000000000000000000000000000000000"), 14 | hex!("4a7a6fe118037086a49ff10484f4d80b0a9f31f1060eeb1c9f0162634604b0d9"), 15 | hex!("7d5b013105d7b835225256f2233a458e1a158a53d20e0d3834886df89a26c27b"), 16 | hex!("1a290e07bcacfb58ddcd0b9da348c740ca1bf87b05ed96752a1503ed7c187b69"), 17 | hex!("5e9abfee6d927b084fed3e1306bbe65f0880d0b7de12522c38813014927f1336"), 18 | hex!("57b39e06b2ee98dccd882033eb4136f5376699128b421c83bdc7c6ca96168938"), 19 | hex!("7fd75809ef0e2133102eb5e31e47cb577149dcaebb42cddeb2fd6754256b365f"), 20 | hex!("7c00cb11ec8262385078613e7b7977e50b0751f8cb2384fdccc048eea02acb63"), 21 | hex!("516d6911c3b0a36c9227922ca0273a4aee44886201bd186f7ee7e538a769eaa5"), 22 | hex!("381b24719ff91b13d36cf0dd7622f391f4a461452ed7547a46a992ee4a4025aa"), 23 | hex!("207793e2ce76c1feb68c7259f883229f985706c8cc2fcf99f481b622a54ba375"), 24 | ]; 25 | 26 | let mut root = Node::default(); 27 | for i in 0..10 { 28 | let _ = t.read_id(0); 29 | for _ in 0..100 { 30 | common::set_balance(&mut t, accounts, 1000); 31 | accounts += 1; 32 | } 33 | { 34 | root = t.commit().0.into_inner(); 35 | } 36 | 37 | assert_eq!(root, common::expected_root(accounts)); 38 | assert_eq!(root, expected_roots[i + 1]); 39 | } 40 | 41 | assert_eq!(root, expected_roots[10]); 42 | 43 | for i in 0..10 { 44 | for _ in 0..100 { 45 | accounts -= 1; 46 | common::kill(&mut t, accounts); 47 | } 48 | { 49 | root = t.commit().0.into_inner(); 50 | } 51 | 52 | assert_eq!(root, common::expected_root(accounts)); 53 | assert_eq!(root, expected_roots[10 - i - 1]); 54 | } 55 | } 56 | -------------------------------------------------------------------------------- /nomt/tests/compute_root.rs: -------------------------------------------------------------------------------- 1 | mod common; 2 | 3 | use common::Test; 4 | use nomt::{hasher::Blake3Hasher, trie::NodeKind}; 5 | 6 | #[test] 7 | fn root_on_empty_db() { 8 | let t = Test::new("compute_root_empty"); 9 | let root = t.root(); 10 | assert_eq!( 11 | NodeKind::of::(&root.into_inner()), 12 | NodeKind::Terminator 13 | ); 14 | } 15 | 16 | #[test] 17 | fn root_on_leaf() { 18 | { 19 | let mut t = Test::new("compute_root_leaf"); 20 | t.write([1; 
32], Some(vec![1, 2, 3])); 21 | t.commit(); 22 | } 23 | 24 | let t = Test::new_with_params("compute_root_leaf", 1, 1, None, false); 25 | let root = t.root(); 26 | assert_eq!( 27 | NodeKind::of::(&root.into_inner()), 28 | NodeKind::Leaf 29 | ); 30 | } 31 | 32 | #[test] 33 | fn root_on_internal() { 34 | { 35 | let mut t = Test::new("compute_root_internal"); 36 | t.write([0; 32], Some(vec![1, 2, 3])); 37 | t.write([1; 32], Some(vec![1, 2, 3])); 38 | t.commit(); 39 | } 40 | 41 | let t = Test::new_with_params("compute_root_internal", 1, 1, None, false); 42 | let root = t.root(); 43 | assert_eq!( 44 | NodeKind::of::(&root.into_inner()), 45 | NodeKind::Internal 46 | ); 47 | } 48 | -------------------------------------------------------------------------------- /nomt/tests/exclusive_dir.rs: -------------------------------------------------------------------------------- 1 | //! Tests the directory lock behavior. 2 | 3 | use std::path::PathBuf; 4 | 5 | use nomt::{hasher::Blake3Hasher, Nomt, Options}; 6 | 7 | fn setup_nomt(path: &str, should_clean_up: bool) -> anyhow::Result> { 8 | let path = { 9 | let mut p = PathBuf::from("test"); 10 | p.push(path); 11 | p 12 | }; 13 | if should_clean_up && path.exists() { 14 | std::fs::remove_dir_all(&path)?; 15 | } 16 | let mut o = Options::new(); 17 | o.path(path); 18 | o.bitbox_seed([0; 16]); 19 | Nomt::open(o) 20 | } 21 | 22 | #[test] 23 | fn smoke() { 24 | let _nomt = setup_nomt("smoke", true).unwrap(); 25 | } 26 | 27 | #[test] 28 | fn dir_lock() { 29 | let _nomt_1 = setup_nomt("dir_lock", true).unwrap(); 30 | let nomt_2 = setup_nomt("dir_lock", false); 31 | assert!(matches!(nomt_2, Err(e) if e.to_string().contains("Resource temporarily unavailable"))); 32 | } 33 | 34 | #[test] 35 | fn dir_unlock() { 36 | let nomt_1 = setup_nomt("dir_unlock", true).unwrap(); 37 | drop(nomt_1); 38 | let _nomt_2 = setup_nomt("dir_unlock", false).unwrap(); 39 | } 40 | -------------------------------------------------------------------------------- /nomt/tests/extend_range_protocol.rs: -------------------------------------------------------------------------------- 1 | mod common; 2 | use common::Test; 3 | use std::path::Path; 4 | 5 | // nomt::beatree::branch::LEAF_NODE_BODY_SIZE is 6 | // expected to be 4096 and thus the merge threshold is 2047. 7 | // 8 | // This parameter makes it possible to define the following vector of 9 | // keys and values whose size, when inserted into the database, will result 10 | // in the expected set of leaves. Each line adheres to the half full 11 | // requirement, and the first element of the next row does not fit 12 | // in the previous leaf, requiring a new one. The last row does not 13 | // need to meet the half full requirement, as it may be the rightmost leaf. 14 | #[rustfmt::skip] 15 | const KEYS_AND_VALUE_SIZES: [(u8, usize); 16] =[ 16 | // leaf 1 17 | (1, 1100), (2, 1000), (3, 1000), 18 | // leaf 2 19 | (4, 900), (5, 900), (7, 900), (8, 900), 20 | // leaf 3 21 | (10, 1200), (11, 1100), (13, 700), 22 | // leaf 4 23 | (15, 1300), (16, 1100), (17, 700), 24 | // leaf 5 25 | (18, 1100), (19, 1000), (20, 500), 26 | ]; 27 | 28 | // 2 update workers will be used and the first half of `to_delete` items 29 | // which fall under the same set of leaves are assigned to the first worker 30 | // and all the remaining keys to the next worker. 
This makes possible 31 | // to expect the type of communication between the two workers 32 | fn insert_delete_and_read(name: impl AsRef, to_delete: Vec) { 33 | let mut t = Test::new_with_params(name, 2, 64_000, None, true); 34 | 35 | // insert values 36 | for (k, value_size) in KEYS_AND_VALUE_SIZES.clone() { 37 | t.write(key(k), Some(vec![k; value_size])); 38 | } 39 | t.commit(); 40 | 41 | // delete values 42 | for k in to_delete.clone() { 43 | t.write(key(k), None); 44 | } 45 | t.commit(); 46 | 47 | // read values 48 | for (k, value_size) in KEYS_AND_VALUE_SIZES.clone() { 49 | if to_delete.contains(&k) { 50 | let res = t.read(key(k)); 51 | assert_eq!(None, res); 52 | } else { 53 | let value = vec![k; value_size]; 54 | let res = t.read(key(k)); 55 | assert_eq!(Some(value), res); 56 | } 57 | } 58 | } 59 | 60 | fn key(id: u8) -> [u8; 32] { 61 | let mut key = [0; 32]; 62 | key[0] = id; 63 | key 64 | } 65 | 66 | #[test] 67 | fn extend_range_protocol_underfull_to_degenerate_split() { 68 | insert_delete_and_read("underfull_to_degenerate_split", vec![7, 8, 13]) 69 | } 70 | 71 | #[test] 72 | fn extend_range_protocol_final_unchanged_range() { 73 | insert_delete_and_read("final_unchanged_range", vec![7, 8, 10, 11, 13]) 74 | } 75 | 76 | #[test] 77 | fn extend_range_protocol_unchanged_range_to_changed() { 78 | insert_delete_and_read("unchanged_range_to_changed", vec![7, 8, 10, 11, 13, 20]) 79 | } 80 | 81 | #[test] 82 | fn extend_range_protocol_remove_cutoff() { 83 | insert_delete_and_read( 84 | "remove_cutoff", 85 | vec![7, 8, 10, 11, 13, 15, 16, 17, 18, 19, 20], 86 | ); 87 | } 88 | -------------------------------------------------------------------------------- /nomt/tests/fill_and_empty.rs: -------------------------------------------------------------------------------- 1 | mod common; 2 | use common::Test; 3 | use rand::{prelude::SliceRandom, Rng, SeedableRng}; 4 | use std::time::{SystemTime, UNIX_EPOCH}; 5 | 6 | fn seed() -> [u8; 16] { 7 | SystemTime::now() 8 | .duration_since(UNIX_EPOCH) 9 | .expect("no time?") 10 | .as_nanos() 11 | .to_le_bytes()[0..16] 12 | .try_into() 13 | .unwrap() 14 | } 15 | 16 | fn fill_and_empty(seed: [u8; 16], commit_concurrency: usize) { 17 | let mut rng = rand_pcg::Lcg64Xsh32::from_seed(seed); 18 | 19 | let db_size = 1 << 12; 20 | let commit_size = db_size / 16; 21 | 22 | let mut items = std::collections::BTreeSet::new(); 23 | while items.len() < db_size as usize { 24 | items.insert(rand_key(&mut rng)); 25 | } 26 | let mut items: Vec<_> = items.into_iter().collect(); 27 | items.shuffle(&mut rng); 28 | 29 | let mut to_delete: Vec = (0..db_size as usize).collect(); 30 | to_delete.shuffle(&mut rng); 31 | 32 | let mut t = Test::new_with_params( 33 | format!("fill_and_empty_{}", commit_concurrency), // name 34 | commit_concurrency, 35 | 15000, // hashtable_buckets 36 | None, // panic_on_sync 37 | true, // cleanup_dir 38 | ); 39 | 40 | // inserting all the values 41 | let mut to_check = vec![]; 42 | for i in 0..db_size { 43 | let key = items[i]; 44 | let value = vec![i as u8; 400]; 45 | 46 | to_check.push((key, value.clone())); 47 | t.write(key, Some(value)); 48 | 49 | if (i + 1) % commit_size == 0 { 50 | t.commit(); 51 | // check for presence 52 | for (key, value) in to_check.drain(..) 
{ 53 | assert_eq!(t.read(key), Some(value)); 54 | } 55 | } 56 | } 57 | 58 | // deleting all the values in different order 59 | let mut to_check = vec![]; 60 | for i in 0..db_size { 61 | let key = items[to_delete[i]]; 62 | 63 | to_check.push(key); 64 | t.write(key, None); 65 | 66 | if (i + 1) % commit_size == 0 { 67 | t.commit(); 68 | // check for absence 69 | for key in to_check.drain(..) { 70 | assert_eq!(t.read(key), None); 71 | } 72 | } 73 | } 74 | 75 | assert!(t.commit().0.is_empty()); 76 | } 77 | 78 | fn rand_key(rng: &mut impl Rng) -> [u8; 32] { 79 | let mut key = [0; 32]; 80 | rng.fill(&mut key[..]); 81 | key 82 | } 83 | 84 | #[test] 85 | fn fill_and_empty_1_commit_worker() { 86 | let seed = seed(); 87 | let test_result = std::panic::catch_unwind(|| { 88 | fill_and_empty(seed, 1); 89 | }); 90 | if let Err(cause) = test_result { 91 | eprintln!( 92 | "fill_and_empty_1_commit_worker failed with seed: {:?}", 93 | seed 94 | ); 95 | std::panic::resume_unwind(cause); 96 | } 97 | } 98 | 99 | #[test] 100 | fn fill_and_empty_64_commit_worker() { 101 | let seed = seed(); 102 | let test_result = std::panic::catch_unwind(|| { 103 | fill_and_empty(seed, 64); 104 | }); 105 | if let Err(cause) = test_result { 106 | eprintln!( 107 | "fill_and_empty_64_commit_worker failed with seed: {:?}", 108 | seed 109 | ); 110 | std::panic::resume_unwind(cause); 111 | } 112 | } 113 | -------------------------------------------------------------------------------- /nomt/tests/large_values.rs: -------------------------------------------------------------------------------- 1 | mod common; 2 | 3 | use common::Test; 4 | 5 | #[test] 6 | fn large_values() { 7 | let mut t = Test::new("large_values"); 8 | 9 | let large1 = vec![1; 4096 * 128]; 10 | let large2 = vec![2; 4096 * 80 - 1245]; 11 | 12 | t.write_id(0, Some(large1.clone())); 13 | t.write_id(1, Some(large2.clone())); 14 | let _ = t.commit(); 15 | assert_eq!(&*t.read_id(0).unwrap(), &large1); 16 | assert_eq!(&*t.read_id(1).unwrap(), &large2); 17 | t.write_id(1, None); 18 | let _ = t.commit(); 19 | assert_eq!(&*t.read_id(0).unwrap(), &large1); 20 | assert!(t.read_id(1).is_none()); 21 | } 22 | -------------------------------------------------------------------------------- /nomt/tests/last_layer_trie.rs: -------------------------------------------------------------------------------- 1 | mod common; 2 | 3 | use common::Test; 4 | 5 | #[test] 6 | fn last_layer_trie() { 7 | let mut t = Test::new_with_params( 8 | "last_layer_trie", // name 9 | 1, // commit_concurrency 10 | 10_000, // hashtable_buckets 11 | None, // panic_on_sync 12 | true, // cleanup_dir 13 | ); 14 | 15 | let key1 = [170; 32]; 16 | let mut key2 = key1.clone(); 17 | key2[31] = 171; 18 | 19 | // write two leaf nodes at the last layer of the trie 20 | t.write(key1, Some(vec![1; 128])); 21 | t.write(key2, Some(vec![2; 128])); 22 | t.commit(); 23 | assert_eq!(t.read(key1), Some(vec![1; 128])); 24 | assert_eq!(t.read(key2), Some(vec![2; 128])); 25 | 26 | // modify two leaf nodes at the last layer of the trie 27 | t.write(key1, Some(vec![3; 100])); 28 | t.write(key2, Some(vec![4; 100])); 29 | t.commit(); 30 | assert_eq!(t.read(key1), Some(vec![3; 100])); 31 | assert_eq!(t.read(key2), Some(vec![4; 100])); 32 | 33 | // delete two leaf nodes at the last layer of the trie 34 | t.write(key1, None); 35 | t.write(key2, None); 36 | t.commit(); 37 | assert_eq!(t.read(key1), None); 38 | assert_eq!(t.read(key2), None); 39 | } 40 | -------------------------------------------------------------------------------- 
/nomt/tests/overlay.rs: -------------------------------------------------------------------------------- 1 | mod common; 2 | 3 | use common::Test; 4 | 5 | fn expected_root(items: Vec<([u8; 32], Vec)>) -> nomt_core::trie::Node { 6 | nomt_core::update::build_trie::( 7 | 0, 8 | items 9 | .into_iter() 10 | .map(|(k, v)| (k, *blake3::hash(&v).as_bytes())), 11 | |_| {}, 12 | ) 13 | } 14 | 15 | #[test] 16 | fn overlay_multiple_forks() { 17 | let mut test = Test::new("overlay_multiple_forks"); 18 | 19 | let overlay_a = test.update().0; 20 | let overlay_b1 = { 21 | test.start_overlay_session([&overlay_a]); 22 | test.write([1; 32], Some(vec![1, 2, 3])); 23 | test.update().0 24 | }; 25 | let overlay_b2 = { 26 | test.start_overlay_session([&overlay_a]); 27 | test.write([1; 32], Some(vec![4, 5, 6])); 28 | test.update().0 29 | }; 30 | 31 | { 32 | test.start_overlay_session([&overlay_b1, &overlay_a]); 33 | assert_eq!(test.read([1; 32]), Some(vec![1, 2, 3])); 34 | } 35 | 36 | { 37 | test.start_overlay_session([&overlay_b2, &overlay_a]); 38 | assert_eq!(test.read([1; 32]), Some(vec![4, 5, 6])); 39 | } 40 | } 41 | 42 | #[test] 43 | fn overlay_root_calculation() { 44 | let mut test = Test::new("overlay_root_calculation"); 45 | test.write([1; 32], Some(vec![1, 2, 3])); 46 | let overlay_a = test.update().0; 47 | 48 | assert_eq!( 49 | overlay_a.root().into_inner(), 50 | expected_root(vec![([1; 32], vec![1, 2, 3])]), 51 | ); 52 | 53 | test.start_overlay_session([&overlay_a]); 54 | test.write([2; 32], Some(vec![4, 5, 6])); 55 | let overlay_b = test.update().0; 56 | 57 | assert_eq!( 58 | overlay_b.root().into_inner(), 59 | expected_root(vec![([1; 32], vec![1, 2, 3]), ([2; 32], vec![4, 5, 6])]), 60 | ); 61 | 62 | test.start_overlay_session([&overlay_b, &overlay_a]); 63 | test.write([1; 32], Some(vec![7, 8, 9])); 64 | test.write([3; 32], Some(vec![0, 1, 0])); 65 | let overlay_c = test.update().0; 66 | 67 | assert_eq!( 68 | overlay_c.root().into_inner(), 69 | expected_root(vec![ 70 | ([1; 32], vec![7, 8, 9]), 71 | ([2; 32], vec![4, 5, 6]), 72 | ([3; 32], vec![0, 1, 0]) 73 | ]), 74 | ); 75 | } 76 | 77 | #[test] 78 | #[should_panic] 79 | fn overlays_must_be_committed_in_order() { 80 | let mut test = Test::new("overlays_committed_in_order"); 81 | let overlay_a = test.update().0; 82 | test.start_overlay_session([&overlay_a]); 83 | let overlay_b = test.update().0; 84 | 85 | test.commit_overlay(overlay_b); 86 | } 87 | 88 | #[test] 89 | #[should_panic] 90 | fn overlay_competing_committed() { 91 | let mut test = Test::new("overlays_competing_committed"); 92 | let overlay_a = test.update().0; 93 | test.start_overlay_session([&overlay_a]); 94 | let overlay_b1 = test.update().0; 95 | test.start_overlay_session([&overlay_a]); 96 | let overlay_b2 = test.update().0; 97 | 98 | test.commit_overlay(overlay_a); 99 | test.commit_overlay(overlay_b1); 100 | 101 | test.commit_overlay(overlay_b2); 102 | } 103 | 104 | #[test] 105 | fn overlay_commit_in_order_works() { 106 | let mut test = Test::new("overlays_commit_in_order_works"); 107 | let overlay_a = test.update().0; 108 | test.start_overlay_session([&overlay_a]); 109 | let overlay_b = test.update().0; 110 | 111 | test.commit_overlay(overlay_a); 112 | test.commit_overlay(overlay_b); 113 | } 114 | 115 | #[test] 116 | fn overlay_changes_land_on_disk_when_committed() { 117 | { 118 | let mut test = Test::new("overlay_changes_land_on_disk"); 119 | test.write([1; 32], Some(vec![1, 2, 3])); 120 | test.write([2; 32], Some(vec![4, 5, 6])); 121 | test.write([3; 32], Some(vec![7, 8, 9])); 122 | 
123 | let overlay = test.update().0; 124 | test.commit_overlay(overlay); 125 | } 126 | 127 | let mut test = Test::new_with_params( 128 | "overlay_changes_land_on_disk", 129 | /* commit_concurrency */ 1, 130 | /* hashtable_buckets */ 1, 131 | /* panic_on_sync */ None, 132 | /* cleanup_dir */ false, 133 | ); 134 | 135 | assert_eq!(test.read([1; 32]), Some(vec![1, 2, 3])); 136 | assert_eq!(test.read([2; 32]), Some(vec![4, 5, 6])); 137 | assert_eq!(test.read([3; 32]), Some(vec![7, 8, 9])); 138 | } 139 | 140 | #[test] 141 | fn overlay_uncommitted_not_on_disk() { 142 | { 143 | let mut test = Test::new("overlay_uncommitted_not_on_disk"); 144 | test.write([1; 32], Some(vec![1, 2, 3])); 145 | test.write([2; 32], Some(vec![4, 5, 6])); 146 | test.write([3; 32], Some(vec![7, 8, 9])); 147 | 148 | let _overlay = test.update().0; 149 | } 150 | 151 | let mut test = Test::new_with_params( 152 | "overlay_uncommitted_not_on_disk", 153 | /* commit_concurrency */ 1, 154 | /* hashtable_buckets */ 1, 155 | /* panic_on_sync */ None, 156 | /* cleanup_dir */ false, 157 | ); 158 | 159 | assert_eq!(test.read([1; 32]), None); 160 | assert_eq!(test.read([2; 32]), None); 161 | assert_eq!(test.read([3; 32]), None); 162 | } 163 | -------------------------------------------------------------------------------- /nomt/tests/prev_root_check.rs: -------------------------------------------------------------------------------- 1 | use nomt::{hasher::Blake3Hasher, KeyReadWrite, Nomt, Options, SessionParams}; 2 | use std::path::PathBuf; 3 | 4 | /// Setup a NOMT with the given path, rollback enabled, and the given commit concurrency. 5 | /// 6 | /// It's important that tests that run in parallel don't use the same path. 7 | fn setup_nomt(path: &str) -> Nomt { 8 | let path = { 9 | let mut p = PathBuf::from("test"); 10 | p.push(path); 11 | p 12 | }; 13 | if path.exists() { 14 | std::fs::remove_dir_all(&path).unwrap(); 15 | } 16 | let mut o = Options::new(); 17 | o.path(path); 18 | o.commit_concurrency(1); 19 | Nomt::open(o).unwrap() 20 | } 21 | 22 | #[test] 23 | fn test_prev_root_commits() { 24 | let nomt = setup_nomt("prev_root_commits"); 25 | let session1 = nomt.begin_session(SessionParams::default()); 26 | let finished1 = session1 27 | .finish(vec![([1; 32], KeyReadWrite::Write(Some(vec![1, 2, 3])))]) 28 | .unwrap(); 29 | 30 | let session2 = nomt.begin_session(SessionParams::default()); 31 | let finished2 = session2 32 | .finish(vec![([1; 32], KeyReadWrite::Write(Some(vec![1, 2, 3])))]) 33 | .unwrap(); 34 | 35 | finished1.commit(&nomt).unwrap(); 36 | 37 | finished2.commit(&nomt).unwrap_err(); 38 | } 39 | 40 | #[test] 41 | fn test_prev_root_overlay_invalidated() { 42 | let nomt = setup_nomt("prev_root_overlay_invalidated"); 43 | let session1 = nomt.begin_session(SessionParams::default()); 44 | let finished1 = session1 45 | .finish(vec![([1; 32], KeyReadWrite::Write(Some(vec![1, 2, 3])))]) 46 | .unwrap(); 47 | let overlay1 = finished1.into_overlay(); 48 | 49 | let session2 = nomt.begin_session(SessionParams::default()); 50 | let finished2 = session2 51 | .finish(vec![([1; 32], KeyReadWrite::Write(Some(vec![1, 2, 3])))]) 52 | .unwrap(); 53 | 54 | finished2.commit(&nomt).unwrap(); 55 | 56 | overlay1.commit(&nomt).unwrap_err(); 57 | } 58 | 59 | #[test] 60 | fn test_prev_root_overlay_invalidates_session() { 61 | let nomt = setup_nomt("prev_root_overlays"); 62 | let session1 = nomt.begin_session(SessionParams::default()); 63 | let finished1 = session1 64 | .finish(vec![([1; 32], KeyReadWrite::Write(Some(vec![1, 2, 3])))]) 65 | .unwrap(); 
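    // Committing `overlay1` advances the root, so `finished2` (produced against
    // the old root) must fail its prev-root check below.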
66 | let overlay1 = finished1.into_overlay(); 67 | 68 | let session2 = nomt.begin_session(SessionParams::default()); 69 | let finished2 = session2 70 | .finish(vec![([1; 32], KeyReadWrite::Write(Some(vec![1, 2, 3])))]) 71 | .unwrap(); 72 | 73 | overlay1.commit(&nomt).unwrap(); 74 | 75 | finished2.commit(&nomt).unwrap_err(); 76 | } 77 | -------------------------------------------------------------------------------- /nomt/tests/wal.rs: -------------------------------------------------------------------------------- 1 | mod common; 2 | 3 | use common::Test; 4 | use nomt::PanicOnSyncMode; 5 | 6 | #[test] 7 | fn wal_recovery_test_post_meta_swap() { 8 | // Initialize the db with panic on sync equals true. 9 | let mut t = Test::new_with_params( 10 | "wal_add_remove_1000", 11 | 1, // commit_concurrency, 12 | 1000000, // hashtable_buckets, 13 | Some(PanicOnSyncMode::PostMeta), // panic_on_sync 14 | true, // clean 15 | ); 16 | 17 | common::set_balance(&mut t, 0, 1000); 18 | common::set_balance(&mut t, 1, 2000); 19 | common::set_balance(&mut t, 2, 3000); 20 | 21 | let r = std::panic::catch_unwind(std::panic::AssertUnwindSafe(|| { 22 | t.commit(); 23 | })); 24 | assert!(r.is_err()); 25 | drop(t); 26 | 27 | // Re-open the db without cleaning the DB dir and without panic on sync. 28 | let mut t = Test::new_with_params( 29 | "wal_add_remove_1000", 30 | 1, // commit_concurrency, 31 | 1000000, // hashtable_buckets, 32 | None, // panic_on_sync 33 | false, // clean 34 | ); 35 | assert_eq!(common::read_balance(&mut t, 0), Some(1000)); 36 | assert_eq!(common::read_balance(&mut t, 1), Some(2000)); 37 | assert_eq!(common::read_balance(&mut t, 2), Some(3000)); 38 | } 39 | 40 | #[test] 41 | fn wal_recovery_test_pre_meta_swap() { 42 | // Initialize the db with panic on sync equals true. 43 | let mut t = Test::new_with_params( 44 | "wal_pre_meta_swap", 45 | 1, // commit_concurrency, 46 | 1000000, // hashtable_buckets, 47 | Some(PanicOnSyncMode::PostWal), // panic_on_sync 48 | true, // clean 49 | ); 50 | 51 | for i in 0..1000 { 52 | common::set_balance(&mut t, i, 1000); 53 | } 54 | 55 | let r = std::panic::catch_unwind(std::panic::AssertUnwindSafe(|| { 56 | t.commit(); 57 | })); 58 | assert!(r.is_err()); 59 | drop(t); 60 | 61 | // Re-open the db without cleaning the DB dir and without panic on sync. 62 | let mut t = Test::new_with_params( 63 | "wal_pre_meta_swap", 64 | 1, // commit_concurrency, 65 | 1000000, // hashtable_buckets, 66 | None, // panic_on_sync 67 | false, // clean 68 | ); 69 | 70 | // DB should open cleanly and not have any incomplete changes; the WAL is too new and will be 71 | // discarded. 72 | for i in 0..1000 { 73 | assert_eq!(common::read_balance(&mut t, i), None); 74 | } 75 | } 76 | -------------------------------------------------------------------------------- /nomt/tests/witness_check.rs: -------------------------------------------------------------------------------- 1 | mod common; 2 | 3 | use common::Test; 4 | use nomt::{hasher::Blake3Hasher, proof, trie::LeafData}; 5 | 6 | #[test] 7 | fn produced_witness_validity() { 8 | let mut accounts = 0; 9 | let mut t = Test::new("witness_validity"); 10 | 11 | let (prev_root, _) = { 12 | for _ in 0..10 { 13 | common::set_balance(&mut t, accounts, 1000); 14 | accounts += 1; 15 | } 16 | t.commit() 17 | }; 18 | 19 | let (new_root, witness) = { 20 | // read all existing accounts. 21 | for i in 0..accounts { 22 | t.read_id(i); 23 | } 24 | 25 | // read some nonexistent accounts. 
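        // (accounts 100..105 were never created, so these reads will be witnessed
        // as proofs of non-existence)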
26 |         for i in 100..105 {
27 |             t.read_id(i);
28 |         }
29 | 
30 |         // kill half the existing ones.
31 |         for i in 0..5 {
32 |             common::kill(&mut t, i);
33 |         }
34 | 
35 |         // and add 5 more.
36 |         for _ in 0..5 {
37 |             common::set_balance(&mut t, accounts, 1000);
38 |             accounts += 1;
39 |         }
40 |         t.commit()
41 |     };
42 | 
43 |     assert_eq!(witness.operations.reads.len(), 15); // 10 existing + 5 nonexisting
44 |     assert_eq!(witness.operations.writes.len(), 10); // 5 deletes + 5 inserts
45 | 
46 |     let mut updates = Vec::new();
47 |     for (i, witnessed_path) in witness.path_proofs.iter().enumerate() {
48 |         let verified = witnessed_path
49 |             .inner
50 |             .verify::<Blake3Hasher>(&witnessed_path.path.path(), prev_root.into_inner())
51 |             .unwrap();
52 |         for read in witness
53 |             .operations
54 |             .reads
55 |             .iter()
56 |             .skip_while(|r| r.path_index != i)
57 |             .take_while(|r| r.path_index == i)
58 |         {
59 |             match read.value {
60 |                 None => assert!(verified.confirm_nonexistence(&read.key).unwrap()),
61 |                 Some(ref v) => {
62 |                     let leaf = LeafData {
63 |                         key_path: read.key,
64 |                         value_hash: *v,
65 |                     };
66 |                     assert!(verified.confirm_value(&leaf).unwrap());
67 |                 }
68 |             }
69 |         }
70 | 
71 |         let mut write_ops = Vec::new();
72 |         for write in witness
73 |             .operations
74 |             .writes
75 |             .iter()
76 |             .skip_while(|r| r.path_index != i)
77 |             .take_while(|r| r.path_index == i)
78 |         {
79 |             write_ops.push((write.key, write.value.clone()));
80 |         }
81 | 
82 |         if !write_ops.is_empty() {
83 |             updates.push(proof::PathUpdate {
84 |                 inner: verified,
85 |                 ops: write_ops,
86 |             });
87 |         }
88 |     }
89 | 
90 |     assert_eq!(
91 |         proof::verify_update::<Blake3Hasher>(prev_root.into_inner(), &updates).unwrap(),
92 |         new_root.into_inner(),
93 |     );
94 | }
95 | 
96 | #[test]
97 | fn empty_witness() {
98 |     let mut accounts = 0;
99 |     let mut t = Test::new("empty_witness");
100 | 
101 |     let (prev_root, _) = {
102 |         for _ in 0..10 {
103 |             common::set_balance(&mut t, accounts, 1000);
104 |             accounts += 1;
105 |         }
106 |         t.commit()
107 |     };
108 | 
109 |     // Create a commit with no operations performed
110 |     let (new_root, witness) = t.commit();
111 | 
112 |     // The roots should be identical since no changes were made
113 |     assert_eq!(prev_root, new_root);
114 | 
115 |     // The witness should be empty
116 |     assert_eq!(witness.operations.reads.len(), 0);
117 |     assert_eq!(witness.operations.writes.len(), 0);
118 |     assert_eq!(witness.path_proofs.len(), 0);
119 | 
120 |     // Verify that an empty update produces the same root
121 |     let updates: Vec<proof::PathUpdate> = Vec::new();
122 |     assert_eq!(
123 |         proof::verify_update::<Blake3Hasher>(prev_root.into_inner(), &updates).unwrap(),
124 |         new_root.into_inner(),
125 |     );
126 | }
127 | 
128 | #[test]
129 | fn test_verify_update_with_identical_paths() {
130 |     use nomt::{
131 |         hasher::Blake3Hasher,
132 |         proof::{verify_update, PathUpdate},
133 |         trie::ValueHash,
134 |     };
135 | 
136 |     let account0 = 0;
137 | 
138 |     // Create a simple trie, create an update witness.
139 |     let mut t = Test::new("identical_paths_test");
140 |     common::set_balance(&mut t, account0, 1000);
141 |     let (root, _) = t.commit();
142 |     t.read_id(account0);
143 |     let (_, witness) = t.commit();
144 | 
145 |     // Using that witness extract and verify the proof.
146 |     let witnessed_path = &witness.path_proofs[0];
147 |     let verified_proof = witnessed_path
148 |         .inner
149 |         .verify::<Blake3Hasher>(&witnessed_path.path.path(), root.into_inner())
150 |         .unwrap();
151 | 
152 |     // Create two identical PathUpdate objects
153 |     let mut updates = Vec::new();
154 | 
155 |     // First update
156 |     let value1 = ValueHash::default();
157 |     let ops1 = vec![([0; 32], Some(value1))];
158 |     updates.push(PathUpdate {
159 |         inner: verified_proof.clone(),
160 |         ops: ops1,
161 |     });
162 | 
163 |     // Second update with identical path
164 |     let value2 = ValueHash::default();
165 |     let ops2 = vec![([1; 32], Some(value2))];
166 |     updates.push(PathUpdate {
167 |         inner: verified_proof, // Using the same verified proof
168 |         ops: ops2,
169 |     });
170 | 
171 |     // Try to verify the update. We expect an error due to identical paths, because that violates
172 |     // the requirement of ascending keys.
173 |     verify_update::<Blake3Hasher>(root.into_inner(), &updates).unwrap_err();
174 | }
175 | 
--------------------------------------------------------------------------------
/torture/Cargo.toml:
--------------------------------------------------------------------------------
1 | [package]
2 | name = "torture"
3 | version = "0.1.0"
4 | edition = "2021"
5 | 
6 | [dependencies]
7 | nix.workspace = true
8 | libc.workspace = true
9 | anyhow.workspace = true
10 | cfg-if.workspace = true
11 | serde.workspace = true
12 | bincode.workspace = true
13 | nomt = { path = "../nomt" }
14 | tokio.workspace = true
15 | tokio-util.workspace = true
16 | tokio-stream.workspace = true
17 | futures.workspace = true
18 | tempfile.workspace = true
19 | rand.workspace = true
20 | rand_pcg.workspace = true
21 | rand_distr.workspace = true
22 | imbl.workspace = true
23 | tokio-serde.workspace = true
24 | tracing.workspace = true
25 | tracing-subscriber.workspace = true
26 | hex.workspace = true
27 | futures-util.workspace = true
28 | clap.workspace = true
29 | trickfs = { path = "../trickfs" }
30 | which.workspace = true
31 | ruint.workspace = true
32 | 
--------------------------------------------------------------------------------
/torture/src/logging.rs:
--------------------------------------------------------------------------------
1 | use std::io::{self, IsTerminal as _};
2 | use std::path::Path;
3 | 
4 | use tracing::level_filters::LevelFilter;
5 | use tracing::{span, Level};
6 | use tracing_subscriber::fmt::MakeWriter;
7 | use tracing_subscriber::{fmt, EnvFilter};
8 | 
9 | const ENV_NAME_COMMON: &str = "TORTURE_ALL_LOG";
10 | const ENV_NAME_AGENT: &str = "TORTURE_AGENT_LOG";
11 | const ENV_NAME_SUPERVISOR: &str = "TORTURE_SUPERVISOR_LOG";
12 | 
13 | enum Kind {
14 |     Agent,
15 |     Supervisor,
16 | }
17 | 
18 | fn istty() -> bool {
19 |     io::stdout().is_terminal() && io::stderr().is_terminal()
20 | }
21 | 
22 | /// Creates an env filter for the agent or supervisor (depending on the `kind`
23 | /// argument).
24 | ///
25 | /// This function tries to read the most specific environment variable first, then falls back to
26 | /// the common one ([`ENV_NAME_COMMON`]).
27 | fn env_filter(kind: Kind) -> EnvFilter {
28 |     let specific_env_name = match kind {
29 |         Kind::Agent => ENV_NAME_AGENT,
30 |         Kind::Supervisor => ENV_NAME_SUPERVISOR,
31 |     };
32 | 
33 |     return try_parse_env(specific_env_name).unwrap_or_else(|| {
34 |         try_parse_env(ENV_NAME_COMMON).unwrap_or_else(|| {
35 |             EnvFilter::builder()
36 |                 .with_default_directive(LevelFilter::INFO.into())
37 |                 .parse("")
38 |                 .unwrap()
39 |         })
40 |     });
41 | 
42 |     fn try_parse_env(var_name: &str) -> Option<EnvFilter> {
43 |         match std::env::var(var_name) {
44 |             Ok(env) => Some(
45 |                 EnvFilter::builder()
46 |                     .with_default_directive(LevelFilter::INFO.into())
47 |                     .parse(env)
48 |                     .unwrap(),
49 |             ),
50 |             Err(std::env::VarError::NotPresent) => {
51 |                 return None;
52 |             }
53 |             Err(std::env::VarError::NotUnicode(_)) => {
54 |                 panic!("Environment variable {} is not unicode", var_name);
55 |             }
56 |         }
57 |     }
58 | }
59 | 
60 | fn create_subscriber<W>(kind: Kind, writer: W, ansi: bool) -> impl tracing::Subscriber
61 | where
62 |     W: for<'writer> MakeWriter<'writer> + 'static + Sync + Send,
63 | {
64 |     let format = fmt::format()
65 |         .with_level(true)
66 |         .with_target(false)
67 |         .with_thread_ids(false)
68 |         .with_thread_names(false)
69 |         .compact()
70 |         .with_timer(fmt::time::SystemTime::default());
71 | 
72 |     fmt::Subscriber::builder()
73 |         .with_env_filter(env_filter(kind))
74 |         .with_writer(writer)
75 |         .with_ansi(ansi)
76 |         .event_format(format)
77 |         .finish()
78 | }
79 | 
80 | pub fn init_supervisor() {
81 |     let subscriber = create_subscriber(Kind::Supervisor, io::stdout, istty());
82 |     tracing::subscriber::set_global_default(subscriber)
83 |         .expect("Failed to set supervisor subscriber");
84 | }
85 | 
86 | pub fn workload_subscriber(workload_dir: &impl AsRef<Path>) -> impl tracing::Subscriber {
87 |     let log_file = std::fs::File::options()
88 |         .create(true)
89 |         .append(true)
90 |         .open(workload_dir.as_ref().join("log.txt"))
91 |         .expect("Failed to create log file");
92 |     create_subscriber(Kind::Supervisor, log_file, false)
93 | }
94 | 
95 | pub fn init_agent(agent_id: &str, workload_dir: &impl AsRef<Path>) {
96 |     let log_file = std::fs::File::options()
97 |         .create(false)
98 |         .append(true)
99 |         .open(workload_dir.as_ref().join("log.txt"))
100 |         .expect("Log file is expected to be created by the supervisor");
101 |     let subscriber = create_subscriber(Kind::Agent, log_file, false);
102 | 
103 |     // Set the agent global subscriber
104 |     tracing::subscriber::set_global_default(subscriber).expect("Failed to set agent subscriber");
105 | 
106 |     let pid = std::process::id();
107 |     let span = span!(Level::INFO, "agent", agent_id, pid);
108 |     let _enter = span.enter();
109 |     // We intentionally `forget` the guard so the span remains open
110 |     // for the lifetime of the entire agent process if desired.
111 |     std::mem::forget(_enter);
112 | }
113 | 
--------------------------------------------------------------------------------
/torture/src/main.rs:
--------------------------------------------------------------------------------
1 | use anyhow::Result;
2 | use tokio::net::UnixStream;
3 | 
4 | mod agent;
5 | mod logging;
6 | mod message;
7 | mod panic;
8 | mod spawn;
9 | mod supervisor;
10 | 
11 | #[tokio::main]
12 | async fn main() -> Result<()> {
13 |     if let Some(chan) = spawn::am_spawned() {
14 |         let chan = UnixStream::from_std(chan)?;
15 |         agent::run(chan).await?;
16 |     } else {
17 |         supervisor::run().await?;
18 |     }
19 |     Ok(())
20 | }
21 | 
--------------------------------------------------------------------------------
/torture/src/panic.rs:
--------------------------------------------------------------------------------
1 | /// Panics are caught with `std::panic::catch_unwind`, which returns a `std::thread::Result`;
2 | /// the `Err` variant contains a `Box<dyn Any + Send>` error from which it is possible
3 | /// to extract an error message. These utilities help handle those panic messages.
4 | use std::any::Any;
5 | 
6 | /// Attempts to create a `String` with the given context and downcasts
7 | /// the error to look for a message within it. If no message is found,
8 | /// the `String` will contain only the context.
9 | pub fn panic_to_string(context: &str, err: Box<dyn Any + Send>) -> String {
10 |     if let Some(err) = err.downcast_ref::<&str>() {
11 |         return format!("{}: {}", context, err);
12 |     }
13 |     if let Some(err) = err.downcast_ref::<String>() {
14 |         return format!("{}: {}", context, err);
15 |     }
16 |     format!("{} (no message)", context)
17 | }
18 | 
19 | /// Creates an `anyhow::Result::Err(..)` from a context and an error
20 | /// possibly containing a message.
21 | pub fn panic_to_err<T>(context: &str, err: Box<dyn Any + Send>) -> anyhow::Result<T> {
22 |     Err(anyhow::anyhow!("{}", panic_to_string(context, err)))
23 | }
24 | 
--------------------------------------------------------------------------------
/torture/src/spawn.rs:
--------------------------------------------------------------------------------
1 | // A low-level module for spawning a child process and figuring out if we are the parent or the
2 | // child using the same binary.
3 | //
4 | // The parent spawns a child process and passes a socket to it. The socket is passed to the child
5 | // via a predefined file descriptor. The child then uses this file descriptor to communicate with
6 | // the parent.
7 | //
8 | // A process launched from the common binary can check whether it is a child by checking whether
9 | // [`CANARY_SOCKET_FD`] is valid.
10 | //
11 | // The main goal of this module is to tuck away the low-level machinery like working with libc and
12 | // nix into a single place.
13 | 
14 | use anyhow::Result;
15 | use cfg_if::cfg_if;
16 | use std::{
17 |     os::{
18 |         fd::{AsRawFd as _, FromRawFd as _, RawFd},
19 |         unix::net::UnixStream,
20 |     },
21 |     path::PathBuf,
22 |     sync::atomic::{AtomicBool, Ordering},
23 | };
24 | use tokio::process::{Child, Command};
25 | use tracing::trace;
26 | 
27 | /// A special file descriptor that is used to pass a socket to the child process.
28 | ///
29 | /// We pick a high number to avoid conflicts with other file descriptors.
30 | const CANARY_SOCKET_FD: RawFd = 1000;
31 | 
32 | /// Check whether the given file descriptor is valid.
33 | fn is_valid_fd(fd: RawFd) -> bool {
34 |     unsafe { libc::fcntl(fd, libc::F_GETFD) != -1 }
35 | }
36 | 
37 | /// Check whether the file descriptor is set to non-blocking mode.
38 | fn is_nonblocking(fd: RawFd) -> bool {
39 |     unsafe { libc::fcntl(fd, libc::F_GETFL) & libc::O_NONBLOCK == libc::O_NONBLOCK }
40 | }
41 | 
42 | /// Check if the file descriptor corresponds to a Unix domain socket.
43 | /// In our case, we're verifying that the socket type is SOCK_STREAM.
44 | fn is_unix_socket(fd: RawFd) -> bool {
45 |     let mut sock_type: libc::c_int = 0;
46 |     let mut type_len = std::mem::size_of::<libc::c_int>() as libc::socklen_t;
47 |     unsafe {
48 |         libc::getsockopt(
49 |             fd,
50 |             libc::SOL_SOCKET,
51 |             libc::SO_TYPE,
52 |             &mut sock_type as *mut _ as *mut _,
53 |             &mut type_len,
54 |         ) == 0
55 |             && sock_type == libc::SOCK_STREAM
56 |     }
57 | }
58 | 
59 | /// Checks for evidence that this process is a child of a parent process that spawned it.
60 | ///
61 | /// Returns a UnixStream if the process is a child, otherwise returns None.
62 | ///
63 | /// Panics if called more than once.
64 | pub fn am_spawned() -> Option<UnixStream> {
65 |     static CALLED: AtomicBool = AtomicBool::new(false);
66 |     if CALLED.swap(true, Ordering::SeqCst) {
67 |         // This function should not be called more than once to protect against multiple ownership
68 |         // of the file descriptor.
69 |         panic!();
70 |     }
71 | 
72 |     if !is_valid_fd(CANARY_SOCKET_FD) {
73 |         return None;
74 |     }
75 | 
76 |     if !is_unix_socket(CANARY_SOCKET_FD) {
77 |         panic!("not unix socket");
78 |     }
79 | 
80 |     if !is_nonblocking(CANARY_SOCKET_FD) {
81 |         panic!("non blocking");
82 |     }
83 | 
84 |     let stream = unsafe {
85 |         // SAFETY:
86 |         // - The file descriptor is valid (checked above with fcntl)
87 |         // - We verified it's actually a Unix domain socket (checked with getsockopt)
88 |         // - This code can only run once due to the CALLED atomic bool, ensuring we have exclusive
89 |         //   ownership, passing it down into the UnixStream instance.
90 |         // - No other code could have taken ownership as this is the first access (CALLED was false)
91 |         UnixStream::from_raw_fd(CANARY_SOCKET_FD)
92 |     };
93 |     Some(stream)
94 | }
95 | 
96 | pub fn spawn_child(workload_dir_path: PathBuf) -> Result<(Child, UnixStream)> {
97 |     let (sock1, sock2) = UnixStream::pair()?;
98 | 
99 |     // Those sockets are going to be used in tokio and as such they should both be set to
100 |     // non-blocking mode.
101 |     sock1.set_nonblocking(true)?;
102 |     sock2.set_nonblocking(true)?;
103 | 
104 |     let child = spawn_child_with_sock(sock2.as_raw_fd(), workload_dir_path)?;
105 |     drop(sock2); // Close the parent's copy of the child's end of the socket.
106 | 
107 |     Ok((child, sock1))
108 | }
109 | 
110 | fn spawn_child_with_sock(socket_fd: RawFd, workload_dir_path: PathBuf) -> Result<Child> {
111 |     trace!(?socket_fd, "Spawning child process");
112 | 
113 |     // Prepare argv for the child process.
114 |     //
115 |     // Contains only the program binary path and a null terminator.
116 |     cfg_if! {
117 |         if #[cfg(target_os = "linux")] {
118 |             // Nothing beats the simplicity of /proc/self/exe on Linux.
119 |             let program = std::ffi::OsString::from("/proc/self/exe");
120 |         } else {
121 |             let program = std::env::current_exe()?;
122 |         }
123 |     }
124 | 
125 |     let out_file = std::fs::File::options()
126 |         .create(false)
127 |         .append(true)
128 |         .open(workload_dir_path.join("log.txt"))
129 |         .expect("Log file is expected to be created by the supervisor");
130 |     let mut cmd = Command::new(program);
131 |     cmd.stdout(out_file.try_clone().unwrap());
132 |     cmd.stderr(out_file);
133 |     // Override the PGID of the spawned process. The motivation for this is ^C handling. To handle
134 |     // ^C the shell will send the SIGINT to all processes in the process group.
We are handling 135 | // SIGINT manually in the supervisor process. 136 | cmd.process_group(0); 137 | unsafe { 138 | cmd.pre_exec(move || { 139 | // Duplicate the socket_fd to the CANARY_SOCKET_FD. 140 | // Close the original socket_fd in the child process. 141 | libc::dup2(socket_fd, CANARY_SOCKET_FD); 142 | libc::close(socket_fd); 143 | Ok(()) 144 | }); 145 | } 146 | let child = cmd.spawn()?; 147 | 148 | let pid = child 149 | .id() 150 | .map(|pid| pid.to_string()) 151 | .unwrap_or_else(|| "".to_string()); 152 | trace!("spawned child process, pid={pid}"); 153 | Ok(child) 154 | } 155 | -------------------------------------------------------------------------------- /torture/src/supervisor/cli.rs: -------------------------------------------------------------------------------- 1 | use clap::{Args, Parser, Subcommand}; 2 | 3 | #[derive(Parser, Debug)] 4 | pub struct Cli { 5 | #[command(subcommand)] 6 | pub command: Commands, 7 | } 8 | 9 | #[derive(Subcommand, Debug)] 10 | pub enum Commands { 11 | /// Execute swarm testing. Multiple workloads will be executed at the same 12 | /// time, enabling and disabling different nomt features. 13 | Swarm(SwarmParams), 14 | /// Execute a single workload given a seed. 15 | Run(RunParams), 16 | } 17 | 18 | #[derive(Clone, Debug, Args)] 19 | pub struct SwarmParams { 20 | /// The maximum number of failures before the supervisor stops. 21 | /// 22 | /// If not provided, the supervisor will stop after the first failure. 23 | #[arg(short, long, default_value_t = 1)] 24 | pub flag_limit: usize, 25 | 26 | /// Folder that will be used as the working directory by the Supervisor. 27 | /// It will contain all workload folders. 28 | #[arg(long = "workdir")] 29 | pub workdir: Option, 30 | 31 | /// The maximum percentage of total disk space that torture will occupy. 32 | #[clap(value_parser=clap::value_parser!(u8).range(1..=100))] 33 | #[arg(long, default_value_t = 70)] 34 | pub max_disk: u8, 35 | 36 | /// The maximum percentage of total memory that torture will occupy. 37 | #[clap(value_parser=clap::value_parser!(u8).range(1..=100))] 38 | #[arg(long, default_value_t = 70)] 39 | pub max_memory: u8, 40 | } 41 | 42 | #[derive(Clone, Debug, Args)] 43 | pub struct RunParams { 44 | /// The 8-byte seed to use for the random number generator. 45 | pub seed: u64, 46 | 47 | /// Amount of disk space in bytes assigned to the workload. [Default: 20GiB] 48 | #[arg(short = 'd', long, default_value_t = 20 * 1024 * 1024 * 1024)] 49 | pub assigned_disk: u64, 50 | 51 | /// Amount of memory in bytes assigned to the workload. [Default: 3GiB] 52 | #[arg(short = 'm' ,long, default_value_t = 3 * 1024 * 1024 * 1024)] 53 | pub assigned_memory: u64, 54 | 55 | /// Folder that will be used as the working directory by the Supervisor. 56 | /// It will contain the folder of the workload that it is being executed. 57 | #[arg(long = "workdir")] 58 | pub workdir: Option, 59 | 60 | /// Check whether the entire state is up to date as expected. 61 | /// 62 | /// This applies after every rollback. 
63 | #[arg(long = "ensure_snapshot", default_value = "false")] 64 | pub ensure_snapshot: bool, 65 | } 66 | -------------------------------------------------------------------------------- /torture/src/supervisor/controller.rs: -------------------------------------------------------------------------------- 1 | use crate::message::{InitOutcome, OpenOutcome}; 2 | 3 | use super::{comms, config::WorkloadConfiguration}; 4 | use anyhow::Result; 5 | use std::{ 6 | path::PathBuf, 7 | sync::atomic::{AtomicBool, AtomicUsize, Ordering}, 8 | }; 9 | use tokio::{net::UnixStream, process::Child}; 10 | 11 | /// A controller is responsible for overseeing a single agent process and handle its lifecycle. 12 | pub struct SpawnedAgentController { 13 | child: Child, 14 | rr: comms::RequestResponse, 15 | torn_down: AtomicBool, 16 | agent_number: usize, 17 | } 18 | 19 | // This is a safe-guard to ensure that the [`SpawnedAgentController::teardown`] is called 20 | // properly. 21 | impl Drop for SpawnedAgentController { 22 | fn drop(&mut self) { 23 | if self.torn_down.load(Ordering::Relaxed) { 24 | // The controller was torn down properly, disarm. 25 | return; 26 | } 27 | if std::thread::panicking() { 28 | // The controller was not torn down properly, but we are panicking. 29 | eprintln!("controller was not torn down properly"); 30 | return; 31 | } 32 | panic!("controller was not torn down properly"); 33 | } 34 | } 35 | 36 | impl SpawnedAgentController { 37 | pub async fn init( 38 | &mut self, 39 | workdir: String, 40 | workload_id: u64, 41 | trickfs: bool, 42 | ) -> Result { 43 | let id = format!("agent-{}-{}", workload_id, self.agent_number); 44 | let response = self 45 | .rr 46 | .send_request(crate::message::ToAgent::Init(crate::message::InitPayload { 47 | id, 48 | workdir, 49 | trickfs, 50 | })) 51 | .await?; 52 | match response { 53 | crate::message::ToSupervisor::InitResponse(outcome) => return Ok(outcome), 54 | _ => { 55 | panic!("expected init, unexpected response: {:?}", response); 56 | } 57 | } 58 | } 59 | 60 | pub async fn open(&self, config: &WorkloadConfiguration) -> Result { 61 | let rollback = if config.is_rollback_enable() { 62 | Some(config.max_rollback_commits) 63 | } else { 64 | None 65 | }; 66 | 67 | let response = self 68 | .rr 69 | .send_request(crate::message::ToAgent::Open(crate::message::OpenPayload { 70 | bitbox_seed: config.bitbox_seed, 71 | rollback, 72 | commit_concurrency: config.commit_concurrency, 73 | io_workers: config.io_workers, 74 | hashtable_buckets: config.hashtable_buckets, 75 | warm_up: config.warm_up, 76 | preallocate_ht: config.preallocate_ht, 77 | page_cache_size: config.page_cache_size, 78 | leaf_cache_size: config.leaf_cache_size, 79 | prepopulate_page_cache: config.prepopulate_page_cache, 80 | page_cache_upper_levels: config.page_cache_upper_levels, 81 | })) 82 | .await?; 83 | match response { 84 | crate::message::ToSupervisor::OpenResponse(outcome) => return Ok(outcome), 85 | _ => { 86 | panic!("expected open, unexpected response: {:?}", response); 87 | } 88 | } 89 | } 90 | 91 | /// Kills the process, shuts down the comms, and cleans up the resources. 92 | /// 93 | /// This returns only when the process is dead and the resources are cleaned up. 94 | /// 95 | /// The controller must be torn down manually. Dropping the controller is disallowed. This is 96 | /// done to control precisely when the agent process is killed. 
97 |     pub async fn teardown(mut self) {
98 |         self.torn_down.store(true, Ordering::Relaxed);
99 |         let _ = self.child.kill().await;
100 |     }
101 | 
102 |     /// Resolves when the agent process exits.
103 |     pub async fn died(&mut self) {
104 |         let _ = self.child.wait().await;
105 |     }
106 | 
107 |     pub fn rr(&self) -> &comms::RequestResponse {
108 |         &self.rr
109 |     }
110 | 
111 |     /// Returns the PID of the agent process.
112 |     ///
113 |     /// Returns `None` if the agent is torn down.
114 |     pub fn pid(&self) -> Option<u32> {
115 |         if self.torn_down.load(Ordering::Relaxed) {
116 |             None
117 |         } else {
118 |             self.child.id()
119 |         }
120 |     }
121 | }
122 | 
123 | /// Spawns an agent process, creating a controller.
124 | ///
125 | /// The controller is placed in the `place` argument. `place` must be `None` when calling this
126 | /// function.
127 | pub async fn spawn_agent_into(
128 |     place: &mut Option<SpawnedAgentController>,
129 |     output_path: PathBuf,
130 | ) -> Result<()> {
131 |     assert!(place.is_none(), "the controller must be empty");
132 | 
133 |     let (child, sock) = crate::spawn::spawn_child(output_path)?;
134 | 
135 |     let stream = UnixStream::from_std(sock)?;
136 | 
137 |     let (rr, task) = comms::run(stream);
138 |     let _ = tokio::spawn(task);
139 | 
140 |     // Assign a unique ID to the agent.
141 |     static AGENT_COUNT: AtomicUsize = AtomicUsize::new(0);
142 |     let agent_number = AGENT_COUNT.fetch_add(1, Ordering::Relaxed);
143 | 
144 |     *place = Some(SpawnedAgentController {
145 |         agent_number,
146 |         child,
147 |         rr,
148 |         torn_down: AtomicBool::new(false),
149 |     });
150 |     Ok(())
151 | }
152 | 
--------------------------------------------------------------------------------
/torture/src/supervisor/pbt.rs:
--------------------------------------------------------------------------------
1 | //! Collection of process backtraces.
2 | //!
3 | //! This uses the grug-brain developer approach: just invoke LLDB or GDB to get the backtrace.
4 | 
5 | use futures::future::join3;
6 | use std::{path::Path, time::Duration};
7 | use tokio::{
8 |     fs,
9 |     io::{AsyncRead, AsyncReadExt as _},
10 |     process::Command,
11 |     time::timeout,
12 | };
13 | use which::which;
14 | 
15 | pub async fn collect_process_backtrace(filename: &Path, pid: u32) -> anyhow::Result<()> {
16 |     // Determine which debugger tool to use.
17 |     let command_str = if which("lldb").is_ok() {
18 |         lldb(pid)
19 |     } else if which("gdb").is_ok() {
20 |         gdb(pid)
21 |     } else {
22 |         anyhow::bail!("no lldb or gdb in PATH")
23 |     };
24 | 
25 |     // Run the debugger command using a shell.
26 |     // Spawning it via `sh -c` gives us a Child handle to wait on and kill.
27 |     let mut child = Command::new("sh")
28 |         .arg("-c")
29 |         .arg(&command_str)
30 |         .stdout(std::process::Stdio::piped())
31 |         .stderr(std::process::Stdio::piped())
32 |         .spawn()?;
33 | 
34 |     let mut stdout_pipe = child.stdout.take().expect("stdout pipe");
35 |     let mut stderr_pipe = child.stderr.take().expect("stderr pipe");
36 | 
37 |     async fn read_pipe(pipe: &mut (impl AsyncRead + Unpin)) -> anyhow::Result<String> {
38 |         let mut reader = tokio::io::BufReader::new(pipe);
39 |         let mut buf = Vec::new();
40 |         reader.read_to_end(&mut buf).await?;
41 |         let stdout = String::from_utf8(buf)?;
42 |         Ok(stdout)
43 |     }
44 | 
45 |     let stdout_fut = read_pipe(&mut stdout_pipe);
46 |     let stderr_fut = read_pipe(&mut stderr_pipe);
47 | 
48 |     let (exit_code, stdout, stderr) = match timeout(
49 |         Duration::from_secs(5),
50 |         join3(child.wait(), stdout_fut, stderr_fut),
51 |     )
52 |     .await
53 |     {
54 |         Ok(v) => v,
55 |         Err(_) => {
56 |             // Timed out.
57 |             //
58 |             // Make a best-effort attempt at killing the child process.
59 |             //
60 |             // FIXME: Ideally we kill not just the child process but the entire process group.
61 |             tokio::spawn(async move { child.kill().await });
62 |             anyhow::bail!("Debugger command timed out after 5 seconds");
63 |         }
64 |     };
65 | 
66 |     let exit_code = exit_code?;
67 |     let stderr = stderr?;
68 |     let stdout = stdout?;
69 | 
70 |     if !exit_code.success() {
71 |         anyhow::bail!("command '{}' failed: {}", command_str, stderr);
72 |     }
73 | 
74 |     // Write the backtrace into the file specified by filename.
75 |     fs::write(&filename, &stdout).await?;
76 | 
77 |     Ok(())
78 | }
79 | 
80 | /// Generate the lldb command for obtaining the backtrace.
81 | fn lldb(pid: u32) -> String {
82 |     format!(
83 |         "lldb -p {} -o \"thread backtrace all\" -o \"detach\" -o \"quit\"",
84 |         pid
85 |     )
86 | }
87 | 
88 | /// Generate the gdb command for obtaining the backtrace.
89 | fn gdb(pid: u32) -> String {
90 |     format!(
91 |         "gdb -p {} -batch -ex \"thread apply all bt\" -ex \"detach\" -ex \"quit\"",
92 |         pid
93 |     )
94 | }
95 | 
--------------------------------------------------------------------------------
/torture/src/supervisor/swarm.rs:
--------------------------------------------------------------------------------
1 | use rand::Rng;
2 | 
3 | pub enum SwarmFeatures {
4 |     /// Toggle on and off trickfs returning ENOSPC.
5 |     ///
6 |     /// Will be used only when the assigned memory is smaller than
7 |     /// `TRICKFS_MEMORY_THRESHOLD`.
8 |     TrickfsENOSPC,
9 |     /// Toggle on and off trickfs injecting latencies into every response.
10 |     ///
11 |     /// Will be used only when the assigned memory is smaller than
12 |     /// `TRICKFS_MEMORY_THRESHOLD`.
13 |     TrickfsLatencyInjection,
14 |     /// Ensure that the changeset was correctly applied.
15 |     EnsureChangeset,
16 |     /// Randomly sample the state after every crash or rollback to check the
17 |     /// correctness of the state of the database.
18 |     SampleSnapshot,
19 |     /// Whether merkle page fetches should be warmed up while sessions are ongoing.
20 |     WarmUp,
21 |     /// Whether to preallocate the hashtable file.
22 |     PreallocateHt,
23 |     /// Whether each commit should perform a bunch of reads before applying a changeset.
24 |     Read,
25 |     /// Whether rollback should be performed.
26 |     Rollback,
27 |     /// Whether rollback crash should be exercised.
28 |     RollbackCrash,
29 |     /// Whether commit crash should be exercised.
30 |     CommitCrash,
31 |     /// Whether to prepopulate the upper levels of the page cache on startup.
32 |     PrepopulatePageCache,
33 |     /// Whether new keys should be inserted during commits.
34 |     NewKeys,
35 |     /// Whether keys should be deleted during commits.
36 |     DeleteKeys,
37 |     /// Whether keys should be updated during commits.
38 |     UpdateKeys,
39 |     /// Whether inserted values should be overflow ones.
40 |     OverflowValues,
41 | }
42 | 
43 | pub fn new_features_set(rng: &mut rand_pcg::Pcg64) -> Vec<SwarmFeatures> {
44 |     let mut features = vec![
45 |         SwarmFeatures::EnsureChangeset,
46 |         SwarmFeatures::SampleSnapshot,
47 |         SwarmFeatures::WarmUp,
48 |         SwarmFeatures::PreallocateHt,
49 |         SwarmFeatures::Read,
50 |         SwarmFeatures::Rollback,
51 |         SwarmFeatures::RollbackCrash,
52 |         SwarmFeatures::CommitCrash,
53 |         SwarmFeatures::PrepopulatePageCache,
54 |         SwarmFeatures::NewKeys,
55 |         SwarmFeatures::DeleteKeys,
56 |         SwarmFeatures::UpdateKeys,
57 |         SwarmFeatures::OverflowValues,
58 |     ];
59 | 
60 |     // Feature-removal mechanism: toss a coin for almost every feature.
61 |     for idx in (0..features.len()).rev() {
62 |         if rng.gen_bool(0.5) {
63 |             features.remove(idx);
64 |         }
65 |     }
66 | 
67 |     // Trickfs-related features are treated a little differently.
68 |     // Trickfs relies entirely on memory, thus its features get exercised
69 |     // less often; in particular, they follow a biased coin toss with
70 |     // `p = 0.052` being the probability of being added to the set of features.
71 |     //
72 |     // The probability of using Trickfs is 10% (= p*p + 2 * (p * (1-p))).
73 |     let p = 0.052;
74 |     if rng.gen_bool(p) {
75 |         features.push(SwarmFeatures::TrickfsLatencyInjection);
76 |     }
77 |     if rng.gen_bool(p) {
78 |         features.push(SwarmFeatures::TrickfsENOSPC);
79 |     }
80 | 
81 |     features
82 | }
83 | 
--------------------------------------------------------------------------------
/trickfs/Cargo.toml:
--------------------------------------------------------------------------------
1 | [package]
2 | name = "trickfs"
3 | version = "0.1.0"
4 | authors.workspace = true
5 | homepage.workspace = true
6 | repository.workspace = true
7 | edition.workspace = true
8 | license.workspace = true
9 | 
10 | [dependencies]
11 | fuser.workspace = true
12 | libc.workspace = true
13 | log.workspace = true
14 | tempfile.workspace = true
15 | rand.workspace = true
16 | rand_pcg.workspace = true
17 | rand_distr.workspace = true
18 | 
19 | [dev-dependencies]
20 | env_logger.workspace = true
21 | 
--------------------------------------------------------------------------------
/trickfs/README.md:
--------------------------------------------------------------------------------
1 | # trickfs
2 | 
3 | A FUSE filesystem useful for failure injection.
4 | 
5 | # Using trickfs
6 | 
7 | Typically you would not need to run trickfs directly, because it should be used as a dependency
8 | in other projects. However, if you want to test the filesystem, you can do so by running the
9 | following command:
10 | 
11 | ```sh
12 | cargo run --release --bin trickmnt
13 | ```
14 | 
15 | # Building
16 | 
17 | Building the project requires fuse3 and fuse to be available. On Ubuntu, you can install them with
18 | the following commands:
19 | 
20 | ```sh
21 | sudo apt update
22 | sudo apt install libfuse3-dev libfuse-dev
23 | ```
24 | 
25 | On macOS you may need to install macFUSE:
26 | 
27 | ```sh
28 | brew install macfuse
29 | ```
30 | 
--------------------------------------------------------------------------------
/trickfs/src/latency.rs:
--------------------------------------------------------------------------------
1 | use std::{
2 |     collections::VecDeque,
3 |     sync::mpsc::{self, Receiver, RecvTimeoutError, Sender},
4 |     time::{Duration, Instant},
5 | };
6 | 
7 | use rand::SeedableRng;
8 | use rand_distr::Distribution;
9 | 
10 | /// Max possible delay, in micros, used as injected latency.
11 | const MAX_LATENCY_MICROS: u64 = 1000;
12 | type Reply = Box<dyn FnOnce() + Send>;
13 | 
14 | /// An injector of latencies.
15 | ///
16 | /// This allows replies to be scheduled after a certain delay.
17 | /// Delays are randomly chosen following a Pareto distribution.
18 | /// 80% of the delay will be below 20% of `MAX_LATENCY_MICROS`.
19 | pub struct LatencyInjector {
20 |     rng: rand_pcg::Pcg64,
21 |     distr: rand_distr::Pareto<f64>,
22 |     tx: Sender<(Reply, Duration)>,
23 | }
24 | 
25 | impl LatencyInjector {
26 |     pub fn new(seed: u64) -> Self {
27 |         let (tx, rx) = mpsc::channel();
28 |         let _ = std::thread::spawn(|| scheduler(rx));
29 |         Self {
30 |             rng: rand_pcg::Pcg64::seed_from_u64(seed),
31 |             distr: rand_distr::Pareto::new(1.0, 1.16).unwrap(),
32 |             tx,
33 |         }
34 |     }
35 | 
36 |     pub fn schedule_reply(&mut self, reply: Reply) {
37 |         // Shift and scale, values above 100.0 (0.05%) are clipped to MAX_LATENCY_MICROS.
38 |         let f = f64::min((self.distr.sample(&mut self.rng) - 1.0) / 100.0, 1.0);
39 |         let micros = (f * MAX_LATENCY_MICROS as f64).round() as u64;
40 |         let delay = Duration::from_micros(micros);
41 |         self.tx.send((reply, delay)).unwrap();
42 |     }
43 | }
44 | 
45 | /// Task used to execute every scheduled reply.
46 | fn scheduler(rx: Receiver<(Reply, Duration)>) {
47 |     let mut scheduled: VecDeque<(Reply, Instant)> = VecDeque::new();
48 |     loop {
49 |         let (_, deadline) = match scheduled.front() {
50 |             Some((reply, deadline)) => (reply, deadline),
51 |             None => {
52 |                 // Nothing scheduled, wait for next reply.
53 |                 match rx.recv() {
54 |                     Ok((reply, delay)) => {
55 |                         schedule_new_reply(&mut scheduled, reply, delay);
56 |                     }
57 |                     Err(_) => break,
58 |                 }
59 |                 continue;
60 |             }
61 |         };
62 | 
63 |         // Wait for a new reply to be scheduled or until we reach the deadline
64 |         // of the first reply in the queue.
65 |         let timeout = deadline.saturating_duration_since(std::time::Instant::now());
66 |         match rx.recv_timeout(timeout) {
67 |             Ok((reply, delay)) => schedule_new_reply(&mut scheduled, reply, delay),
68 |             Err(RecvTimeoutError::Timeout) => {
69 |                 let (reply, _) = scheduled.pop_front().unwrap();
70 |                 reply();
71 |             }
72 |             Err(RecvTimeoutError::Disconnected) => break,
73 |         };
74 |     }
75 | 
76 |     // Answer all pending replies.
77 |     for (reply, _) in scheduled {
78 |         reply();
79 |     }
80 | }
81 | 
82 | /// Insert the reply into the scheduled queue, which is kept ordered by deadline.
83 | fn schedule_new_reply(scheduled: &mut VecDeque<(Reply, Instant)>, reply: Reply, delay: Duration) {
84 |     let deadline = std::time::Instant::now() + delay.clone();
85 |     // If two replies happen to have the same deadline, then they will be kept in FIFO order.
86 |     let idx = match scheduled.binary_search_by_key(&deadline, |(_, d)| *d) {
87 |         Ok(idx) => idx + 1,
88 |         Err(idx) => idx,
89 |     };
90 |     scheduled.insert(idx, (reply, deadline));
91 | }
92 | 
--------------------------------------------------------------------------------
/trickfs/trickmnt/Cargo.toml:
--------------------------------------------------------------------------------
1 | [package]
2 | name = "trickmnt"
3 | version = "0.1.0"
4 | authors.workspace = true
5 | homepage.workspace = true
6 | repository.workspace = true
7 | edition.workspace = true
8 | license.workspace = true
9 | 
10 | [dependencies]
11 | trickfs = { path = ".."
} 12 | clap = { version = "4.3.5", features = ["derive"] } 13 | env_logger = "0.11.6" 14 | log = "0.4.22" 15 | anyhow = "1.0.95" 16 | -------------------------------------------------------------------------------- /trickfs/trickmnt/src/main.rs: -------------------------------------------------------------------------------- 1 | use clap::Parser; 2 | 3 | #[derive(Parser, Debug)] 4 | #[command(author, version, about, long_about = None)] 5 | struct Args { 6 | /// Path to the directory where trickfs will be mounted 7 | #[arg(short, long, default_value = "/tmp/trick")] 8 | mountpoint: String, 9 | } 10 | 11 | fn waitline() { 12 | log::info!("press return to stop..."); 13 | let _ = std::io::stdin().read_line(&mut String::new()); 14 | } 15 | 16 | fn main() -> anyhow::Result<()> { 17 | env_logger::builder() 18 | .filter_level(log::LevelFilter::Info) 19 | .init(); 20 | 21 | let args = Args::parse(); 22 | 23 | let handle = trickfs::spawn_trick(args.mountpoint, 0).unwrap(); 24 | waitline(); 25 | drop(handle); 26 | 27 | Ok(()) 28 | } 29 | --------------------------------------------------------------------------------