├── .editorconfig ├── .github ├── actions │ └── install-fuse │ │ └── action.yml └── workflows │ ├── bench.yml │ └── ci.yml ├── .gitignore ├── Cargo.lock ├── Cargo.toml ├── LICENSE-APACHE ├── LICENSE-MIT ├── README.md ├── benchtop ├── Cargo.lock ├── Cargo.toml └── src │ ├── backend.rs │ ├── bench.rs │ ├── cli.rs │ ├── custom_workload.rs │ ├── main.rs │ ├── nomt.rs │ ├── sov_db.rs │ ├── sp_trie.rs │ ├── timer.rs │ ├── transfer_workload.rs │ └── workload.rs ├── core ├── Cargo.toml └── src │ ├── hasher.rs │ ├── lib.rs │ ├── page.rs │ ├── page_id.rs │ ├── proof │ ├── mod.rs │ ├── multi_proof.rs │ └── path_proof.rs │ ├── trie.rs │ ├── trie_pos.rs │ ├── update.rs │ └── witness.rs ├── docs ├── CONTRIBUTING.md ├── images │ ├── binary_merkle_patricia_tree.png │ ├── nomt_number_rule.png │ ├── nomt_pages.jpg │ └── nomt_put.png └── nomt_specification.md ├── examples ├── commit_batch │ ├── Cargo.toml │ └── src │ │ ├── lib.rs │ │ └── main.rs ├── read_value │ ├── Cargo.toml │ └── src │ │ └── main.rs └── witness_verification │ ├── Cargo.toml │ └── src │ └── main.rs ├── fuzz ├── .gitignore ├── Cargo.toml └── fuzz_targets │ ├── api_surface.rs │ ├── bitwise_memcpy.rs │ ├── common │ └── mod.rs │ ├── prefix_len.rs │ ├── reconstruct_key.rs │ ├── separate.rs │ └── separator_len.rs ├── nomt ├── Cargo.toml ├── benches │ └── beatree.rs ├── src │ ├── beatree │ │ ├── README.md │ │ ├── allocator │ │ │ ├── free_list.rs │ │ │ └── mod.rs │ │ ├── benches.rs │ │ ├── branch │ │ │ ├── mod.rs │ │ │ └── node.rs │ │ ├── index.rs │ │ ├── iterator.rs │ │ ├── leaf │ │ │ ├── mod.rs │ │ │ └── node.rs │ │ ├── leaf_cache.rs │ │ ├── mod.rs │ │ ├── ops │ │ │ ├── bit_ops.rs │ │ │ ├── mod.rs │ │ │ ├── overflow.rs │ │ │ ├── reconstruction.rs │ │ │ └── update │ │ │ │ ├── branch_ops.rs │ │ │ │ ├── branch_stage.rs │ │ │ │ ├── branch_updater.rs │ │ │ │ ├── extend_range_protocol.rs │ │ │ │ ├── leaf_stage.rs │ │ │ │ ├── leaf_updater.rs │ │ │ │ ├── mod.rs │ │ │ │ └── tests.rs │ │ └── writeout.rs │ ├── bitbox │ │ ├── ht_file.rs │ │ ├── meta_map.rs │ │ ├── mod.rs │ │ ├── wal │ │ │ ├── mod.rs │ │ │ ├── read.rs │ │ │ ├── tests.rs │ │ │ └── write.rs │ │ └── writeout.rs │ ├── io │ │ ├── fsyncer.rs │ │ ├── linux.rs │ │ ├── mod.rs │ │ ├── page_pool.rs │ │ └── unix.rs │ ├── lib.rs │ ├── merkle │ │ ├── cache_prepopulate.rs │ │ ├── mod.rs │ │ ├── page_set.rs │ │ ├── page_walker.rs │ │ ├── seek.rs │ │ └── worker.rs │ ├── metrics.rs │ ├── options.rs │ ├── overlay.rs │ ├── page_cache.rs │ ├── page_diff.rs │ ├── page_region.rs │ ├── rollback │ │ ├── delta.rs │ │ ├── mod.rs │ │ ├── reverse_delta_worker.rs │ │ └── tests.rs │ ├── rw_pass_cell │ │ ├── loom_tests.rs │ │ └── mod.rs │ ├── seglog │ │ ├── mod.rs │ │ ├── segment_filename.rs │ │ └── segment_rw.rs │ ├── store │ │ ├── flock.rs │ │ ├── meta.rs │ │ ├── mod.rs │ │ ├── page_loader.rs │ │ └── sync.rs │ ├── sys │ │ ├── linux.rs │ │ ├── macos.rs │ │ ├── mod.rs │ │ └── unix.rs │ └── task.rs └── tests │ ├── add_remove.rs │ ├── common │ └── mod.rs │ ├── compute_root.rs │ ├── exclusive_dir.rs │ ├── extend_range_protocol.rs │ ├── fill_and_empty.rs │ ├── large_values.rs │ ├── last_layer_trie.rs │ ├── overlay.rs │ ├── prev_root_check.rs │ ├── rollback.rs │ ├── wal.rs │ └── witness_check.rs ├── torture ├── Cargo.toml └── src │ ├── agent.rs │ ├── logging.rs │ ├── main.rs │ ├── message.rs │ ├── panic.rs │ ├── spawn.rs │ └── supervisor │ ├── cli.rs │ ├── comms.rs │ ├── config.rs │ ├── controller.rs │ ├── mod.rs │ ├── pbt.rs │ ├── resource.rs │ ├── swarm.rs │ └── workload.rs └── trickfs ├── Cargo.toml ├── README.md ├── src ├── 
latency.rs └── lib.rs └── trickmnt ├── Cargo.toml └── src └── main.rs /.editorconfig: -------------------------------------------------------------------------------- 1 | root = true 2 | [*] 3 | indent_style=space 4 | indent_size=space 5 | tab_width=4 6 | end_of_line=lf 7 | charset=utf-8 8 | trim_trailing_whitespace=true 9 | max_line_length=100 10 | insert_final_newline=true 11 | 12 | [*.yml] 13 | indent_style=space 14 | indent_size=2 15 | tab_width=8 16 | end_of_line=lf 17 | 18 | [*.sh] 19 | indent_style=space 20 | indent_size=4 21 | tab_width=8 22 | end_of_line=lf 23 | 24 | [*.json] 25 | indent_style=space 26 | indent_size=2 27 | tab_width=8 28 | end_of_line=lf 29 | 30 | -------------------------------------------------------------------------------- /.github/actions/install-fuse/action.yml: -------------------------------------------------------------------------------- 1 | name: Install Ubuntu Dependencies 2 | description: "Installs dependencies on Ubuntu" 3 | 4 | runs: 5 | using: "composite" 6 | steps: 7 | - name: Update apt-get 8 | shell: bash 9 | run: sudo apt-get update 10 | 11 | - name: Install FUSE libraries 12 | shell: bash 13 | run: sudo apt-get install -y libfuse3-dev libfuse-dev 14 | 15 | - name: Allow non-root users to mount FUSE filesystems 16 | shell: bash 17 | run: echo "user_allow_other" | sudo tee -a /etc/fuse.conf 18 | -------------------------------------------------------------------------------- /.github/workflows/bench.yml: -------------------------------------------------------------------------------- 1 | name: Benchtop 2 | 3 | on: 4 | push: 5 | branches: 6 | - master 7 | pull_request: 8 | 9 | env: 10 | CARGO_TERM_COLOR: always 11 | 12 | jobs: 13 | bench: 14 | name: NOMT - run benchtop 15 | runs-on: ubuntu-latest 16 | env: 17 | SIZE: 22 18 | BUCKETS: 4000000 19 | RUST_BACKTRACE: 1 20 | steps: 21 | - name: Free Disk Space (Ubuntu) 22 | uses: jlumbroso/free-disk-space@main 23 | with: 24 | tool-cache: false 25 | android: true 26 | dotnet: true 27 | haskell: true 28 | large-packages: true 29 | docker-images: true 30 | swap-storage: true 31 | - uses: actions/checkout@v4 32 | - run: | 33 | # Install required dependencies 34 | sudo apt-get update 35 | sudo apt-get install -y libclang-dev 36 | - run: df -h / 37 | - run: | 38 | # First build the binary 39 | cargo build --release --verbose --manifest-path=benchtop/Cargo.toml 40 | 41 | # Verify binary exists before proceeding 42 | if [ ! -f "benchtop/target/release/benchtop" ]; then 43 | echo "Binary not found at benchtop/target/release/benchtop" 44 | exit 1 45 | fi 46 | 47 | # Create directories first to avoid potential issues 48 | mkdir -p /tmp 49 | 50 | # Save our binary 51 | cp benchtop/target/release/benchtop /tmp/benchtop 52 | 53 | # Verify copy succeeded 54 | if [ ! -f "/tmp/benchtop" ]; then 55 | echo "Failed to copy binary to /tmp" 56 | exit 1 57 | fi 58 | 59 | # Now safe to clean up 60 | cargo clean 61 | rm -rf ~/.cargo/registry 62 | rm -rf ~/.cargo/git 63 | rm -rf ~/.rustup 64 | 65 | # Create target directory after cleanup 66 | mkdir -p target/release 67 | 68 | # Move binary to final location 69 | mv /tmp/benchtop target/release/benchtop 70 | 71 | # Final verification 72 | if [ ! -f "target/release/benchtop" ] || [ ! 
-x "target/release/benchtop" ]; then 73 | echo "Final binary is missing or not executable" 74 | exit 1 75 | fi 76 | 77 | # Make absolutely sure it's executable 78 | chmod +x target/release/benchtop 79 | 80 | - run: >- 81 | ./target/release/benchtop init 82 | -b nomt 83 | -c $SIZE 84 | -w transfer 85 | --buckets $BUCKETS 86 | - run: >- 87 | ./target/release/benchtop run 88 | -w transfer 89 | -b nomt 90 | -s 10000 91 | -c $SIZE 92 | --time-limit 30s 93 | --workload-concurrency 6 94 | -------------------------------------------------------------------------------- /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | name: Build and Test 2 | 3 | on: 4 | push: 5 | branches: 6 | - master 7 | pull_request: 8 | 9 | env: 10 | CARGO_TERM_COLOR: always 11 | 12 | jobs: 13 | nomt_test: 14 | name: NOMT - test 15 | runs-on: ubuntu-latest 16 | env: 17 | # Avoid shrinking the inputs when an error is found in the leaf/branch stage tests. 18 | NO_STAGES_SHRINKING: true 19 | steps: 20 | - uses: actions/checkout@v4 21 | - uses: ./.github/actions/install-fuse 22 | - uses: dtolnay/rust-toolchain@stable 23 | - run: cargo build --verbose --workspace --locked 24 | - run: cargo test --verbose --workspace 25 | benchtop_check: 26 | name: NOMT - check benchtop 27 | runs-on: ubuntu-latest 28 | steps: 29 | - uses: actions/checkout@v4 30 | - uses: ./.github/actions/install-fuse 31 | - uses: dtolnay/rust-toolchain@stable 32 | - run: cargo check --verbose --manifest-path=benchtop/Cargo.toml --locked 33 | loom_rw_pass_cell: 34 | name: NOMT - loom rw_pass_cell 35 | runs-on: ubuntu-latest 36 | steps: 37 | - uses: actions/checkout@v4 38 | - uses: dtolnay/rust-toolchain@stable 39 | - run: RUSTFLAGS="--cfg loom" cargo test -p nomt --release --lib rw_pass_cell 40 | doc: 41 | name: NOMT - doc 42 | runs-on: ubuntu-latest 43 | env: 44 | # Treat rustdoc warnings as errors. 45 | RUSTDOCFLAGS: "-D warnings" 46 | steps: 47 | - uses: actions/checkout@v4 48 | - uses: ./.github/actions/install-fuse 49 | - uses: dtolnay/rust-toolchain@stable 50 | - run: cargo doc --verbose --workspace --document-private-items 51 | fmt: 52 | name: NOMT - fmt 53 | runs-on: ubuntu-latest 54 | steps: 55 | - uses: actions/checkout@v4 56 | - uses: dtolnay/rust-toolchain@stable 57 | - run: cargo fmt --all --check 58 | - run: cargo fmt --manifest-path=benchtop/Cargo.toml --check 59 | darwin_check: 60 | name: NOMT - check darwin target 61 | runs-on: ubuntu-latest 62 | env: 63 | # This is a workaround for the blake3 crate. 64 | CARGO_FEATURE_PURE: 1 65 | steps: 66 | - uses: actions/checkout@v4 67 | - uses: dtolnay/rust-toolchain@stable 68 | with: 69 | targets: x86_64-apple-darwin 70 | # Build only the NOMT crate. Not everything builds cleanly under this configuration. 
71 | - run: cargo check --verbose -p nomt --locked --target x86_64-apple-darwin 72 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Common ignores 2 | .DS_Store 3 | .idea 4 | .vscode 5 | .envrc 6 | 7 | /target 8 | 9 | # samply / benchtop 10 | profile.json 11 | /test 12 | /nomt/test 13 | 14 | # xtask 15 | /benchtop/regression.toml 16 | /benchtop/sov_db* 17 | /benchtop/nomt_db* 18 | /benchtop/sp_trie_db* 19 | /benchtop/target 20 | -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [workspace] 2 | resolver = "2" 3 | members = [ 4 | "core", 5 | "nomt", 6 | "fuzz", 7 | "torture", 8 | "examples/*", 9 | "trickfs", 10 | "trickfs/trickmnt", 11 | ] 12 | exclude = ["benchtop"] 13 | 14 | [workspace.package] 15 | authors = ["thrum"] 16 | homepage = "https://thrum.dev" 17 | repository = "https://github.com/thrumdev/nomt" 18 | edition = "2021" 19 | license = "MIT/Apache-2.0" 20 | 21 | [workspace.dependencies] 22 | borsh = { version = "1.5.7", default-features = false, features = ["derive"] } 23 | bitvec = { version = "1", default-features = false, features = ["alloc"] } 24 | hex = { version = "0.4.3", default-features = false, features = ["alloc"] } 25 | ruint = { version = "1.12.1", default-features = false } 26 | arrayvec = { version = "0.7", default-features = false } 27 | blake3 = { version = "1.5.1", default-features = false } 28 | sha2 = { version = "0.10.6", default-features = false } 29 | anyhow = { version = "1.0.81", features = ["backtrace"] } 30 | parking_lot = { version = "0.12.3", features = ["arc_lock", "send_guard"] } 31 | threadpool = "1.8.1" 32 | twox-hash = "2.1.0" 33 | fxhash = "0.2.1" 34 | dashmap = "5.5.3" 35 | crossbeam = "0.8.4" 36 | crossbeam-channel = "0.5.13" 37 | slab = "0.4.9" 38 | rand = "0.8.5" 39 | ahash = "0.8.11" 40 | imbl = "3.0.0" 41 | lru = "0.12.3" 42 | libc = "0.2.155" 43 | criterion = { version = "0.3" } 44 | thread_local = "1.1.8" 45 | cfg-if = "1.0.0" 46 | io-uring = "0.6.4" 47 | loom = { version = "0.7", features = ["checkpoint"] } 48 | rand_pcg = "0.3.1" 49 | hex-literal = "0.4" 50 | tempfile = "3.8.1" 51 | lazy_static = "1.5.0" 52 | quickcheck = "1.0.3" 53 | nix = { version = "0.29", features = ["process"] } 54 | serde = { version = "1.0.216", default-features = false, features = ["derive"] } 55 | bincode = "1.3.3" 56 | tokio = { version = "1.42.0", features = ["full"] } 57 | tokio-util = { version = "0.7.13", features = ["codec"] } 58 | tokio-stream = "0.1.17" 59 | futures = "0.3.31" 60 | tokio-serde = { version = "0.9.0", features = ["bincode"] } 61 | tracing = { version = "0.1.41", features = ["attributes"] } 62 | tracing-subscriber = { version = "0.3.19", features = ["env-filter"] } 63 | futures-util = "0.3.31" 64 | clap = { version = "4.5.23", features = ["derive"] } 65 | which = "4" 66 | fuser = { version = "0.15.1", features = ["abi-7-23"] } 67 | log = "0.4.22" 68 | rand_distr = "0.4.3" 69 | env_logger = "0.11.6" 70 | digest = { version = "0.10.7" } 71 | 72 | [profile.release] 73 | debug = 1 74 | debug-assertions = true 75 | -------------------------------------------------------------------------------- /LICENSE-MIT: -------------------------------------------------------------------------------- 1 | Permission is hereby granted, free of charge, to any 2 | person obtaining a copy of this software 
and associated 3 | documentation files (the "Software"), to deal in the 4 | Software without restriction, including without 5 | limitation the rights to use, copy, modify, merge, 6 | publish, distribute, sublicense, and/or sell copies of 7 | the Software, and to permit persons to whom the Software 8 | is furnished to do so, subject to the following 9 | conditions: 10 | 11 | The above copyright notice and this permission notice 12 | shall be included in all copies or substantial portions 13 | of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF 16 | ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED 17 | TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A 18 | PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT 19 | SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 20 | CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 21 | OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR 22 | IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 23 | DEALINGS IN THE SOFTWARE. 24 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## NOMT: Nearly Optimal Merkle Trie 2 | 3 | An implementation of a novel binary Merkle Trie and DB, written in Rust. 4 | 5 | NOMT is an embedded key-value store that maintains a Merklized representation of key-value pairs with a simple key-value API, powering high throughput authenticated commits with billions of key-value pairs on relatively inexpensive hardware. It is largely designed for use in a blockchain setting as a drop-in replacement for RocksDB, MDBX, LevelDB, or ParityDB. 6 | 7 | NOMT is optimized for fast random lookups of values, fast merkle tree updates, and fast writeout. It supports the generation of Merkle multiproofs for large batches of changes. 8 | 9 | NOMT is designed to take advantage of hardware improvements in Solid State Drives (SSDs) using NVMe and Linux's io-uring API for asynchronous I/O. NOMT adequately supports generic Unix as well as macOS for daily development and testing, but primarily targets Linux for performance. The impressive trend in performance and capacity in modern SSDs enables us to build a DB that scales along with the hardware. 10 | 11 | NOMT exposes a many-readers-one-writer API organized around batch transactions referred to as `Session`s. Predictable performance in a metered execution environment is a key goal of NOMT, and therefore only one `Session` may be live at a time. 12 | 13 | ## Project Structure 14 | 15 |
16 | NOMT: Project Root.
17 | ├── benchtop: A benchmarking tool for NOMT.
18 | ├── core: Core logic, primarily for verifying and updating the NOMT.
19 | ├── docs: Documentation.
20 | ├── fuzz: Fuzzing suite.
21 | ├── examples: Various examples of using NOMT.
22 | │   ├── commit_batch: Demonstration of a simple commit.
23 | │   ├── read_value: Reading a value from the NOMT.
24 | │   └── witness_verification: Demonstration of how to verify a witness in a light-client setting.
25 | ├── nomt: Implementation of the NOMT database.
26 | ├── torture: Extensive testing suite for NOMT.
27 | └── trickfs: A FUSE filesystem aiding deeper testing. Experimental.
28 |     └── trickmnt: A tool that allows mounting trickfs.
29 | 
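For a quick feel of the API, the sketch below shows roughly what `examples/commit_batch` demonstrates: opening a database, batching reads and writes inside a `Session`, and committing the batch to advance the merkle root. Names and signatures here are approximations rather than a verbatim copy of the current API; consult the `examples` crates for authoritative, up-to-date usage.

```rust
// Illustrative sketch only: names and signatures approximate the Session-based
// flow and are not guaranteed to match the current API. See the
// `examples/commit_batch` and `examples/read_value` crates for real usage.
use nomt::{hasher::Blake3Hasher, KeyReadWrite, Nomt, Options, SessionParams};

fn main() -> anyhow::Result<()> {
    // Open (or create) the database at the given path.
    let mut opts = Options::new();
    opts.path("example_nomt_db");
    let nomt = Nomt::<Blake3Hasher>::open(opts)?;

    // Only one `Session` may be live at a time; it batches all reads and writes.
    let session = nomt.begin_session(SessionParams::default());

    // Read the prior value of a key (key paths are 32 bytes).
    let key = [1u8; 32];
    let prior = session.read(key)?;

    // Declare every access performed in this batch (sorted by key path),
    // then commit to compute and persist the new merkle root.
    let actuals = vec![(key, KeyReadWrite::ReadThenWrite(prior, Some(vec![0x42])))];
    session.finish(actuals)?.commit(&nomt)?;
    Ok(())
}
```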
30 | 31 | ## Architecture 32 | 33 | Internally, NOMT consists of two parallel stores, Beatree and Bitbox. Beatree stores raw key-value pairs and is based around a B-Tree variant optimized for stable, fast random access patterns and high-entropy keys. Bitbox stores a custom sparse binary merkle tree in an on-disk hashtable in a format amenable to SSDs. 34 | 35 | For more information on NOMT, the thesis behind it, and performance targets, see [this November 2024 presentation](https://x.com/TheKusamarian/status/1855477208762261910) by @rphmeier or [view the slides here](https://hackmd.io/@Xo-wxO7bQkKidH1LrqACsw/rkG0lmjWyg#/). 36 | 37 | We have built a benchmarking tool, `benchtop`, which is located in the `benchtop` directory as a separate subcrate. 38 | 39 | ## Contributing 40 | 41 | See [CONTRIBUTING.md](docs/CONTRIBUTING.md). 42 | 43 | If you would like to discuss the development of NOMT or follow along with contributor discussions, join the official [Telegram Channel](https://t.me/thrum_nomt). 44 | 45 | ## Acknowledgements 46 | 47 | The development of this project is supported financially by [Sovereign Labs](https://www.sovereign.xyz/), creators of the [Sovereign SDK](https://github.com/Sovereign-Labs/sovereign-sdk/). The idea for this project originated in [this post by Preston Evans](https://sovereign.mirror.xyz/jfx_cJ_15saejG9ZuQWjnGnG-NfahbazQH98i1J3NN8). 48 | -------------------------------------------------------------------------------- /benchtop/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "benchtop" 3 | version = "0.1.0" 4 | authors = ["thrum"] 5 | homepage = "https://thrum.dev" 6 | repository = "https://github.com/thrumdev/nomt" 7 | edition = "2021" 8 | license = "MIT/Apache-2.0" 9 | 10 | [dependencies] 11 | 12 | # benchmarking 13 | clap = { version = "4.4.8", features = ["derive"] } 14 | anyhow = { version = "1.0.75" } 15 | hdrhistogram = "7.5.4" 16 | fxhash = "0.2.1" 17 | rand = "0.8.5" 18 | rand_distr = "0.4.3" 19 | sha2 = { version = "0.10.6" } 20 | ruint = { version = "1.12.1" } 21 | toml = "0.8.12" 22 | serde = "1.0.199" 23 | humantime = "2.1.0" 24 | rayon = "1.10" 25 | lru = "0.12.5" 26 | libc = "0.2.155" 27 | 28 | # sov-db 29 | sov-db = { git = "https://github.com/Sovereign-Labs/sovereign-sdk", optional = true } 30 | sov-schema-db = { git = "https://github.com/Sovereign-Labs/sovereign-sdk", optional = true } 31 | sov-prover-storage-manager = { git = "https://github.com/Sovereign-Labs/sovereign-sdk", optional = true } 32 | jmt = { git = "https://github.com/penumbra-zone/jmt.git", rev = "1d007e11cb68aa5ca13e9a5af4a12e6439d5f7b6", optional = true } 33 | 34 | # sp-trie 35 | sp-trie = { version = "32.0.0", optional = true } 36 | sp-state-machine = { version = "0.35.0", optional = true } 37 | trie-db = { version = "0.28.0", optional = true } 38 | hash-db = { version = "0.16.0", optional = true } 39 | sp-core = { version = "31.0.0", optional = true } 40 | kvdb = { version = "0.13.0", optional = true } 41 | kvdb-rocksdb = { version = "0.19.0", optional = true } 42 | array-bytes = { version = "6.1", optional = true } 43 | 44 | # nomt 45 | nomt = { path = "../nomt" } 46 | 47 | [profile.release] 48 | debug = true 49 | 50 | [features] 51 | sov-db=["dep:sov-db", "sov-schema-db", "sov-prover-storage-manager", "jmt" ] 52 | sp-trie=["dep:sp-trie", "sp-state-machine", "trie-db", "hash-db", "sp-core", "kvdb", "kvdb-rocksdb", "array-bytes" ] 53 | 
-------------------------------------------------------------------------------- /benchtop/src/backend.rs: -------------------------------------------------------------------------------- 1 | use crate::{nomt::NomtDB, timer::Timer, workload::Workload}; 2 | 3 | #[cfg(feature = "sov-db")] 4 | use crate::sov_db::SovDB; 5 | 6 | #[cfg(feature = "sp-trie")] 7 | use crate::sp_trie::SpTrieDB; 8 | 9 | #[derive(Debug, Clone, clap::ValueEnum)] 10 | pub enum Backend { 11 | SovDB, 12 | Nomt, 13 | SpTrie, 14 | } 15 | 16 | impl Backend { 17 | pub fn all_backends() -> Vec { 18 | vec![Backend::SovDB, Backend::SpTrie, Backend::Nomt] 19 | } 20 | 21 | // If reset is true, then erase any previous backend's database 22 | // and restart from an empty database. 23 | // Otherwise, use the already present database. 24 | pub fn instantiate( 25 | &self, 26 | reset: bool, 27 | commit_concurrency: usize, 28 | io_workers: usize, 29 | hashtable_buckets: Option, 30 | page_cache_size: Option, 31 | leaf_cache_size: Option, 32 | page_cache_upper_levels: usize, 33 | prepopulate_page_cache: bool, 34 | overlay_window_length: usize, 35 | ) -> DB { 36 | match self { 37 | Backend::SovDB => { 38 | #[cfg(not(feature = "sov-db"))] 39 | panic!("benchtop not compiled with feature sov-db. rebuild"); 40 | #[cfg(feature = "sov-db")] 41 | DB::Sov(SovDB::open(reset)) 42 | } 43 | Backend::Nomt => DB::Nomt(NomtDB::open( 44 | reset, 45 | commit_concurrency, 46 | io_workers, 47 | hashtable_buckets, 48 | page_cache_size, 49 | leaf_cache_size, 50 | page_cache_upper_levels, 51 | prepopulate_page_cache, 52 | overlay_window_length, 53 | )), 54 | Backend::SpTrie => { 55 | #[cfg(not(feature = "sp-trie"))] 56 | panic!("benchtop not compiled with feature sp-trie. rebuild"); 57 | #[cfg(feature = "sp-trie")] 58 | DB::SpTrie(SpTrieDB::open(reset)) 59 | } 60 | } 61 | } 62 | } 63 | 64 | /// A transaction over the database which allows reading and writing. 65 | pub trait Transaction { 66 | /// Read a value from the database. If a value was previously written, return that. 67 | fn read(&mut self, key: &[u8]) -> Option>; 68 | 69 | /// Note that a value was read from a cache, for inclusion in a storage proof. 70 | fn note_read(&mut self, key: &[u8], value: Option>); 71 | 72 | /// Write a value to the database. `None` means to delete the previous value. 73 | fn write(&mut self, key: &[u8], value: Option<&[u8]>); 74 | } 75 | 76 | /// A wrapper around all databases implemented in this tool. 77 | pub enum DB { 78 | #[cfg(feature = "sov-db")] 79 | Sov(SovDB), 80 | #[cfg(feature = "sp-trie")] 81 | SpTrie(SpTrieDB), 82 | Nomt(NomtDB), 83 | } 84 | 85 | impl DB { 86 | /// Execute a workload repeatedly until done or a time limit is reached. 87 | pub fn execute( 88 | &mut self, 89 | mut timer: Option<&mut Timer>, 90 | workload: &mut dyn Workload, 91 | timeout: Option, 92 | ) { 93 | while !workload.is_done() { 94 | if timeout 95 | .as_ref() 96 | .map_or(false, |t| std::time::Instant::now() > *t) 97 | { 98 | break; 99 | } 100 | let timer = timer.as_deref_mut(); 101 | match self { 102 | #[cfg(feature = "sov-db")] 103 | DB::Sov(db) => db.execute(timer, workload), 104 | #[cfg(feature = "sp-trie")] 105 | DB::SpTrie(db) => db.execute(timer, workload), 106 | DB::Nomt(db) => db.execute(timer, workload), 107 | } 108 | } 109 | } 110 | 111 | /// Execute several workloads in parallel, repeatedly, until all done or a time limit is reached. 112 | /// 113 | /// Only works with the NOMT backend. 
114 | pub fn parallel_execute( 115 | &mut self, 116 | mut timer: Option<&mut Timer>, 117 | thread_pool: &rayon::ThreadPool, 118 | workloads: &mut [Box], 119 | timeout: Option, 120 | ) -> anyhow::Result<()> { 121 | while workloads.iter().any(|w| !w.is_done()) { 122 | if timeout 123 | .as_ref() 124 | .map_or(false, |t| std::time::Instant::now() > *t) 125 | { 126 | break; 127 | } 128 | let timer = timer.as_deref_mut(); 129 | match self { 130 | #[cfg(feature = "sov-db")] 131 | DB::Sov(_) => { 132 | anyhow::bail!("parallel execution is only supported with the NOMT backend.") 133 | } 134 | #[cfg(feature = "sp-trie")] 135 | DB::SpTrie(_) => { 136 | anyhow::bail!("parallel execution is only supported with the NOMT backend.") 137 | } 138 | DB::Nomt(db) => db.parallel_execute(timer, thread_pool, workloads), 139 | } 140 | } 141 | 142 | Ok(()) 143 | } 144 | 145 | /// Print metrics collected by the Backend if it supports metrics collection 146 | pub fn print_metrics(&self) { 147 | match self { 148 | DB::Nomt(db) => db.print_metrics(), 149 | #[cfg(any(feature = "sp-trie", feature = "sov-db"))] 150 | _ => (), 151 | } 152 | } 153 | } 154 | -------------------------------------------------------------------------------- /benchtop/src/bench.rs: -------------------------------------------------------------------------------- 1 | use crate::{ 2 | backend::Backend, 3 | cli::bench::BenchType, 4 | timer::Timer, 5 | workload, 6 | workload::{Init, Workload}, 7 | }; 8 | use anyhow::Result; 9 | 10 | pub fn bench(bench_type: BenchType) -> Result<()> { 11 | let common_params = match bench_type { 12 | BenchType::Isolate(ref params) => ¶ms.common_params, 13 | BenchType::Sequential(ref params) => ¶ms.common_params, 14 | }; 15 | 16 | let (init, workload) = workload::parse( 17 | common_params.workload.name.as_str(), 18 | common_params.workload.size, 19 | common_params 20 | .workload 21 | .initial_capacity 22 | .map(|s| 1u64 << s) 23 | .unwrap_or(0), 24 | common_params.workload.percentage_cold, 25 | )?; 26 | let commit_concurrency = common_params.workload.commit_concurrency; 27 | let io_workers = common_params.workload.io_workers; 28 | 29 | let backends = if common_params.backends.is_empty() { 30 | Backend::all_backends() 31 | } else { 32 | common_params.backends.clone() 33 | }; 34 | 35 | match bench_type { 36 | BenchType::Isolate(params) => bench_isolate( 37 | init, 38 | workload, 39 | backends, 40 | params.iterations, 41 | true, 42 | commit_concurrency, 43 | io_workers, 44 | ) 45 | .map(|_| ()), 46 | BenchType::Sequential(params) => bench_sequential( 47 | init, 48 | workload, 49 | backends, 50 | params.op_limit, 51 | params.time_limit, 52 | true, 53 | commit_concurrency, 54 | io_workers, 55 | ) 56 | .map(|_| ()), 57 | } 58 | } 59 | 60 | // Benchmark the workload across multiple backends multiple times. 61 | // Each iteration will be executed on a freshly initialized database. 
62 | // 63 | // Return the mean execution time of the workloads for each backends 64 | // in the order the backends are provided 65 | pub fn bench_isolate( 66 | mut init: Init, 67 | mut workload: Box, 68 | backends: Vec, 69 | iterations: u64, 70 | print: bool, 71 | commit_concurrency: usize, 72 | io_workers: usize, 73 | ) -> Result> { 74 | let mut mean_results = vec![]; 75 | for backend in backends { 76 | let mut timer = Timer::new(format!("{}", backend)); 77 | 78 | for _ in 0..iterations { 79 | let mut db = backend.instantiate(true, commit_concurrency, io_workers); 80 | db.execute(None, &mut init); 81 | db.execute(Some(&mut timer), &mut *workload); 82 | db.print_metrics(); 83 | } 84 | 85 | if print { 86 | timer.print(); 87 | } 88 | mean_results.push(timer.get_mean_workload_duration()?); 89 | } 90 | 91 | Ok(mean_results) 92 | } 93 | 94 | // Benchmark the workload across multiple backends multiple times. 95 | // Each iteration will be executed on the same db repeatedly 96 | // without clearing it until a time or operation count limit is reaced. 97 | // 98 | // Return the mean execution time of the workloads for each backends 99 | // in the order the backends are provided 100 | pub fn bench_sequential( 101 | mut init: Init, 102 | mut workload: Box, 103 | backends: Vec, 104 | op_limit: Option, 105 | time_limit: Option, 106 | print: bool, 107 | commit_concurrency: usize, 108 | io_workers: usize, 109 | ) -> Result> { 110 | if let (None, None) = (op_limit, time_limit) { 111 | anyhow::bail!("You need to specify at least one limiter between operations and time") 112 | } 113 | 114 | let mut mean_results = vec![]; 115 | 116 | for backend in backends { 117 | let mut timer = Timer::new(format!("{}", backend)); 118 | let mut db = backend.instantiate(true, commit_concurrency, io_workers); 119 | 120 | let mut elapsed_time = 0; 121 | let mut op_count = 0; 122 | 123 | db.execute(None, &mut init); 124 | 125 | loop { 126 | db.execute(Some(&mut timer), &mut *workload); 127 | 128 | // check if time limit exceeded 129 | elapsed_time += timer.get_last_workload_duration()?; 130 | match time_limit { 131 | Some(limit) if elapsed_time >= (limit * 1000000) => break, 132 | _ => (), 133 | }; 134 | 135 | // check if op limit exceeded 136 | op_count += workload.size() as u64; 137 | match op_limit { 138 | Some(limit) if op_count >= limit => break, 139 | _ => (), 140 | }; 141 | } 142 | 143 | db.print_metrics(); 144 | 145 | if print { 146 | timer.print(); 147 | } 148 | mean_results.push(timer.get_mean_workload_duration()?); 149 | } 150 | Ok(mean_results) 151 | } 152 | -------------------------------------------------------------------------------- /benchtop/src/custom_workload.rs: -------------------------------------------------------------------------------- 1 | use crate::{ 2 | backend::Transaction, 3 | cli::StateItemDistribution, 4 | workload::{Distribution, Workload}, 5 | }; 6 | use rand::Rng; 7 | 8 | #[derive(Clone)] 9 | pub struct RwInit { 10 | cur_val: u64, 11 | num_vals: u64, 12 | } 13 | 14 | impl Workload for RwInit { 15 | fn run_step(&mut self, transaction: &mut dyn Transaction) { 16 | const MAX_INIT_PER_ITERATION: u64 = 64 * 1024 * 1024; 17 | 18 | if self.num_vals == 0 { 19 | return; 20 | } 21 | 22 | let count = std::cmp::min(self.num_vals - self.cur_val, MAX_INIT_PER_ITERATION); 23 | for _ in 0..count { 24 | transaction.write(&encode_id(self.cur_val), Some(&[64u8; 32])); 25 | self.cur_val += 1; 26 | } 27 | println!( 28 | "populating {:.1}%", 29 | 100.0 * (self.cur_val as f64) / (self.num_vals as f64) 30 | ); 31 
| } 32 | 33 | fn is_done(&self) -> bool { 34 | self.num_vals == self.cur_val 35 | } 36 | } 37 | 38 | /// Greate a workload for initializing a database with the given amount of key-value pairs. 39 | pub fn init(db_size: u64) -> RwInit { 40 | RwInit { 41 | cur_val: 0, 42 | num_vals: db_size, 43 | } 44 | } 45 | 46 | fn encode_id(id: u64) -> [u8; 8] { 47 | id.to_be_bytes() 48 | } 49 | 50 | /// Build N `RwWorkload`s, one for each thread. 51 | pub fn build( 52 | reads: u8, 53 | writes: u8, 54 | workload_size: u64, 55 | fresh: u8, 56 | db_size: u64, 57 | op_limit: u64, 58 | threads: usize, 59 | distribution: StateItemDistribution, 60 | ) -> Vec { 61 | let thread_workload_size = workload_size / threads as u64; 62 | let db_step = db_size / threads as u64; 63 | 64 | (0..threads) 65 | .map(|i| { 66 | let db_start = db_step * i as u64; 67 | 68 | RwWorkload { 69 | reads, 70 | writes, 71 | fresh, 72 | workload_size: if i == threads - 1 { 73 | thread_workload_size + workload_size % threads as u64 74 | } else { 75 | thread_workload_size 76 | }, 77 | ops_remaining: op_limit / threads as u64, 78 | distribution: Distribution::new(distribution, db_start, db_start + db_step), 79 | } 80 | }) 81 | .collect() 82 | } 83 | 84 | // The read-write workload will follow these rules: 85 | // 1. Reads and writes are randomly and uniformly distributed across the key space. 86 | // 2. The DB size indicates the number of entries in the database. 87 | // 3. The workload size represents the total number of operations, where reads and writes 88 | // are numbers that need to sum to 100 and represent a percentage of the total size. 89 | // 4. Fresh indicates the percentage of reads and writes that will be performed on 90 | // non-existing keys 91 | pub struct RwWorkload { 92 | pub reads: u8, 93 | pub writes: u8, 94 | pub workload_size: u64, 95 | pub fresh: u8, 96 | pub ops_remaining: u64, 97 | pub distribution: Distribution, 98 | } 99 | 100 | impl Workload for RwWorkload { 101 | fn run_step(&mut self, transaction: &mut dyn Transaction) { 102 | let from_percentage = |p: u8| (self.workload_size as f64 * p as f64 / 100.0) as u64; 103 | let fresh = |size: u64| (size as f64 * self.fresh as f64 / 100.0) as u64; 104 | 105 | // total reads and writes 106 | let n_reads = from_percentage(self.reads); 107 | let n_writes = from_percentage(self.writes); 108 | // fresh reads and writes 109 | let n_reads_fresh = fresh(n_reads); 110 | let n_writes_fresh = fresh(n_writes); 111 | 112 | let mut rng = rand::thread_rng(); 113 | 114 | for i in 0..n_reads { 115 | let _ = if i < n_reads_fresh { 116 | // fresh read, technically there is a chance to generate 117 | // a random key that is already present in the database, 118 | // but it is very unlikely 119 | transaction.read(&rand_key(&mut rng)) 120 | } else { 121 | // read already existing key 122 | let key = self.distribution.sample(&mut rng); 123 | transaction.read(&encode_id(key)) 124 | }; 125 | } 126 | 127 | for i in 0..n_writes { 128 | let value = rand_key(&mut rng); 129 | if i < n_writes_fresh { 130 | // fresh write 131 | transaction.write(&rand_key(&mut rng), Some(&value)); 132 | } else { 133 | // substitute key 134 | let key = self.distribution.sample(&mut rng); 135 | transaction.write(&encode_id(key), Some(&value)); 136 | }; 137 | } 138 | 139 | self.ops_remaining = self.ops_remaining.saturating_sub(self.workload_size); 140 | } 141 | 142 | fn is_done(&self) -> bool { 143 | self.ops_remaining == 0 144 | } 145 | } 146 | 147 | fn rand_key(rng: &mut impl Rng) -> [u8; 32] { 148 | // keys must be 
uniformly distributed 149 | let mut key = [0; 32]; 150 | rng.fill(&mut key[..16]); 151 | key 152 | } 153 | -------------------------------------------------------------------------------- /benchtop/src/main.rs: -------------------------------------------------------------------------------- 1 | mod backend; 2 | mod cli; 3 | mod custom_workload; 4 | mod nomt; 5 | 6 | #[cfg(feature = "sov-db")] 7 | mod sov_db; 8 | #[cfg(feature = "sp-trie")] 9 | mod sp_trie; 10 | 11 | mod timer; 12 | mod transfer_workload; 13 | mod workload; 14 | 15 | use anyhow::Result; 16 | use clap::Parser; 17 | use cli::{Cli, Commands, InitParams, RunParams}; 18 | use timer::Timer; 19 | 20 | pub fn main() -> Result<()> { 21 | let cli = Cli::parse(); 22 | 23 | match cli.command { 24 | Commands::Init(params) => init(params), 25 | Commands::Run(params) => run(params), 26 | } 27 | } 28 | 29 | pub fn init(params: InitParams) -> Result<()> { 30 | let workload_params = params.workload; 31 | let (mut init, _) = workload::parse(&workload_params, u64::max_value())?; 32 | 33 | let mut db = params.backend.instantiate( 34 | true, 35 | workload_params.commit_concurrency, 36 | workload_params.io_workers, 37 | workload_params.hashtable_buckets, 38 | workload_params.page_cache_size, 39 | workload_params.leaf_cache_size, 40 | workload_params.page_cache_upper_levels, 41 | workload_params.prepopulate_page_cache, 42 | 0, 43 | ); 44 | db.execute(None, &mut *init, None); 45 | 46 | Ok(()) 47 | } 48 | 49 | pub fn run(params: RunParams) -> Result<()> { 50 | let workload_params = params.workload; 51 | let (mut init, mut workloads) = workload::parse( 52 | &workload_params, 53 | params.limits.ops.unwrap_or(u64::max_value()), 54 | )?; 55 | 56 | let mut db = params.backend.instantiate( 57 | params.reset, 58 | workload_params.commit_concurrency, 59 | workload_params.io_workers, 60 | workload_params.hashtable_buckets, 61 | workload_params.page_cache_size, 62 | workload_params.leaf_cache_size, 63 | workload_params.page_cache_upper_levels, 64 | workload_params.prepopulate_page_cache, 65 | workload_params.overlay_window_length, 66 | ); 67 | 68 | if params.reset { 69 | db.execute(None, &mut *init, None); 70 | } 71 | 72 | let mut timer = Timer::new(format!("{}", params.backend)); 73 | let warmup_timeout = params 74 | .warm_up 75 | .map(|time_limit| std::time::Instant::now() + time_limit.into()); 76 | 77 | let thread_pool = rayon::ThreadPoolBuilder::new() 78 | .thread_name(|_| "benchtop-workload".into()) 79 | .num_threads(workload_params.workload_concurrency as usize) 80 | .build()?; 81 | 82 | if let Some(t) = warmup_timeout { 83 | if workload_params.workload_concurrency == 1 { 84 | db.execute(Some(&mut timer), &mut *workloads[0], Some(t)); 85 | } else { 86 | db.parallel_execute(Some(&mut timer), &thread_pool, &mut workloads, Some(t))?; 87 | }; 88 | 89 | timer = Timer::new(format!("{}", params.backend)); 90 | } 91 | 92 | let timeout = params 93 | .limits 94 | .time 95 | .map(|time_limit| std::time::Instant::now() + time_limit.into()); 96 | 97 | if workload_params.workload_concurrency == 1 { 98 | db.execute(Some(&mut timer), &mut *workloads[0], timeout); 99 | } else { 100 | db.parallel_execute(Some(&mut timer), &thread_pool, &mut workloads, timeout)?; 101 | }; 102 | 103 | db.print_metrics(); 104 | timer.print(workload_params.size); 105 | print_max_rss(); 106 | 107 | Ok(()) 108 | } 109 | 110 | fn print_max_rss() { 111 | let max_rss = get_max_rss().unwrap_or(0); 112 | println!("max rss: {} MiB", max_rss / 1024); 113 | fn get_max_rss() -> Option { 114 | let mut 
usage: libc::rusage = unsafe { std::mem::zeroed() }; 115 | let ret = unsafe { libc::getrusage(libc::RUSAGE_SELF, &mut usage) }; 116 | if ret == 0 { 117 | Some(usage.ru_maxrss as usize) 118 | } else { 119 | None 120 | } 121 | } 122 | } 123 | -------------------------------------------------------------------------------- /benchtop/src/sp_trie.rs: -------------------------------------------------------------------------------- 1 | use crate::{backend::Transaction, timer::Timer, workload::Workload}; 2 | use hash_db::{AsHashDB, HashDB, Prefix}; 3 | use kvdb::KeyValueDB; 4 | use kvdb_rocksdb::{Database, DatabaseConfig}; 5 | use sha2::Digest; 6 | use sp_trie::trie_types::TrieDBMutBuilderV1; 7 | use sp_trie::{DBValue, LayoutV1, PrefixedMemoryDB, TrieDBMut}; 8 | use std::sync::Arc; 9 | use trie_db::TrieMut; 10 | 11 | type Hasher = sp_core::Blake2Hasher; 12 | type Hash = sp_core::H256; 13 | 14 | const SP_TRIE_DB_FOLDER: &str = "sp_trie_db"; 15 | 16 | const NUM_COLUMNS: u32 = 2; 17 | const COL_TRIE: u32 = 0; 18 | const COL_ROOT: u32 = 1; 19 | 20 | const ROOT_KEY: &[u8] = b"root"; 21 | 22 | pub struct SpTrieDB { 23 | pub kvdb: Arc, 24 | pub root: Hash, 25 | } 26 | 27 | pub struct Trie<'a> { 28 | pub db: Arc, 29 | pub overlay: &'a mut PrefixedMemoryDB, 30 | } 31 | 32 | impl SpTrieDB { 33 | pub fn open(reset: bool) -> Self { 34 | if reset { 35 | // Delete previously existing db 36 | let _ = std::fs::remove_dir_all(SP_TRIE_DB_FOLDER); 37 | } 38 | 39 | let db_cfg = DatabaseConfig::with_columns(NUM_COLUMNS); 40 | let kvdb = 41 | Arc::new(Database::open(&db_cfg, SP_TRIE_DB_FOLDER).expect("Database backend error")); 42 | 43 | let root = match kvdb.get(COL_ROOT, ROOT_KEY).unwrap() { 44 | None => Hash::default(), 45 | Some(r) => Hash::from_slice(&r[..32]), 46 | }; 47 | 48 | Self { kvdb, root } 49 | } 50 | 51 | pub fn execute(&mut self, mut timer: Option<&mut Timer>, workload: &mut dyn Workload) { 52 | let _timer_guard_total = timer.as_mut().map(|t| t.record_span("workload")); 53 | 54 | let mut new_root = self.root; 55 | let mut overlay = PrefixedMemoryDB::default(); 56 | 57 | let mut trie = Trie { 58 | db: self.kvdb.clone(), 59 | overlay: &mut overlay, 60 | }; 61 | 62 | let recorder: sp_trie::recorder::Recorder = Default::default(); 63 | let _timer_guard_commit = { 64 | let mut trie_recorder = recorder.as_trie_recorder(new_root); 65 | 66 | let trie_db_mut = if self.root == Hash::default() { 67 | TrieDBMutBuilderV1::new(&mut trie, &mut new_root) 68 | .with_recorder(&mut trie_recorder) 69 | .build() 70 | } else { 71 | TrieDBMutBuilderV1::from_existing(&mut trie, &mut new_root) 72 | .with_recorder(&mut trie_recorder) 73 | .build() 74 | }; 75 | 76 | let mut transaction = Tx { 77 | trie: trie_db_mut, 78 | timer, 79 | }; 80 | workload.run_step(&mut transaction); 81 | let Tx { 82 | trie: mut trie_db_mut, 83 | mut timer, 84 | } = transaction; 85 | 86 | let timer_guard_commit = timer.as_mut().map(|t| t.record_span("commit_and_prove")); 87 | 88 | trie_db_mut.commit(); 89 | timer_guard_commit 90 | }; 91 | 92 | let _proof = recorder.drain_storage_proof().is_empty(); 93 | 94 | let mut transaction = self.kvdb.transaction(); 95 | for (key, (value, ref_count)) in overlay.drain() { 96 | if ref_count > 0 { 97 | transaction.put(COL_TRIE, &key[..], &value[..]) 98 | } else if ref_count < 0 { 99 | transaction.delete(COL_TRIE, &key[..]) 100 | } 101 | } 102 | transaction.put(COL_ROOT, ROOT_KEY, new_root.as_bytes()); 103 | self.kvdb 104 | .write(transaction) 105 | .expect("Failed to write transaction"); 106 | 107 | self.root = 
new_root; 108 | } 109 | } 110 | 111 | struct Tx<'a> { 112 | trie: TrieDBMut<'a, LayoutV1>, 113 | timer: Option<&'a mut Timer>, 114 | } 115 | 116 | // sp_trie does not require hashed keys, 117 | // but if keys are not hashed, the comparison does not seem to be efficient. 118 | // Not applying hashing to keys would significantly speed up sp_trie. 119 | impl<'a> Transaction for Tx<'a> { 120 | fn read(&mut self, key: &[u8]) -> Option> { 121 | let key_path = sha2::Sha256::digest(key); 122 | 123 | let _timer_guard_read = self.timer.as_mut().map(|t| t.record_span("read")); 124 | self.trie 125 | .get(&key_path) 126 | .expect("Impossible fetching from sp-trie db") 127 | } 128 | 129 | fn note_read(&mut self, key: &[u8], _value: Option>) { 130 | let _ = self.read(key); 131 | } 132 | 133 | fn write(&mut self, key: &[u8], value: Option<&[u8]>) { 134 | let key_path = sha2::Sha256::digest(key); 135 | 136 | self.trie 137 | .insert(&key_path, &value.unwrap_or(&[])) 138 | .expect("Impossible writing into sp-trie db"); 139 | } 140 | } 141 | 142 | impl<'a> AsHashDB for Trie<'a> { 143 | fn as_hash_db(&self) -> &dyn hash_db::HashDB { 144 | self 145 | } 146 | 147 | fn as_hash_db_mut<'b>(&'b mut self) -> &'b mut (dyn HashDB + 'b) { 148 | &mut *self 149 | } 150 | } 151 | 152 | impl<'a> HashDB for Trie<'a> { 153 | fn get(&self, key: &Hash, prefix: Prefix) -> Option { 154 | if let Some(value) = self.overlay.get(key, prefix) { 155 | return Some(value); 156 | } 157 | 158 | let key = sp_trie::prefixed_key::(key, prefix); 159 | self.db.get(0, &key).expect("Database backend error") 160 | } 161 | 162 | fn contains(&self, hash: &Hash, prefix: Prefix) -> bool { 163 | self.get(hash, prefix).is_some() 164 | } 165 | 166 | fn insert(&mut self, prefix: Prefix, value: &[u8]) -> Hash { 167 | self.overlay.insert(prefix, value) 168 | } 169 | 170 | fn emplace(&mut self, key: Hash, prefix: Prefix, value: DBValue) { 171 | self.overlay.emplace(key, prefix, value); 172 | } 173 | 174 | fn remove(&mut self, key: &Hash, prefix: Prefix) { 175 | self.overlay.remove(key, prefix) 176 | } 177 | } 178 | -------------------------------------------------------------------------------- /benchtop/src/timer.rs: -------------------------------------------------------------------------------- 1 | use std::{ 2 | cell::RefCell, 3 | collections::hash_map::{Entry, HashMap}, 4 | rc::Rc, 5 | }; 6 | 7 | // At least three spans are expected to be measured 8 | // + `workload` 9 | // + `read` 10 | // + `commit_and_prove` 11 | pub struct Timer { 12 | name: String, 13 | spans: HashMap<&'static str, Rc>>>, 14 | } 15 | 16 | impl Timer { 17 | pub fn new(name: String) -> Self { 18 | Self { 19 | name, 20 | spans: HashMap::new(), 21 | } 22 | } 23 | 24 | pub fn record_span(&mut self, span_name: &'static str) -> impl Drop { 25 | struct RecordSpan { 26 | h: Rc>>, 27 | start: std::time::Instant, 28 | } 29 | impl Drop for RecordSpan { 30 | fn drop(&mut self) { 31 | let elapsed = self.start.elapsed().as_nanos() as u64; 32 | self.h.borrow_mut().record(elapsed).unwrap(); 33 | } 34 | } 35 | 36 | let h = self.spans.entry(span_name).or_insert_with(|| { 37 | Rc::new(RefCell::new( 38 | hdrhistogram::Histogram::::new(3).unwrap(), 39 | )) 40 | }); 41 | 42 | RecordSpan { 43 | h: h.clone(), 44 | start: std::time::Instant::now(), 45 | } 46 | } 47 | 48 | pub fn freeze(self) -> FrozenTimer { 49 | FrozenTimer { 50 | spans: self 51 | .spans 52 | .into_iter() 53 | .map(|(name, histogram)| (name, Rc::into_inner(histogram).unwrap().into_inner())) 54 | .collect(), 55 | } 56 | } 57 | 58 | pub fn 
add(&mut self, other: FrozenTimer) { 59 | for (span_name, new_data) in other.spans { 60 | match self.spans.entry(span_name) { 61 | Entry::Occupied(e) => e.get().borrow_mut().add(new_data).unwrap(), 62 | Entry::Vacant(e) => { 63 | let _ = e.insert(Rc::new(RefCell::new(new_data))); 64 | } 65 | } 66 | } 67 | } 68 | 69 | pub fn get_last_workload_duration(&self) -> anyhow::Result { 70 | let h = self 71 | .spans 72 | .get("workload") 73 | .ok_or(anyhow::anyhow!("`workload` span not recorded"))?; 74 | 75 | Ok(h.borrow() 76 | .iter_recorded() 77 | .last() 78 | .ok_or(anyhow::anyhow!("No recorded value for `workload` span"))? 79 | .value_iterated_to()) 80 | } 81 | 82 | pub fn get_mean_workload_duration(&self) -> anyhow::Result { 83 | Ok(self 84 | .spans 85 | .get("workload") 86 | .ok_or(anyhow::anyhow!("`workload` span not recorded"))? 87 | .borrow() 88 | .mean() as u64) 89 | } 90 | 91 | pub fn print(&mut self, workload_size: u64) { 92 | println!("{}", self.name); 93 | 94 | let expected_spans = ["workload", "read", "commit_and_prove"]; 95 | 96 | // print expectd spans in order 97 | for span_name in expected_spans { 98 | let h = self.spans.get(span_name); 99 | match h { 100 | Some(h) => println!( 101 | " mean {}: {}", 102 | span_name, 103 | pretty_display_ns(h.borrow().mean() as u64) 104 | ), 105 | None => println!("{} not measured", span_name), 106 | }; 107 | } 108 | 109 | if let Ok(workload_mean_ns) = self.get_mean_workload_duration() { 110 | let ops_per_second = workload_size as f64 / (workload_mean_ns as f64 / 1_000_000_000.0); 111 | println!(" mean throughput: {ops_per_second:.1} ops/s"); 112 | } 113 | 114 | // print all other measured spans 115 | for (span_name, h) in &self.spans { 116 | if expected_spans.contains(span_name) { 117 | continue; 118 | } 119 | 120 | println!( 121 | " mean {}: {}", 122 | span_name, 123 | pretty_display_ns(h.borrow().mean() as u64) 124 | ) 125 | } 126 | } 127 | } 128 | 129 | pub struct FrozenTimer { 130 | spans: HashMap<&'static str, hdrhistogram::Histogram>, 131 | } 132 | 133 | pub fn pretty_display_ns(ns: u64) -> String { 134 | // preserve 3 sig figs at minimum. 
135 | let (val, unit) = if ns > 100 * 1_000_000_000 { 136 | (ns / 1_000_000_000, "s") 137 | } else if ns > 100 * 1_000_000 { 138 | (ns / 1_000_000, "ms") 139 | } else if ns > 100 * 1_000 { 140 | (ns / 1_000, "us") 141 | } else { 142 | (ns, "ns") 143 | }; 144 | 145 | format!("{val} {unit}") 146 | } 147 | -------------------------------------------------------------------------------- /benchtop/src/transfer_workload.rs: -------------------------------------------------------------------------------- 1 | use crate::{ 2 | backend::Transaction, 3 | cli::StateItemDistribution, 4 | workload::{Distribution, Workload}, 5 | }; 6 | use rand::Rng; 7 | 8 | #[derive(Clone)] 9 | pub struct TransferInit { 10 | cur_account: u64, 11 | num_accounts: u64, 12 | } 13 | 14 | impl Workload for TransferInit { 15 | fn run_step(&mut self, transaction: &mut dyn Transaction) { 16 | const MAX_INIT_PER_ITERATION: u64 = 64 * 1024; 17 | 18 | if self.num_accounts == 0 { 19 | return; 20 | } 21 | 22 | let count = std::cmp::min(self.num_accounts - self.cur_account, MAX_INIT_PER_ITERATION); 23 | for _ in 0..count { 24 | transaction.write(&encode_id(self.cur_account), Some(&encode_balance(1000))); 25 | self.cur_account += 1; 26 | } 27 | println!( 28 | "populating {:.1}%", 29 | 100.0 * (self.cur_account as f64) / (self.num_accounts as f64) 30 | ); 31 | } 32 | 33 | fn is_done(&self) -> bool { 34 | self.cur_account == self.num_accounts 35 | } 36 | } 37 | 38 | /// Create an initialization command for a transfer database. 39 | pub fn init(num_accounts: u64) -> TransferInit { 40 | TransferInit { 41 | cur_account: 0, 42 | num_accounts, 43 | } 44 | } 45 | 46 | fn encode_id(id: u64) -> [u8; 8] { 47 | id.to_be_bytes() 48 | } 49 | 50 | fn encode_balance(balance: u64) -> [u8; 8] { 51 | balance.to_be_bytes() 52 | } 53 | 54 | fn decode_balance(encoded: &[u8]) -> u64 { 55 | let mut buf = [0; 8]; 56 | buf.copy_from_slice(encoded); 57 | u64::from_be_bytes(buf) 58 | } 59 | 60 | /// Build a new workload meant to emulate transfers. 61 | /// 62 | /// `num_accounts` refers to the amount of accounts in the database. 63 | /// 64 | /// `percentage_cold_transfer` ranges from 0 to 100 and indicates the proportion of transfers 65 | /// which should be sent to a fresh account. 66 | pub fn build( 67 | num_accounts: u64, 68 | workload_size: u64, 69 | percentage_cold_transfer: u8, 70 | op_limit: u64, 71 | threads: usize, 72 | distribution: StateItemDistribution, 73 | ) -> Vec { 74 | let thread_workload_size = workload_size / threads as u64; 75 | let num_accounts_step = num_accounts / threads as u64; 76 | 77 | (0..threads) 78 | .map(|i| { 79 | let start_account = num_accounts_step * i as u64; 80 | let end_account = if i == threads - 1 { 81 | num_accounts 82 | } else { 83 | num_accounts_step * (i as u64 + 1) 84 | }; 85 | TransferWorkload { 86 | num_accounts, 87 | workload_size: thread_workload_size, 88 | percentage_cold_transfer, 89 | ops_remaining: op_limit / threads as u64, 90 | distribution: Distribution::new(distribution, start_account, end_account), 91 | } 92 | }) 93 | .collect() 94 | } 95 | 96 | /// A transfer-like workload. 97 | pub struct TransferWorkload { 98 | /// The number of accounts in the system. 99 | pub num_accounts: u64, 100 | /// The size of the workload. 101 | pub workload_size: u64, 102 | /// The percentage of transfers to make to fresh accounts. 103 | pub percentage_cold_transfer: u8, 104 | /// The number of remaining operations before being considered 'done'. 
105 | pub ops_remaining: u64, 106 | /// The random distribution to use to sample state items. 107 | pub distribution: Distribution, 108 | } 109 | 110 | impl Workload for TransferWorkload { 111 | fn run_step(&mut self, transaction: &mut dyn Transaction) { 112 | let cold_sends = 113 | (self.workload_size as f64 * (self.percentage_cold_transfer as f64 / 100.0)) as u64; 114 | let warm_sends = self.workload_size - cold_sends; 115 | 116 | let mut rng = rand::thread_rng(); 117 | for i in 0..self.workload_size { 118 | let send_account = self.distribution.sample(&mut rng); 119 | let recv_account = if i < warm_sends { 120 | let mut r = self.distribution.sample(&mut rng); 121 | while r == send_account { 122 | r = self.distribution.sample(&mut rng); 123 | } 124 | r 125 | } else { 126 | // odds of two threads generating the same random account here are 127 | // incredibly low. 128 | rng.gen_range(self.num_accounts..u64::max_value()) 129 | }; 130 | 131 | let send_balance = decode_balance( 132 | &transaction 133 | .read(&encode_id(send_account)) 134 | .expect("account exists"), 135 | ); 136 | let recv_balance = transaction 137 | .read(&encode_id(recv_account)) 138 | .map_or(0, |v| decode_balance(&v)); 139 | 140 | let new_send_balance = if send_balance == 0 { 141 | 1000 // yay, free money. 142 | } else { 143 | send_balance - 1 144 | }; 145 | let new_recv_balance = recv_balance + 1; 146 | 147 | transaction.write( 148 | &encode_id(send_account), 149 | Some(&encode_balance(new_send_balance)), 150 | ); 151 | transaction.write( 152 | &encode_id(recv_account), 153 | Some(&encode_balance(new_recv_balance)), 154 | ); 155 | } 156 | 157 | self.ops_remaining = self.ops_remaining.saturating_sub(self.workload_size); 158 | } 159 | 160 | fn is_done(&self) -> bool { 161 | self.ops_remaining == 0 162 | } 163 | } 164 | -------------------------------------------------------------------------------- /core/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "nomt-core" 3 | description = "Core trie operations for NOMT" 4 | version = "1.0.0-preview" 5 | authors.workspace = true 6 | homepage.workspace = true 7 | repository.workspace = true 8 | edition.workspace = true 9 | license.workspace = true 10 | 11 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html 12 | 13 | [dependencies] 14 | bitvec.workspace = true 15 | hex.workspace = true 16 | ruint.workspace = true 17 | arrayvec.workspace = true 18 | borsh = { workspace = true, optional = true } 19 | blake3 = { workspace = true, optional = true } 20 | sha2 = { workspace = true, optional = true } 21 | serde = { workspace = true, optional = true } 22 | digest = { workspace = true } 23 | 24 | [dev-dependencies] 25 | blake3.workspace = true 26 | 27 | [features] 28 | default = ["std", "blake3-hasher", "sha2-hasher"] 29 | std = ["bitvec/std", "borsh?/std", "serde?/std"] 30 | borsh = ["dep:borsh"] 31 | blake3-hasher = ["dep:blake3"] 32 | sha2-hasher = ["dep:sha2"] 33 | serde = ["dep:serde", "serde/alloc"] 34 | -------------------------------------------------------------------------------- /core/src/hasher.rs: -------------------------------------------------------------------------------- 1 | //! Hashers (feature-gated) and utilities for implementing them. 2 | 3 | use crate::trie::{InternalData, LeafData, Node, NodeKind, TERMINATOR}; 4 | 5 | /// A trie node hash function specialized for 64 bytes of data. 
6 | /// 7 | /// Note that it is illegal for the produced hash to equal [0; 32], as this value is reserved 8 | /// for the terminator node. 9 | /// 10 | /// A node hasher should domain-separate internal and leaf nodes in some specific way. The 11 | /// recommended approach for binary hashes is to set the MSB to 0 or 1 depending on the node kind. 12 | /// However, for other kinds of hashes (e.g. Poseidon2 or other algebraic hashes), other labeling 13 | /// schemes may be required. 14 | pub trait NodeHasher { 15 | /// Hash a leaf. This should domain-separate the hash 16 | /// according to the node kind. 17 | fn hash_leaf(data: &LeafData) -> [u8; 32]; 18 | 19 | /// Hash an internal node. This should domain-separate 20 | /// the hash according to the node kind. 21 | fn hash_internal(data: &InternalData) -> [u8; 32]; 22 | 23 | /// Get the kind of the given node. 24 | fn node_kind(node: &Node) -> NodeKind; 25 | } 26 | 27 | /// A hasher for arbitrary-length values. 28 | pub trait ValueHasher { 29 | /// Hash an arbitrary-length value. 30 | fn hash_value(value: &[u8]) -> [u8; 32]; 31 | } 32 | 33 | /// Get the node kind, according to a most-significant bit labeling scheme. 34 | /// 35 | /// If the MSB is true, it's a leaf. If the node is empty, it's a [`TERMINATOR`]. Otherwise, it's 36 | /// an internal node. 37 | pub fn node_kind_by_msb(node: &Node) -> NodeKind { 38 | if node[0] >> 7 == 1 { 39 | NodeKind::Leaf 40 | } else if node == &TERMINATOR { 41 | NodeKind::Terminator 42 | } else { 43 | NodeKind::Internal 44 | } 45 | } 46 | 47 | /// Set the most-significant bit of the node. 48 | pub fn set_msb(node: &mut Node) { 49 | node[0] |= 0b10000000; 50 | } 51 | 52 | pub fn unset_msb(node: &mut Node) { 53 | node[0] &= 0b01111111; 54 | } 55 | 56 | /// A simple trait for representing binary hash functions. 57 | pub trait BinaryHash { 58 | /// Given a bit-string, produce a 32-bit hash. 59 | fn hash(input: &[u8]) -> [u8; 32]; 60 | 61 | /// An optional specialization of `hash` where there are two 32-byte inputs, left and right. 62 | fn hash2_32_concat(left: &[u8; 32], right: &[u8; 32]) -> [u8; 32] { 63 | let mut buf = [0u8; 64]; 64 | buf[0..32].copy_from_slice(left); 65 | buf[32..64].copy_from_slice(right); 66 | Self::hash(&buf) 67 | } 68 | } 69 | 70 | /// A node and value hasher constructed from a simple binary hasher. 71 | /// 72 | /// This implements a [`ValueHasher`] and [`NodeHasher`] where the node kind is tagged by setting 73 | /// or unsetting the MSB of the hash value. 74 | /// 75 | /// The binary hash wrapped by this structure must behave approximately like a random oracle over 76 | /// the space 2^256, i.e. all 256 bit outputs are valid and inputs are uniformly distributed. 77 | /// 78 | /// Functions like Sha2/Blake3/Keccak/Groestl all meet these criteria. 
79 | pub struct BinaryHasher(core::marker::PhantomData); 80 | 81 | impl ValueHasher for BinaryHasher { 82 | fn hash_value(value: &[u8]) -> [u8; 32] { 83 | H::hash(value) 84 | } 85 | } 86 | 87 | impl NodeHasher for BinaryHasher { 88 | fn hash_leaf(data: &LeafData) -> [u8; 32] { 89 | let mut h = H::hash2_32_concat(&data.key_path, &data.value_hash); 90 | set_msb(&mut h); 91 | h 92 | } 93 | 94 | fn hash_internal(data: &InternalData) -> [u8; 32] { 95 | let mut h = H::hash2_32_concat(&data.left, &data.right); 96 | unset_msb(&mut h); 97 | h 98 | } 99 | 100 | fn node_kind(node: &Node) -> NodeKind { 101 | node_kind_by_msb(node) 102 | } 103 | } 104 | 105 | /// Blanket implementation for all implementations of `Digest` 106 | impl + Send + Sync> BinaryHash for H { 107 | fn hash(input: &[u8]) -> [u8; 32] { 108 | H::digest(input).into() 109 | } 110 | } 111 | 112 | #[cfg(any(feature = "blake3-hasher", test))] 113 | pub use blake3::Blake3Hasher; 114 | 115 | /// A node hasher making use of blake3. 116 | #[cfg(any(feature = "blake3-hasher", test))] 117 | pub mod blake3 { 118 | use super::{BinaryHash, BinaryHasher}; 119 | 120 | /// A [`BinaryHash`] implementation for Blake3. 121 | pub struct Blake3BinaryHasher; 122 | 123 | /// A wrapper around Blake3 for use in NOMT. 124 | pub type Blake3Hasher = BinaryHasher; 125 | 126 | impl BinaryHash for Blake3BinaryHasher { 127 | fn hash(value: &[u8]) -> [u8; 32] { 128 | blake3::hash(value).into() 129 | } 130 | 131 | fn hash2_32_concat(left: &[u8; 32], right: &[u8; 32]) -> [u8; 32] { 132 | let mut hasher = blake3::Hasher::new(); 133 | hasher.update(left); 134 | hasher.update(right); 135 | hasher.finalize().into() 136 | } 137 | } 138 | } 139 | 140 | #[cfg(feature = "sha2-hasher")] 141 | pub use sha2::Sha2Hasher; 142 | 143 | /// A node and value hasher making use of sha2-256. 144 | #[cfg(feature = "sha2-hasher")] 145 | pub mod sha2 { 146 | use super::{BinaryHash, BinaryHasher}; 147 | use sha2::{Digest, Sha256}; 148 | 149 | /// A [`BinaryHash`] implementation for Sha2. 150 | pub struct Sha2BinaryHasher; 151 | 152 | /// A wrapper around sha2-256 for use in NOMT. 153 | pub type Sha2Hasher = BinaryHasher; 154 | 155 | impl BinaryHash for Sha2BinaryHasher { 156 | fn hash(value: &[u8]) -> [u8; 32] { 157 | let mut hasher = Sha256::new(); 158 | hasher.update(value); 159 | hasher.finalize().into() 160 | } 161 | 162 | fn hash2_32_concat(left: &[u8; 32], right: &[u8; 32]) -> [u8; 32] { 163 | let mut hasher = Sha256::new(); 164 | hasher.update(left); 165 | hasher.update(right); 166 | hasher.finalize().into() 167 | } 168 | } 169 | } 170 | -------------------------------------------------------------------------------- /core/src/lib.rs: -------------------------------------------------------------------------------- 1 | //! Core operations and types within the Nearly Optimal Merkle Trie. 2 | //! 3 | //! This crate defines the schema and basic operations over the merkle trie in a backend-agnostic 4 | //! manner. 5 | //! 6 | //! The core types and proof verification routines of this crate do not require the 7 | //! standard library, but do require Rust's alloc crate. 
8 | 9 | #![cfg_attr(all(not(feature = "std"), not(test)), no_std)] 10 | 11 | extern crate alloc; 12 | 13 | pub mod hasher; 14 | pub mod page; 15 | pub mod page_id; 16 | pub mod proof; 17 | pub mod trie; 18 | pub mod trie_pos; 19 | pub mod update; 20 | pub mod witness; 21 | -------------------------------------------------------------------------------- /core/src/page.rs: -------------------------------------------------------------------------------- 1 | //! Pages: efficient node storage. 2 | //! 3 | //! Because each node in the trie is exactly 32 bytes, we can easily pack groups of nodes into 4 | //! a predictable paged representation regardless of the information in the trie. 5 | //! 6 | //! Each page is 4096 bytes and stores up to 126 nodes plus a unique 32-byte page identifier, 7 | //! with 32 bytes left over. 8 | //! 9 | //! A page stores a rootless sub-tree with depth 6: that is, it stores up to 10 | //! 2 + 4 + 8 + 16 + 32 + 64 nodes at known positions. 11 | //! Semantically, all nodes within the page should descend from the layer above, and the 12 | //! top two nodes are expected to be siblings. Each page logically has up to 64 child pages, which 13 | //! correspond to the rootless sub-tree descending from each of the 64 child nodes on the bottom 14 | //! layer. 15 | //! 16 | //! Every page is referred to by a unique ID, given by `parent_id * 2^6 + child_index + 1`, where 17 | //! the root page has ID `0x00..00`. The child index ranges from 0 to 63 and therefore can be 18 | //! represented as a 6 bit string. This module exposes functions for manipulating page IDs. 19 | //! 20 | //! The [`RawPage`] structure wraps a borrowed slice of 32-byte data and treats it as a page. 21 | 22 | /// Depth of the rootless sub-binary tree stored in a page 23 | pub const DEPTH: usize = 6; 24 | 25 | // Total number of nodes stored in one Page. It depends on the `DEPTH` 26 | // of the rootless sub-binary tree stored in a page following this formula: 27 | // (2^(DEPTH + 1)) - 2 28 | pub const NODES_PER_PAGE: usize = (1 << DEPTH + 1) - 2; 29 | 30 | /// A raw, unsized page data slice. 31 | pub type RawPage = [[u8; 32]]; 32 | -------------------------------------------------------------------------------- /core/src/proof/mod.rs: -------------------------------------------------------------------------------- 1 | //! Trie proofs and proof verification. 2 | //! 3 | //! The Merkle Trie defined in NOMT is an authenticated data structure, which means that it permits 4 | //! efficient proving against the root. This module exposes types and functions necessary for 5 | //! handling these kinds of proofs. 6 | //! 7 | //! Using the types and functions exposed from this module, you can verify the value of a single 8 | //! key within the trie ([`PathProof`]), the values of multiple keys ([`MultiProof`]), or the result 9 | //! of updating a trie with a set of changes ([`verify_update`]). 10 | 11 | pub use multi_proof::{ 12 | verify as verify_multi_proof, verify_update as verify_multi_proof_update, MultiPathProof, 13 | MultiProof, MultiProofVerificationError, VerifiedMultiProof, 14 | }; 15 | pub use path_proof::{ 16 | verify_update, KeyOutOfScope, PathProof, PathProofTerminal, PathProofVerificationError, 17 | PathUpdate, VerifiedPathProof, VerifyUpdateError, 18 | }; 19 | 20 | mod multi_proof; 21 | mod path_proof; 22 | -------------------------------------------------------------------------------- /core/src/trie.rs: -------------------------------------------------------------------------------- 1 | //! 
This module defines the types of a binary merkle trie, generalized over a 256 bit hash function. 2 | //! All lookup paths in the trie are 256 bits. 3 | //! 4 | //! All nodes are 256 bits. There are three kinds of nodes. 5 | //! 1. Internal nodes, which each have two children. The value of an internal node is 6 | //! given by hashing the concatenation of the two child nodes and setting the MSB to 0. 7 | //! 2. Leaf nodes, which have zero children. The value of a leaf node is given by hashing 8 | //! the concatenation of the 256-bit lookup path and the hash of the value stored at the leaf, 9 | //! and setting the MSB to 1. 10 | //! 3. [`TERMINATOR`] nodes, which have the special value of all 0s. These nodes have no children 11 | //! and serve as a stand-in for an empty sub-trie at any height. Terminator nodes enable the 12 | //! trie to be tractably represented. 13 | //! 14 | //! All node preimages are 512 bits. 15 | 16 | use crate::hasher::NodeHasher; 17 | 18 | /// A node in the binary trie. In this schema, it is always 256 bits and is the hash of either 19 | /// a [`LeafData`] or [`InternalData`], or zeroed if it's a [`TERMINATOR`]. 20 | /// 21 | /// [`Node`]s are labeled by the [`NodeHasher`] used to indicate whether they are leaves or internal 22 | /// nodes. Typically, this is done by setting the MSB. 23 | pub type Node = [u8; 32]; 24 | 25 | /// The path to a key. All paths have a 256 bit fixed length. 26 | pub type KeyPath = [u8; 32]; 27 | 28 | /// The hash of a value. In this schema, it is always 256 bits. 29 | pub type ValueHash = [u8; 32]; 30 | 31 | /// The terminator hash is a special node hash value denoting an empty sub-tree. 32 | /// Concretely, when this appears at a given location in the trie, 33 | /// it implies that no key with a path beginning with the location has a value. 34 | /// 35 | /// This value may appear at any height. 36 | pub const TERMINATOR: Node = [0u8; 32]; 37 | 38 | /// Whether the node hash indicates the node is a leaf. 39 | pub fn is_leaf<H: NodeHasher>(hash: &Node) -> bool { 40 | H::node_kind(hash) == NodeKind::Leaf 41 | } 42 | 43 | /// Whether the node hash indicates the node is an internal node. 44 | pub fn is_internal<H: NodeHasher>(hash: &Node) -> bool { 45 | H::node_kind(hash) == NodeKind::Internal 46 | } 47 | 48 | /// Whether the node holds the special [`TERMINATOR`] value. 49 | pub fn is_terminator<H: NodeHasher>(hash: &Node) -> bool { 50 | H::node_kind(hash) == NodeKind::Terminator 51 | } 52 | 53 | /// The kind of a node. 54 | #[derive(Debug, Clone, Copy, PartialEq, Eq)] 55 | pub enum NodeKind { 56 | /// A terminator node indicates an empty sub-trie. 57 | Terminator, 58 | /// A leaf node indicates a sub-trie with a single leaf. 59 | Leaf, 60 | /// An internal node indicates at least two values. 61 | Internal, 62 | } 63 | 64 | impl NodeKind { 65 | /// Get the kind of the provided node. 66 | pub fn of<H: NodeHasher>(node: &Node) -> Self { 67 | H::node_kind(node) 68 | } 69 | } 70 | 71 | /// The data of an internal (branch) node. 72 | #[derive(Debug, Clone, PartialEq, Eq)] 73 | pub struct InternalData { 74 | /// The hash of the left child of this node. 75 | pub left: Node, 76 | /// The hash of the right child of this node. 77 | pub right: Node, 78 | } 79 | 80 | /// The data of a leaf node.
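// A minimal sketch (illustrative only; assumes the `blake3-hasher` feature and made-up
// key/value hashes) of how the labeling rules above surface through a `NodeHasher`,
// using the `LeafData` type defined just below:
//
//     use crate::hasher::{Blake3Hasher, NodeHasher};
//     let leaf = LeafData { key_path: [1u8; 32], value_hash: [2u8; 32] };
//     let node = Blake3Hasher::hash_leaf(&leaf);
//     assert!(is_leaf::<Blake3Hasher>(&node));
//     assert!(!is_terminator::<Blake3Hasher>(&node));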
81 | #[derive(Debug, Default, Clone, PartialEq, Eq)] 82 | #[cfg_attr( 83 | feature = "borsh", 84 | derive(borsh::BorshDeserialize, borsh::BorshSerialize) 85 | )] 86 | #[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))] 87 | pub struct LeafData { 88 | /// The total path to this value within the trie. 89 | /// 90 | /// The actual location of this node may be anywhere along this path, depending on the other 91 | /// data within the trie. 92 | pub key_path: KeyPath, 93 | /// The hash of the value carried in this leaf. 94 | pub value_hash: ValueHash, 95 | } 96 | -------------------------------------------------------------------------------- /core/src/witness.rs: -------------------------------------------------------------------------------- 1 | //! Witnesses of NOMT sessions. These types encapsulate entire sets of reads and writes. 2 | 3 | use crate::{ 4 | proof::PathProof, 5 | trie::{KeyPath, ValueHash}, 6 | trie_pos::TriePosition, 7 | }; 8 | 9 | #[cfg(not(feature = "std"))] 10 | use alloc::vec::Vec; 11 | 12 | /// A witness that can be used to prove the correctness of state trie retrievals and updates. 13 | /// 14 | /// Expected to be serializable. 15 | #[cfg_attr( 16 | feature = "borsh", 17 | derive(borsh::BorshDeserialize, borsh::BorshSerialize) 18 | )] 19 | #[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))] 20 | pub struct Witness { 21 | /// Various paths down the trie used as part of this witness. 22 | /// Note that the paths are not necessarily in lexicographic order. 23 | pub path_proofs: Vec<WitnessedPath>, 24 | /// The operations witnessed by the paths. 25 | pub operations: WitnessedOperations, 26 | } 27 | 28 | /// Operations provable by a corresponding witness. 29 | #[cfg_attr( 30 | feature = "borsh", 31 | derive(borsh::BorshDeserialize, borsh::BorshSerialize) 32 | )] 33 | #[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))] 34 | pub struct WitnessedOperations { 35 | /// Read operations. 36 | pub reads: Vec<WitnessedRead>, 37 | /// Write operations. 38 | pub writes: Vec<WitnessedWrite>, 39 | } 40 | 41 | /// A path observed in the witness. 42 | #[cfg_attr( 43 | feature = "borsh", 44 | derive(borsh::BorshDeserialize, borsh::BorshSerialize) 45 | )] 46 | #[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))] 47 | pub struct WitnessedPath { 48 | /// Proof of a query path along the trie. 49 | pub inner: PathProof, 50 | /// The query path itself. 51 | pub path: TriePosition, 52 | } 53 | 54 | /// A witness of a read value. 55 | #[cfg_attr( 56 | feature = "borsh", 57 | derive(borsh::BorshDeserialize, borsh::BorshSerialize) 58 | )] 59 | #[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))] 60 | pub struct WitnessedRead { 61 | /// The key of the read value. 62 | pub key: KeyPath, 63 | /// The hash of the value witnessed. None means no value. 64 | pub value: Option<ValueHash>, 65 | /// The index of the path in the corresponding witness. 66 | pub path_index: usize, 67 | } 68 | 69 | /// A witness of a write operation. 70 | #[cfg_attr( 71 | feature = "borsh", 72 | derive(borsh::BorshDeserialize, borsh::BorshSerialize) 73 | )] 74 | #[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))] 75 | pub struct WitnessedWrite { 76 | /// The key of the written value. 77 | pub key: KeyPath, 78 | /// The hash of the written value. `None` means "delete". 79 | pub value: Option<ValueHash>, 80 | /// The index of the path in the corresponding witness.
81 | pub path_index: usize, 82 | } 83 | -------------------------------------------------------------------------------- /docs/CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | ## Contribute to NOMT 2 | 3 | We license all code under MIT / Apache2.0 licenses. The maintainers reserve the right to refuse contributions and reject issues, even when useful. 4 | 5 | ## Formatting 6 | 7 | We use spaces for indentation and adhere to the vanilla `rustfmt` style. 8 | 9 | Format your code using `rustfmt`: 10 | 1. `cargo install cargo-fmt` 11 | 2. `cargo fmt --all` 12 | 13 | ## Documentation Policy 14 | 15 | Well-commented code is readable code. We require all `pub` and `pub(crate)` items to be annotated with doc-strings. This leads to much better auto-generated documentation pages using `rustdoc` and a better experience for library users. 16 | 17 | Public modules and crates should begin with doc-strings which explain the purpose of the module and crate and assist the reader in determining where to proceed. 18 | 19 | ## Pull Requests and Tests 20 | 21 | We require that the entire test-suite passes for every merged PR. A PR is the responsibility of the author. In submitting a PR, you are consenting to become responsible for it and continually improve, update, and request reviews for it until merged. Stale PRs are not the responsibility of the maintainers and may be closed. 22 | 23 | ## Code of Conduct 24 | 25 | We ask that all contributors maintain a respectful attitude towards each other. -------------------------------------------------------------------------------- /docs/images/binary_merkle_patricia_tree.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thrumdev/nomt/7f6db113d7bb081e27c4d5bf57df20943280b0f9/docs/images/binary_merkle_patricia_tree.png -------------------------------------------------------------------------------- /docs/images/nomt_number_rule.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thrumdev/nomt/7f6db113d7bb081e27c4d5bf57df20943280b0f9/docs/images/nomt_number_rule.png -------------------------------------------------------------------------------- /docs/images/nomt_pages.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thrumdev/nomt/7f6db113d7bb081e27c4d5bf57df20943280b0f9/docs/images/nomt_pages.jpg -------------------------------------------------------------------------------- /docs/images/nomt_put.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thrumdev/nomt/7f6db113d7bb081e27c4d5bf57df20943280b0f9/docs/images/nomt_put.png -------------------------------------------------------------------------------- /examples/commit_batch/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "commit-batch" 3 | version = "0.1.0" 4 | authors.workspace = true 5 | homepage.workspace = true 6 | repository.workspace = true 7 | edition.workspace = true 8 | license.workspace = true 9 | 10 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html 11 | 12 | [dependencies] 13 | nomt = { path = "../../nomt" } 14 | anyhow = "1.0.81" 15 | sha2 = "0.10.6" 16 | -------------------------------------------------------------------------------- 
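The witness types above (`Witness`, `WitnessedRead`, `WitnessedWrite`) derive Borsh and Serde traits behind feature flags, so a witness produced by a session can be shipped to an out-of-process verifier. A minimal round-trip sketch, assuming the `borsh` feature of `nomt-core` is enabled and a borsh 1.x dependency is available:

    use nomt_core::witness::Witness;

    fn roundtrip(witness: &Witness) -> std::io::Result<Witness> {
        // Serialize for transport to the verifier...
        let bytes = borsh::to_vec(witness)?;
        // ...and decode it again on the other side.
        borsh::from_slice::<Witness>(&bytes)
    }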
/examples/commit_batch/src/lib.rs: -------------------------------------------------------------------------------- 1 | use anyhow::Result; 2 | use nomt::{ 3 | hasher::Blake3Hasher, KeyReadWrite, Nomt, Options, Root, SessionParams, Witness, WitnessMode, 4 | }; 5 | use sha2::Digest; 6 | 7 | const NOMT_DB_FOLDER: &str = "nomt_db"; 8 | 9 | pub struct NomtDB; 10 | 11 | impl NomtDB { 12 | pub fn commit_batch() -> Result<(Root, Root, Witness)> { 13 | // Define the options used to open NOMT 14 | let mut opts = Options::new(); 15 | opts.path(NOMT_DB_FOLDER); 16 | opts.commit_concurrency(1); 17 | 18 | // Open the NOMT database; it will create the folder if it does not exist 19 | let nomt = Nomt::<Blake3Hasher>::open(opts)?; 20 | 21 | // Create a new Session object 22 | // 23 | // During a session, the backend is responsible for returning read keys 24 | // and receiving hints about future writes 25 | // 26 | // Writes do not occur immediately; instead, 27 | // they are cached and applied all at once later on 28 | let session = 29 | nomt.begin_session(SessionParams::default().witness_mode(WitnessMode::read_write())); 30 | 31 | // Here we will move the data saved under b"key1" to b"key2", deleting it from b"key1" 32 | // 33 | // NOMT expects keys to be uniformly distributed across the key space 34 | let key_path_1 = sha2::Sha256::digest(b"key1").into(); 35 | let key_path_2 = sha2::Sha256::digest(b"key2").into(); 36 | 37 | // First, read what is under key_path_1 38 | // 39 | // `read` will immediately return the value present in the database 40 | let value = session.read(key_path_1)?; 41 | 42 | // We are going to perform writes on both key-paths, so we have NOMT warm up the on-disk 43 | // data for both. 44 | session.warm_up(key_path_1); 45 | session.warm_up(key_path_2); 46 | 47 | // Retrieve the previous value of the root before committing changes 48 | let prev_root = nomt.root(); 49 | 50 | // To commit the batch to the backend we need to collect every 51 | // performed action into a vector where items are ordered by the key_path 52 | let mut actual_access: Vec<_> = vec![ 53 | (key_path_1, KeyReadWrite::ReadThenWrite(value.clone(), None)), 54 | (key_path_2, KeyReadWrite::Write(value)), 55 | ]; 56 | actual_access.sort_by_key(|(k, _)| *k); 57 | 58 | // The final step in handling a session involves committing all changes 59 | // to update the trie structure and obtaining the new root of the trie, 60 | // along with a witness and the witnessed operations. 61 | let mut finished = session.finish(actual_access).unwrap(); 62 | 63 | // This field is set because the finished session was configured with 64 | // `WitnessMode::read_write`.
65 | let witness = finished.take_witness().unwrap(); 66 | let root = finished.root(); 67 | finished.commit(&nomt)?; 68 | 69 | Ok((prev_root, root, witness)) 70 | } 71 | } 72 | -------------------------------------------------------------------------------- /examples/commit_batch/src/main.rs: -------------------------------------------------------------------------------- 1 | fn main() -> anyhow::Result<()> { 2 | commit_batch::NomtDB::commit_batch().map(|_| ()) 3 | } 4 | -------------------------------------------------------------------------------- /examples/read_value/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "read_value" 3 | version = "0.1.0" 4 | authors.workspace = true 5 | homepage.workspace = true 6 | repository.workspace = true 7 | edition.workspace = true 8 | license.workspace = true 9 | 10 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html 11 | 12 | [dependencies] 13 | nomt = { path = "../../nomt" } 14 | anyhow = "1.0.81" 15 | sha2 = "0.10.6" 16 | -------------------------------------------------------------------------------- /examples/read_value/src/main.rs: -------------------------------------------------------------------------------- 1 | use anyhow::Result; 2 | use nomt::{hasher::Blake3Hasher, KeyReadWrite, Nomt, Options, SessionParams, WitnessMode}; 3 | use sha2::Digest; 4 | 5 | const NOMT_DB_FOLDER: &str = "nomt_db"; 6 | 7 | fn main() -> Result<()> { 8 | // Define the options used to open NOMT 9 | let mut opts = Options::new(); 10 | opts.path(NOMT_DB_FOLDER); 11 | opts.commit_concurrency(1); 12 | 13 | // Open the NOMT database. This will create the folder if it does not exist 14 | let nomt = Nomt::<Blake3Hasher>::open(opts)?; 15 | 16 | // Instantiate a new Session object to handle read and write operations 17 | // and generate a Witness later on 18 | let session = 19 | nomt.begin_session(SessionParams::default().witness_mode(WitnessMode::read_write())); 20 | 21 | // Reading a key from the database 22 | let key_path = sha2::Sha256::digest(b"key").into(); 23 | let value = session.read(key_path)?; 24 | 25 | // Even though this key is only being read, we ask NOMT to warm up the on-disk data because 26 | // we will prove the read.
27 | session.warm_up(key_path); 28 | 29 | let mut finished = session 30 | .finish(vec![(key_path, KeyReadWrite::Read(value))]) 31 | .unwrap(); 32 | let _witness = finished.take_witness(); 33 | finished.commit(&nomt)?; 34 | 35 | Ok(()) 36 | } 37 | -------------------------------------------------------------------------------- /examples/witness_verification/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "witness_verification" 3 | version = "0.1.0" 4 | authors.workspace = true 5 | homepage.workspace = true 6 | repository.workspace = true 7 | edition.workspace = true 8 | license.workspace = true 9 | 10 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html 11 | 12 | [dependencies] 13 | nomt-core = { path = "../../core" } 14 | commit-batch = { path = "../commit_batch" } 15 | anyhow = "1.0.81" 16 | blake3 = "1.5.1" 17 | -------------------------------------------------------------------------------- /examples/witness_verification/src/main.rs: -------------------------------------------------------------------------------- 1 | use anyhow::Result; 2 | use nomt_core::{hasher::Blake3Hasher, proof, trie::LeafData}; 3 | 4 | fn main() -> Result<()> { 5 | // The witness produced in the example `commit_batch` will be used 6 | let (prev_root, new_root, witness) = commit_batch::NomtDB::commit_batch().unwrap(); 7 | 8 | let mut updates = Vec::new(); 9 | 10 | // A witness is composed of multiple WitnessedPath objects, 11 | // which store all the necessary information to verify the operations 12 | // performed on the same path 13 | for (i, witnessed_path) in witness.path_proofs.iter().enumerate() { 14 | // Constructing the verified operations 15 | let verified = witnessed_path 16 | .inner 17 | .verify::<Blake3Hasher>(&witnessed_path.path.path(), prev_root.into_inner()) 18 | .unwrap(); 19 | 20 | // Among all read operations performed, the ones that interact 21 | // with the current verified path are selected 22 | // 23 | // Each witnessed operation contains an index to the path it needs to be verified against 24 | // 25 | // This information could already be known if we committed the batch initially, 26 | // and thus, the witnessed field could be discarded entirely. 27 | for read in witness 28 | .operations 29 | .reads 30 | .iter() 31 | .skip_while(|r| r.path_index != i) 32 | .take_while(|r| r.path_index == i) 33 | { 34 | match read.value { 35 | // Check for non-existence if the return value was None 36 | None => assert!(verified.confirm_nonexistence(&read.key).unwrap()), 37 | // Verify the correctness of the returned value when it is Some(_) 38 | Some(value_hash) => { 39 | let leaf = LeafData { 40 | key_path: read.key, 41 | value_hash, 42 | }; 43 | assert!(verified.confirm_value(&leaf).unwrap()); 44 | } 45 | } 46 | } 47 | 48 | // The correctness of write operations cannot be verified one by one like reads. 49 | // Write operations need to be collected. 50 | // All writes that have worked on shared prefixes, 51 | // such as the witnessed_path, need to be bundled together.
52 | // Later, it needs to be verified that all these writes bring 53 | // the new trie to the expected state 54 | let mut write_ops = Vec::new(); 55 | for write in witness 56 | .operations 57 | .writes 58 | .iter() 59 | .skip_while(|r| r.path_index != i) 60 | .take_while(|r| r.path_index == i) 61 | { 62 | write_ops.push((write.key, write.value)); 63 | } 64 | 65 | if !write_ops.is_empty() { 66 | updates.push(proof::PathUpdate { 67 | inner: verified, 68 | ops: write_ops, 69 | }); 70 | } 71 | } 72 | 73 | assert_eq!( 74 | proof::verify_update::(prev_root.into_inner(), &updates).unwrap(), 75 | new_root.into_inner(), 76 | ); 77 | 78 | Ok(()) 79 | } 80 | -------------------------------------------------------------------------------- /fuzz/.gitignore: -------------------------------------------------------------------------------- 1 | target 2 | corpus 3 | artifacts 4 | coverage 5 | -------------------------------------------------------------------------------- /fuzz/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "nomt-fuzz" 3 | version = "0.0.0" 4 | publish = false 5 | edition = "2021" 6 | 7 | [package.metadata] 8 | cargo-fuzz = true 9 | 10 | [dependencies] 11 | libfuzzer-sys = "0.4" 12 | arbitrary = { version = "1.3.1", features = ["derive"] } 13 | tempfile.workspace = true 14 | bitvec.workspace = true 15 | 16 | [dependencies.nomt] 17 | path = "../nomt" 18 | features = ["fuzz"] 19 | 20 | [[bin]] 21 | name = "api_surface" 22 | path = "fuzz_targets/api_surface.rs" 23 | test = false 24 | doc = false 25 | bench = false 26 | 27 | [[bin]] 28 | name = "bitwise_memcpy" 29 | path = "fuzz_targets/bitwise_memcpy.rs" 30 | test = false 31 | doc = false 32 | bench = false 33 | 34 | [[bin]] 35 | name = "separate" 36 | path = "fuzz_targets/separate.rs" 37 | test = false 38 | doc = false 39 | bench = false 40 | 41 | [[bin]] 42 | name = "prefix_len" 43 | path = "fuzz_targets/prefix_len.rs" 44 | test = false 45 | doc = false 46 | bench = false 47 | 48 | [[bin]] 49 | name = "separator_len" 50 | path = "fuzz_targets/separator_len.rs" 51 | test = false 52 | doc = false 53 | bench = false 54 | 55 | [[bin]] 56 | name = "reconstruct_key" 57 | path = "fuzz_targets/reconstruct_key.rs" 58 | test = false 59 | doc = false 60 | bench = false 61 | -------------------------------------------------------------------------------- /fuzz/fuzz_targets/bitwise_memcpy.rs: -------------------------------------------------------------------------------- 1 | #![no_main] 2 | 3 | use arbitrary::Arbitrary; 4 | use bitvec::{order::Msb0, view::BitView}; 5 | use libfuzzer_sys::fuzz_target; 6 | use nomt::beatree::bitwise_memcpy; 7 | 8 | const MAX_BYTES_LEN: usize = 1 << 12; // 4KiB 9 | 10 | fuzz_target!(|run: Run| { 11 | let Run { 12 | source, 13 | mut destination, 14 | } = run; 15 | 16 | let expected = reference_bitwise_memcpy(&source, &destination); 17 | 18 | bitwise_memcpy( 19 | &mut destination.bytes, 20 | destination.bit_start, 21 | &source.bytes, 22 | source.bit_start, 23 | source.bit_len, 24 | ); 25 | 26 | assert_eq!(expected, destination.bytes); 27 | }); 28 | 29 | #[derive(Debug)] 30 | struct Run { 31 | source: Source, 32 | destination: Destination, 33 | } 34 | 35 | #[derive(Debug)] 36 | struct Source { 37 | bit_start: usize, 38 | bit_len: usize, 39 | bytes: Vec, 40 | } 41 | 42 | #[derive(Debug)] 43 | struct Destination { 44 | bit_start: usize, 45 | bytes: Vec, 46 | } 47 | 48 | impl<'a> Arbitrary<'a> for Run { 49 | fn arbitrary(input: &mut 
arbitrary::Unstructured<'a>) -> arbitrary::Result { 50 | let source = Source::arbitrary(input)?; 51 | 52 | // Destination must be long enough to store the source. 53 | let destination_bit_start = input.int_in_range(0..=7)?; 54 | let min_destination_len = (destination_bit_start + source.bit_len + 7) / 8; 55 | let destination_len = input.int_in_range(min_destination_len..=MAX_BYTES_LEN)?; 56 | let mut destination_bytes = vec![0; destination_len]; 57 | input.fill_buffer(&mut destination_bytes)?; 58 | 59 | let run = Run { 60 | source, 61 | destination: Destination { 62 | bit_start: destination_bit_start, 63 | bytes: destination_bytes, 64 | }, 65 | }; 66 | 67 | Ok(run) 68 | } 69 | } 70 | 71 | impl<'a> Arbitrary<'a> for Source { 72 | fn arbitrary(input: &mut arbitrary::Unstructured<'a>) -> arbitrary::Result { 73 | let bytes_len = (input.int_in_range(0..=MAX_BYTES_LEN)? as usize).next_multiple_of(8); 74 | 75 | let mut bytes: Vec = vec![0; bytes_len]; 76 | input.fill_buffer(&mut bytes)?; 77 | 78 | let bit_start = if bytes_len != 0 { 79 | input.int_in_range(0..=7)? 80 | } else { 81 | 0 82 | }; 83 | 84 | let bit_len = if bytes_len > 0 { 85 | // `bitwise_memcpy` requires to the source length to be the smallest length, 86 | // multiple of 8 bytes that the contain the source bits. 87 | let min_bit_len = ((bytes_len - 8) * 8).saturating_sub(bit_start) + 1; 88 | let max_bit_len = (bytes_len * 8) - bit_start; 89 | input.int_in_range(min_bit_len..=max_bit_len)? 90 | } else { 91 | 0 92 | }; 93 | 94 | Ok(Self { 95 | bit_start, 96 | bit_len, 97 | bytes, 98 | }) 99 | } 100 | } 101 | 102 | fn reference_bitwise_memcpy(source: &Source, destination: &Destination) -> Vec { 103 | let mut destination_bytes = destination.bytes.clone(); 104 | 105 | destination_bytes.view_bits_mut::()[destination.bit_start..][..source.bit_len] 106 | .copy_from_bitslice( 107 | &source.bytes.view_bits::()[source.bit_start..][..source.bit_len], 108 | ); 109 | 110 | destination_bytes 111 | } 112 | -------------------------------------------------------------------------------- /fuzz/fuzz_targets/common/mod.rs: -------------------------------------------------------------------------------- 1 | use arbitrary::Arbitrary; 2 | use bitvec::{order::Msb0, view::BitView}; 3 | 4 | #[derive(Debug)] 5 | pub struct Run { 6 | pub prefix_bit_len: usize, 7 | pub a: [u8; 32], 8 | pub b: [u8; 32], 9 | } 10 | 11 | impl<'a> Arbitrary<'a> for Run { 12 | fn arbitrary(input: &mut arbitrary::Unstructured<'a>) -> arbitrary::Result { 13 | let prefix_bit_len = input.int_in_range(0..=255)?; 14 | let mut a = [0; 32]; 15 | let mut b = [0; 32]; 16 | input.fill_buffer(&mut a)?; 17 | input.fill_buffer(&mut b)?; 18 | b.view_bits_mut::()[0..prefix_bit_len] 19 | .copy_from_bitslice(&a.view_bits::()[0..prefix_bit_len]); 20 | 21 | let effective_prefix_bit_len = a 22 | .view_bits::() 23 | .iter() 24 | .zip(b.view_bits::().iter()) 25 | .take_while(|(a, b)| a == b) 26 | .count(); 27 | 28 | if effective_prefix_bit_len != prefix_bit_len { 29 | Err(arbitrary::Error::IncorrectFormat) 30 | } else { 31 | Ok(Self { 32 | prefix_bit_len, 33 | a, 34 | b, 35 | }) 36 | } 37 | } 38 | } 39 | -------------------------------------------------------------------------------- /fuzz/fuzz_targets/prefix_len.rs: -------------------------------------------------------------------------------- 1 | #![no_main] 2 | 3 | mod common; 4 | 5 | use common::Run; 6 | use libfuzzer_sys::fuzz_target; 7 | use nomt::beatree::prefix_len; 8 | 9 | fuzz_target!(|run: Run| { 10 | let Run { 11 | prefix_bit_len, 12 | a, 13 
| b, 14 | } = run; 15 | 16 | assert_eq!(prefix_bit_len, prefix_len(&a, &b)); 17 | }); 18 | -------------------------------------------------------------------------------- /fuzz/fuzz_targets/reconstruct_key.rs: -------------------------------------------------------------------------------- 1 | #![no_main] 2 | 3 | use arbitrary::Arbitrary; 4 | use bitvec::{order::Msb0, view::BitView}; 5 | use libfuzzer_sys::fuzz_target; 6 | use nomt::beatree::reconstruct_key; 7 | 8 | fuzz_target!(|run: Run| { 9 | let Run { 10 | raw_separator, 11 | raw_prefix, 12 | } = run; 13 | 14 | let expected = reference_reconstruct_key(&raw_prefix, &raw_separator); 15 | 16 | let maybe_prefix = if raw_prefix.bit_len == 0 { 17 | None 18 | } else { 19 | Some((&raw_prefix.bytes[..], raw_prefix.bit_len)) 20 | }; 21 | 22 | let raw_separator = ( 23 | &raw_separator.bytes[..], 24 | raw_separator.bit_start, 25 | raw_separator.bit_len, 26 | ); 27 | 28 | assert_eq!(expected, reconstruct_key(maybe_prefix, raw_separator)); 29 | }); 30 | 31 | #[derive(Debug)] 32 | struct Run { 33 | raw_separator: RawSeparator, 34 | raw_prefix: RawPrefix, 35 | } 36 | 37 | #[derive(Debug)] 38 | struct RawSeparator { 39 | bit_start: usize, 40 | bit_len: usize, 41 | bytes: Vec, 42 | } 43 | 44 | #[derive(Debug)] 45 | struct RawPrefix { 46 | bit_len: usize, 47 | bytes: Vec, 48 | } 49 | 50 | impl<'a> Arbitrary<'a> for Run { 51 | fn arbitrary(input: &mut arbitrary::Unstructured<'a>) -> arbitrary::Result { 52 | let raw_separator = RawSeparator::arbitrary(input)?; 53 | 54 | let raw_prefix_bit_len = input.int_in_range(0..=(256 - raw_separator.bit_len))?; 55 | let raw_prefix_min_byte_len = (raw_prefix_bit_len + 7) / 8; 56 | let raw_prefix_byte_len = input.int_in_range(raw_prefix_min_byte_len..=(1 << 12))?; 57 | let mut raw_prefix_bytes = vec![0; raw_prefix_byte_len]; 58 | input.fill_buffer(&mut raw_prefix_bytes)?; 59 | 60 | let run = Run { 61 | raw_separator, 62 | raw_prefix: RawPrefix { 63 | bit_len: raw_prefix_bit_len, 64 | bytes: raw_prefix_bytes, 65 | }, 66 | }; 67 | 68 | Ok(run) 69 | } 70 | } 71 | 72 | impl<'a> Arbitrary<'a> for RawSeparator { 73 | fn arbitrary(input: &mut arbitrary::Unstructured<'a>) -> arbitrary::Result { 74 | let bit_start = input.int_in_range(0..=7)?; 75 | 76 | let bit_len = input.int_in_range(0..=(256 - bit_start))?; 77 | 78 | let bytes_len = (((bit_start + bit_len + 7) / 8) as usize).next_multiple_of(8); 79 | let mut bytes: Vec = vec![0; bytes_len]; 80 | input.fill_buffer(&mut bytes)?; 81 | 82 | Ok(Self { 83 | bit_start, 84 | bit_len, 85 | bytes, 86 | }) 87 | } 88 | } 89 | 90 | fn reference_reconstruct_key(maybe_prefix: &RawPrefix, separator: &RawSeparator) -> [u8; 32] { 91 | let mut key = [0; 32]; 92 | 93 | let mut key_start_separator = 0; 94 | let RawPrefix { bit_len, bytes } = maybe_prefix; 95 | if *bit_len != 0 { 96 | key.view_bits_mut::()[..*bit_len] 97 | .copy_from_bitslice(&bytes.view_bits::()[..*bit_len]); 98 | key_start_separator = *bit_len; 99 | } 100 | 101 | let RawSeparator { 102 | bit_start, 103 | bit_len, 104 | bytes, 105 | } = separator; 106 | 107 | key.view_bits_mut::()[key_start_separator..][..*bit_len] 108 | .copy_from_bitslice(&bytes.view_bits::()[*bit_start..][..*bit_len]); 109 | 110 | key 111 | } 112 | -------------------------------------------------------------------------------- /fuzz/fuzz_targets/separate.rs: -------------------------------------------------------------------------------- 1 | #![no_main] 2 | 3 | mod common; 4 | 5 | use bitvec::{order::Msb0, view::BitView}; 6 | use common::Run; 7 | use 
libfuzzer_sys::fuzz_target; 8 | use nomt::beatree::separate; 9 | 10 | fuzz_target!(|run: Run| { 11 | let Run { 12 | prefix_bit_len, 13 | mut a, 14 | mut b, 15 | } = run; 16 | 17 | if a > b { 18 | std::mem::swap(&mut a, &mut b); 19 | } 20 | 21 | let mut expected = [0u8; 32]; 22 | expected.view_bits_mut::()[..prefix_bit_len + 1] 23 | .copy_from_bitslice(&b.view_bits::()[..prefix_bit_len + 1]); 24 | 25 | assert_eq!(expected, separate(&a, &b)); 26 | }); 27 | -------------------------------------------------------------------------------- /fuzz/fuzz_targets/separator_len.rs: -------------------------------------------------------------------------------- 1 | #![no_main] 2 | 3 | use arbitrary::Arbitrary; 4 | use bitvec::{order::Msb0, view::BitView}; 5 | use libfuzzer_sys::fuzz_target; 6 | 7 | fuzz_target!(|run: Run| { 8 | let Run { 9 | separator_len, 10 | separator, 11 | } = run; 12 | 13 | assert_eq!(separator_len, nomt::beatree::separator_len(&separator)); 14 | }); 15 | 16 | #[derive(Debug)] 17 | struct Run { 18 | separator_len: usize, 19 | separator: [u8; 32], 20 | } 21 | 22 | impl<'a> Arbitrary<'a> for Run { 23 | fn arbitrary(input: &mut arbitrary::Unstructured<'a>) -> arbitrary::Result { 24 | let mut separator_len = input.int_in_range(0..=255)?; 25 | let mut separator = [0; 32]; 26 | input.fill_buffer(&mut separator)?; 27 | separator.view_bits_mut::()[separator_len..].fill(false); 28 | 29 | if separator == [0u8; 32] { 30 | separator_len = 1; 31 | } else { 32 | let effective_separator_len = 256 - separator.view_bits::().trailing_zeros(); 33 | if separator_len != effective_separator_len { 34 | return Err(arbitrary::Error::IncorrectFormat); 35 | } 36 | }; 37 | 38 | Ok(Self { 39 | separator_len, 40 | separator, 41 | }) 42 | } 43 | } 44 | -------------------------------------------------------------------------------- /nomt/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "nomt" 3 | description = "Nearly Optimal Merkle Trie - Schema and Database" 4 | version = "1.0.0-preview" 5 | authors.workspace = true 6 | homepage.workspace = true 7 | repository.workspace = true 8 | edition.workspace = true 9 | license.workspace = true 10 | 11 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html 12 | 13 | [dependencies] 14 | anyhow.workspace = true 15 | nomt-core = { path = "../core", default-features = false, features = ["std"] } 16 | parking_lot.workspace = true 17 | threadpool.workspace = true 18 | bitvec.workspace = true 19 | twox-hash.workspace = true 20 | fxhash.workspace = true 21 | dashmap.workspace = true 22 | crossbeam.workspace = true 23 | crossbeam-channel.workspace = true 24 | slab.workspace = true 25 | rand.workspace = true 26 | ahash.workspace = true 27 | imbl.workspace = true 28 | lru.workspace = true 29 | libc.workspace = true 30 | criterion = { workspace = true, optional = true } 31 | thread_local.workspace = true 32 | cfg-if.workspace = true 33 | borsh = { workspace = true, optional = true } 34 | serde = { workspace = true, optional = true } 35 | 36 | [target.'cfg(target_os="linux")'.dependencies] 37 | io-uring.workspace = true 38 | 39 | [target.'cfg(loom)'.dependencies] 40 | loom.workspace = true 41 | 42 | [dev-dependencies] 43 | rand_pcg.workspace = true 44 | hex-literal.workspace = true 45 | tempfile.workspace = true 46 | criterion.workspace = true 47 | lazy_static.workspace = true 48 | hex.workspace = true 49 | quickcheck.workspace = true 50 | blake3.workspace = true 51 | 52 
| [lints.rust] 53 | unexpected_cfgs = { level = "warn", check-cfg = ['cfg(loom)'] } 54 | 55 | [[bench]] 56 | name = "beatree" 57 | harness = false 58 | 59 | [features] 60 | default = ["blake3-hasher", "sha2-hasher"] 61 | benchmarks = ["dep:criterion"] 62 | fuzz = [] 63 | borsh = ["dep:borsh", "nomt-core/borsh"] 64 | blake3-hasher = ["nomt-core/blake3-hasher"] 65 | sha2-hasher = ["nomt-core/sha2-hasher"] 66 | serde = ["dep:serde", "nomt-core/serde"] 67 | -------------------------------------------------------------------------------- /nomt/benches/beatree.rs: -------------------------------------------------------------------------------- 1 | #[cfg(feature = "benchmarks")] 2 | use criterion::{criterion_group, criterion_main}; 3 | #[cfg(feature = "benchmarks")] 4 | use nomt::beatree::benches::beatree_benchmark; 5 | 6 | #[cfg(feature = "benchmarks")] 7 | criterion_group!(benches, beatree_benchmark); 8 | #[cfg(feature = "benchmarks")] 9 | criterion_main!(benches); 10 | 11 | #[cfg(not(feature = "benchmarks"))] 12 | fn main() {} 13 | -------------------------------------------------------------------------------- /nomt/src/beatree/benches.rs: -------------------------------------------------------------------------------- 1 | #![cfg(feature = "benchmarks")] 2 | 3 | use crate::beatree::{ 4 | branch::node::benches::*, leaf::node::benches::*, ops::benches::*, ops::bit_ops::benches::*, 5 | Key, 6 | }; 7 | use rand::RngCore; 8 | 9 | pub fn beatree_benchmark(c: &mut criterion::Criterion) { 10 | separate_benchmark(c); 11 | separator_len_benchmark(c); 12 | prefix_len_benchmark(c); 13 | search_branch_benchmark(c); 14 | leaf_search_benchmark(c); 15 | reconstruct_key_benchmark(c); 16 | branch_builder_benchmark(c); 17 | leaf_builder_benchmark(c); 18 | } 19 | 20 | // returns two keys a and b where b > a and b shares the first n bits with a 21 | pub fn get_key_pair(shared_bytes: usize) -> (Key, Key) { 22 | let mut rand = rand::thread_rng(); 23 | let mut a = [0; 32]; 24 | rand.fill_bytes(&mut a[0..shared_bytes]); 25 | 26 | // b > a 27 | let mut b = a.clone(); 28 | b[shared_bytes] = 1; 29 | 30 | (a, b) 31 | } 32 | 33 | // Get a vector containing `n` random keys that share the first `shared_bytes` 34 | pub fn get_keys(shared_bytes: usize, n: usize) -> Vec { 35 | let mut rand = rand::thread_rng(); 36 | let mut prefix = [0; 32]; 37 | rand.fill_bytes(&mut prefix[0..shared_bytes]); 38 | 39 | let mut keys = vec![]; 40 | for _ in 0..n { 41 | let mut key = prefix.clone(); 42 | rand.fill_bytes(&mut key[shared_bytes..]); 43 | keys.push(key); 44 | } 45 | 46 | keys 47 | } 48 | -------------------------------------------------------------------------------- /nomt/src/beatree/branch/mod.rs: -------------------------------------------------------------------------------- 1 | pub use node::{body_size, BranchNode, BranchNodeBuilder, BranchNodeView, BRANCH_NODE_BODY_SIZE}; 2 | pub mod node; 3 | 4 | pub const BRANCH_NODE_SIZE: usize = 4096; 5 | -------------------------------------------------------------------------------- /nomt/src/beatree/index.rs: -------------------------------------------------------------------------------- 1 | //! In-memory index tracking bottom level branch nodes. This is an immutable data structure, 2 | //! which is cheaply cloneable in O(1) and performs COW operations. 
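// A minimal usage sketch (illustrative only; `branch` stands for an already-built
// `Arc<BranchNode>`):
//
//     let mut index = Index::default();
//     index.insert([0u8; 32], branch.clone());
//     // `lookup` returns the entry with the greatest separator <= the queried key.
//     let (separator, _node) = index.lookup([5u8; 32]).unwrap();
//     assert_eq!(separator, [0u8; 32]);
//     // Cloning is O(1) and shares structure with the original.
//     let _snapshot = index.clone();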
3 | 4 | use std::ops::{Bound, RangeBounds}; 5 | use std::sync::Arc; 6 | 7 | use imbl::OrdMap; 8 | 9 | use super::Key; 10 | use crate::beatree::branch::BranchNode; 11 | 12 | #[derive(Default, Clone)] 13 | pub struct Index { 14 | first_key_map: OrdMap<Key, Arc<BranchNode>>, 15 | } 16 | 17 | impl Index { 18 | /// Look up the branch that would store the given key. 19 | /// 20 | /// This is either a branch whose separator is exactly equal to this key or the branch with the 21 | /// highest separator less than the key. 22 | pub fn lookup(&self, key: Key) -> Option<(Key, Arc<BranchNode>)> { 23 | self.first_key_map 24 | .get_prev(&key) 25 | .map(|(sep, b)| (sep.clone(), b.clone())) 26 | } 27 | 28 | /// Get the first separator greater than the given key. 29 | pub fn next_key(&self, key: Key) -> Option<Key> { 30 | self.first_key_map 31 | .range(RangeFromExclusive { start: key }) 32 | .next() 33 | .map(|(k, _)| *k) 34 | } 35 | 36 | /// Remove the branch with the given separator key. 37 | pub fn remove(&mut self, separator: &Key) -> Option<Arc<BranchNode>> { 38 | self.first_key_map.remove(separator) 39 | } 40 | 41 | /// Insert a branch with the given separator key. 42 | pub fn insert(&mut self, separator: Key, branch: Arc<BranchNode>) -> Option<Arc<BranchNode>> { 43 | self.first_key_map.insert(separator, branch) 44 | } 45 | 46 | #[cfg(test)] 47 | pub fn into_iter(self) -> impl Iterator<Item = (Key, Arc<BranchNode>)> { 48 | self.first_key_map.into_iter() 49 | } 50 | } 51 | 52 | struct RangeFromExclusive { 53 | start: Key, 54 | } 55 | 56 | impl RangeBounds<Key> for RangeFromExclusive { 57 | fn start_bound(&self) -> Bound<&Key> { 58 | Bound::Excluded(&self.start) 59 | } 60 | 61 | fn end_bound(&self) -> Bound<&Key> { 62 | Bound::Unbounded 63 | } 64 | 65 | fn contains<U>(&self, item: &U) -> bool 66 | where 67 | U: PartialOrd<Key> + ?Sized, 68 | { 69 | item > &self.start 70 | } 71 | } 72 | -------------------------------------------------------------------------------- /nomt/src/beatree/leaf/mod.rs: -------------------------------------------------------------------------------- 1 | // The `LeafStore` struct manages leaves. It's responsible for management (allocation and 2 | // deallocation) and querying the LNs by their LNID. 3 | // 4 | // It maintains an in-memory copy of the freelist to facilitate the page management. The allocation 5 | // is performed in LIFO order. The allocations are performed in batches to amortize the IO for the 6 | // freelist and metadata updates (growing the file in case freelist is empty). 7 | // 8 | // The leaf store doesn't perform caching. When queried, the leaf store returns a handle to a page. 9 | // As soon as the handle is dropped, the data becomes inaccessible and another disk roundtrip would 10 | // be required to access the data again. 11 | 12 | pub mod node; 13 | -------------------------------------------------------------------------------- /nomt/src/beatree/leaf_cache.rs: -------------------------------------------------------------------------------- 1 | //! The leaf cache stores recently accessed leaf nodes. 2 | 3 | use crate::{ 4 | beatree::{allocator::PageNumber, leaf::node::LeafNode}, 5 | io::PAGE_SIZE, 6 | }; 7 | use lru::LruCache; 8 | use parking_lot::{Mutex, MutexGuard}; 9 | use std::{collections::hash_map::RandomState, hash::BuildHasher, sync::Arc}; 10 | 11 | /// A cache for leaf nodes. 12 | /// 13 | /// This is cheap to clone. 14 | #[derive(Clone)] 15 | pub struct LeafCache { 16 | inner: Arc<Shared>, 17 | } 18 | 19 | impl LeafCache { 20 | /// Create a new cache with the given number of shards and the maximum size of the cache 21 | /// in MiB. `shards` must be non-zero.
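// A minimal usage sketch (illustrative only; `pn` is a `PageNumber` and `leaf` an
// `Arc<LeafNode>` obtained elsewhere):
//
//     let cache = LeafCache::new(4, 256); // 4 shards, 256 MiB budget
//     cache.insert(pn, leaf);
//     assert!(cache.get(pn).is_some());
//     cache.evict(); // trim each shard back down to its per-shard item budget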
22 | pub fn new(shards: usize, leaf_cache_size: usize) -> Self { 23 | let max_items = (leaf_cache_size * 1024 * 1024) / PAGE_SIZE; 24 | let items_per_shard = max_items / shards; 25 | LeafCache { 26 | inner: Arc::new(Shared { 27 | shards: (0..shards) 28 | .map(|_| Shard { 29 | cache: LruCache::unbounded(), 30 | max_items: items_per_shard, 31 | }) 32 | .map(Mutex::new) 33 | .collect::<Vec<_>>(), 34 | shard_assigner: RandomState::new(), 35 | }), 36 | } 37 | } 38 | 39 | /// Get a cache entry, updating the LRU state. 40 | pub fn get(&self, page_number: PageNumber) -> Option<Arc<LeafNode>> { 41 | let mut shard = self.inner.shard_for(page_number); 42 | 43 | shard.cache.get(&page_number).map(|x| x.clone()) 44 | } 45 | 46 | /// Insert a cache entry. This does not evict anything. 47 | pub fn insert(&self, page_number: PageNumber, node: Arc<LeafNode>) { 48 | let mut shard = self.inner.shard_for(page_number); 49 | 50 | shard.cache.put(page_number, node); 51 | } 52 | 53 | /// Evict all excess items from the cache. 54 | pub fn evict(&self) { 55 | for shard in &self.inner.shards { 56 | let mut shard = shard.lock(); 57 | while shard.cache.len() > shard.max_items { 58 | let _ = shard.cache.pop_lru(); 59 | } 60 | } 61 | } 62 | } 63 | 64 | struct Shared { 65 | shards: Vec<Mutex<Shard>>, 66 | shard_assigner: RandomState, 67 | } 68 | 69 | impl Shared { 70 | fn shard_for(&self, page_number: PageNumber) -> MutexGuard<'_, Shard> { 71 | self.shards[self.shard_index_for(page_number)].lock() 72 | } 73 | 74 | fn shard_index_for(&self, page_number: PageNumber) -> usize { 75 | (self.shard_assigner.hash_one(page_number.0) as usize) % self.shards.len() 76 | } 77 | } 78 | 79 | struct Shard { 80 | cache: LruCache<PageNumber, Arc<LeafNode>>, 81 | max_items: usize, 82 | } 83 | -------------------------------------------------------------------------------- /nomt/src/beatree/writeout.rs: -------------------------------------------------------------------------------- 1 | //! The writeout logic for beatree. 2 | 3 | // As part of beatree writeout, we need to write BBN and LN files, resizing them to the correct 4 | // size beforehand. After the writes are completed (fsync'd), we wait for the MANIFEST to be 5 | // updated and then perform some cleanup. 6 | 7 | use super::allocator::{PageNumber, Store}; 8 | use crate::io::{FatPage, IoHandle}; 9 | 10 | pub fn submit_freelist_write( 11 | io_handle: &IoHandle, 12 | store: &Store, 13 | free_list_pages: Vec<(PageNumber, FatPage)>, 14 | ) { 15 | for (pn, page) in free_list_pages { 16 | io_handle 17 | .send(crate::io::IoCommand { 18 | kind: crate::io::IoKind::Write(store.store_fd(), pn.0 as u64, page), 19 | user_data: 0, 20 | }) 21 | .unwrap(); 22 | } 23 | } 24 | -------------------------------------------------------------------------------- /nomt/src/bitbox/ht_file.rs: -------------------------------------------------------------------------------- 1 | /// The HT file. 2 | /// 3 | /// The file that stores the hash-table buckets and the meta map. 4 | use super::meta_map::MetaMap; 5 | use crate::io::{self, PagePool, PAGE_SIZE}; 6 | use std::{ 7 | fs::{File, OpenOptions}, 8 | path::PathBuf, 9 | }; 10 | 11 | /// The offsets of the HT file. 12 | #[derive(Clone)] 13 | pub struct HTOffsets { 14 | // the number of pages to add to a page number to find its real location in the file, 15 | // taking account of the meta page and meta byte pages. 16 | data_page_offset: u64, 17 | } 18 | 19 | impl HTOffsets { 20 | /// Returns the page number of the `ix`th item in the data section of the store.
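// For example (illustrative numbers): with `num_pages = 8192` buckets, the meta bytes
// occupy num_meta_byte_pages(8192) = 2 pages at the start of the file, so
// `data_page_offset = 2`; bucket `ix` then lives at file page `2 + ix`, while meta byte
// page `ix` stays at file page `ix`.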
21 | pub fn data_page_index(&self, ix: u64) -> u64 { 22 | self.data_page_offset + ix 23 | } 24 | 25 | /// Returns the page number of the `ix`th item in the meta bytes section of the store. 26 | pub fn meta_bytes_index(&self, ix: u64) -> u64 { 27 | ix 28 | } 29 | } 30 | 31 | fn expected_file_len(num_pages: u32) -> u64 { 32 | (num_meta_byte_pages(num_pages) + num_pages) as u64 * PAGE_SIZE as u64 33 | } 34 | 35 | fn num_meta_byte_pages(num_pages: u32) -> u32 { 36 | (num_pages + 4095) / PAGE_SIZE as u32 37 | } 38 | 39 | /// Opens the HT file, checks its length and reads the meta map. 40 | pub fn open( 41 | num_pages: u32, 42 | page_pool: &PagePool, 43 | ht_fd: &File, 44 | ) -> anyhow::Result<(HTOffsets, MetaMap)> { 45 | if ht_fd.metadata()?.len() != expected_file_len(num_pages) { 46 | anyhow::bail!("Store corrupted; unexpected file length"); 47 | } 48 | 49 | let num_meta_byte_pages = num_meta_byte_pages(num_pages); 50 | let mut meta_bytes = Vec::with_capacity(num_meta_byte_pages as usize * PAGE_SIZE); 51 | for pn in 0..num_meta_byte_pages { 52 | let extra_meta_page = io::read_page(page_pool, ht_fd, pn as u64)?; 53 | meta_bytes.extend_from_slice(&*extra_meta_page); 54 | } 55 | 56 | let data_page_offset = num_meta_byte_pages as u64; 57 | Ok(( 58 | HTOffsets { data_page_offset }, 59 | MetaMap::from_bytes(meta_bytes, num_pages as usize), 60 | )) 61 | } 62 | 63 | /// Creates the store file. Fails if store file already exists. 64 | /// 65 | /// Lays out the meta page. If `preallocate` is true, preallocates the blocks for the file. 66 | pub fn create(path: PathBuf, num_pages: u32, preallocate: bool) -> std::io::Result<()> { 67 | let ht_path = path.join("ht"); 68 | let ht_file = OpenOptions::new().write(true).create(true).open(ht_path)?; 69 | 70 | // number of pages + pages required for meta bits. 71 | let page_count = num_pages + num_meta_byte_pages(num_pages); 72 | let len = page_count as usize * PAGE_SIZE; 73 | 74 | resize_and_prealloc(&ht_file, len as u64, preallocate)?; 75 | 76 | ht_file.sync_all()?; 77 | drop(ht_file); 78 | 79 | let wal_path = path.join("wal"); 80 | let wal_file = OpenOptions::new().write(true).create(true).open(wal_path)?; 81 | wal_file.sync_all()?; 82 | drop(wal_file); 83 | Ok(()) 84 | } 85 | 86 | /// Sets the file size and attempts to preallocate the file if `preallocate` is true. 87 | /// 88 | /// Returns an error if setting the file size fails. File preallocation is done on a best-effort basis 89 | /// and may silently fall back to regular allocation. 90 | /// 91 | /// After this call, if successful, the file size is set to `len` bytes. 92 | fn resize_and_prealloc(ht_file: &File, len: u64, preallocate: bool) -> std::io::Result<()> { 93 | if !preallocate { 94 | // If not preallocating, just set the file size and return. 95 | ht_file.set_len(len)?; 96 | return Ok(()); 97 | } 98 | 99 | cfg_if::cfg_if! { 100 | if #[cfg(target_os = "linux")] { 101 | // To preallocate on Linux systems, try using fallocate with ZERO_RANGE first as it's more 102 | // efficient. fallocate sets the file size as well, so ftruncate (aka file.set_len()) is 103 | // not needed. 104 | if crate::sys::linux::fs_check(ht_file).map_or(false, |fsck| fsck.is_tmpfs()) { 105 | // Skip preallocation for tmpfs. It doesn't support fallocate and it's 106 | // memory-backed anyway. ftruncate and bail. 107 | ht_file.set_len(len)?; 108 | return Ok(()); 109 | } 110 | if let Err(_) = crate::sys::linux::falloc_zero_file(ht_file, len) { 111 | // If fallocate fails, fall back to zeroing the file with write. 
112 | resize_and_zero_file(ht_file, len)?; 113 | } 114 | } else { 115 | resize_and_zero_file(ht_file, len)?; 116 | } 117 | } 118 | 119 | Ok(()) 120 | } 121 | 122 | // Fallback method for allocating extents for the file: just incrementally write zeroes to the file. 123 | fn resize_and_zero_file(mut file: &File, len: u64) -> std::io::Result<()> { 124 | use std::io::Write; 125 | 126 | // Set the file size first. 127 | file.set_len(len)?; 128 | 129 | // Zero the file. 130 | let len = len as usize; 131 | let buf = [0u8; PAGE_SIZE * 4]; 132 | let mut remaining = len; 133 | while remaining > 0 { 134 | let len = std::cmp::min(remaining, buf.len()); 135 | file.write_all(&buf[..len])?; 136 | remaining -= len; 137 | } 138 | Ok(()) 139 | } 140 | -------------------------------------------------------------------------------- /nomt/src/bitbox/meta_map.rs: -------------------------------------------------------------------------------- 1 | //! In-memory metadata for each bucket. This is also persisted on disk. 2 | 3 | const EMPTY: u8 = 0b0000_0000; 4 | const TOMBSTONE: u8 = 0b0111_1111; 5 | const FULL_MASK: u8 = 0b1000_0000; 6 | 7 | fn full_entry(hash: u64) -> u8 { 8 | (hash >> 57) as u8 ^ FULL_MASK 9 | } 10 | 11 | pub struct MetaMap { 12 | buckets: usize, 13 | bitvec: Vec<u8>, 14 | } 15 | 16 | impl MetaMap { 17 | // Create a new meta-map from an existing vector. 18 | pub fn from_bytes(meta_bytes: Vec<u8>, buckets: usize) -> Self { 19 | assert_eq!(meta_bytes.len() % 4096, 0); 20 | MetaMap { 21 | buckets, 22 | bitvec: meta_bytes, 23 | } 24 | } 25 | 26 | pub fn full_count(&self) -> usize { 27 | self.bitvec 28 | .iter() 29 | .filter(|&&byte| byte & FULL_MASK != 0) 30 | .count() 31 | } 32 | 33 | pub fn len(&self) -> usize { 34 | self.buckets 35 | } 36 | 37 | pub fn set_full(&mut self, bucket: usize, hash: u64) { 38 | self.bitvec[bucket] = full_entry(hash); 39 | } 40 | 41 | pub fn set_tombstone(&mut self, bucket: usize) { 42 | self.bitvec[bucket] = TOMBSTONE; 43 | } 44 | 45 | // true means definitely empty. 46 | pub fn hint_empty(&self, bucket: usize) -> bool { 47 | self.bitvec[bucket] == EMPTY 48 | } 49 | 50 | // true means definitely a tombstone. 51 | pub fn hint_tombstone(&self, bucket: usize) -> bool { 52 | self.bitvec[bucket] == TOMBSTONE 53 | } 54 | 55 | // returns true if it's definitely not a match. 56 | pub fn hint_not_match(&self, bucket: usize, raw_hash: u64) -> bool { 57 | self.bitvec[bucket] != full_entry(raw_hash) 58 | } 59 | 60 | // get the page index of a bucket in the meta-map. 61 | pub fn page_index(&self, bucket: usize) -> usize { 62 | bucket / 4096 63 | } 64 | 65 | // get a page-sized slice of the metamap.
This is guaranteed to have len 4096 66 | pub fn page_slice(&self, page_index: usize) -> &[u8] { 67 | let start = page_index * 4096; 68 | let end = start + 4096; 69 | &self.bitvec[start..end] 70 | } 71 | } 72 | -------------------------------------------------------------------------------- /nomt/src/bitbox/wal/mod.rs: -------------------------------------------------------------------------------- 1 | const WAL_ENTRY_TAG_START: u8 = 1; 2 | const WAL_ENTRY_TAG_END: u8 = 2; 3 | const WAL_ENTRY_TAG_CLEAR: u8 = 3; 4 | const WAL_ENTRY_TAG_UPDATE: u8 = 4; 5 | 6 | pub use read::{WalBlobReader, WalEntry}; 7 | pub use write::WalBlobBuilder; 8 | 9 | mod read; 10 | mod write; 11 | 12 | #[cfg(test)] 13 | mod tests; 14 | -------------------------------------------------------------------------------- /nomt/src/bitbox/wal/tests.rs: -------------------------------------------------------------------------------- 1 | use super::{WalBlobBuilder, WalBlobReader, WalEntry}; 2 | use crate::{io::page_pool::PagePool, merkle::ElidedChildren, page_diff::PageDiff}; 3 | use std::{fs::OpenOptions, io::Write as _}; 4 | 5 | #[test] 6 | fn test_write_read() { 7 | let tempdir = tempfile::tempdir().unwrap(); 8 | let wal_filename = tempdir.path().join("wal"); 9 | std::fs::create_dir_all(tempdir.path()).unwrap(); 10 | let mut wal_fd = { 11 | let mut options = OpenOptions::new(); 12 | options.read(true).write(true).create(true); 13 | options.open(&wal_filename).unwrap() 14 | }; 15 | 16 | let mut builder = WalBlobBuilder::new().unwrap(); 17 | builder.reset(69); 18 | builder.write_clear(0); 19 | builder.write_update( 20 | [0; 32], 21 | &PageDiff::from_bytes(hex_literal::hex!( 22 | "00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00" 23 | )) 24 | .unwrap(), 25 | vec![].into_iter(), 26 | ElidedChildren::new(), 27 | 0, 28 | ); 29 | builder.write_clear(1); 30 | builder.write_update( 31 | [1; 32], 32 | &PageDiff::from_bytes(hex_literal::hex!( 33 | "01 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00" 34 | )) 35 | .unwrap(), 36 | vec![[1; 32]].into_iter(), 37 | ElidedChildren::from_bytes([1, 0, 0, 0, 0, 0, 0, 0]), 38 | 1, 39 | ); 40 | builder.write_update( 41 | [2; 32], 42 | &{ 43 | let mut diff = PageDiff::default(); 44 | for i in 0..126 { 45 | diff.set_changed(i); 46 | } 47 | diff 48 | }, 49 | (0..126).map(|x| [x; 32]), 50 | ElidedChildren::from_bytes([2, 0, 0, 0, 0, 0, 0, 0]), 51 | 2, 52 | ); 53 | builder.finalize(); 54 | wal_fd.write_all(builder.as_slice()).unwrap(); 55 | wal_fd.sync_data().unwrap(); 56 | 57 | let page_pool = PagePool::new(); 58 | let mut reader = WalBlobReader::new(&page_pool, &wal_fd).unwrap(); 59 | 60 | assert_eq!(reader.sync_seqn(), 69); 61 | assert_eq!( 62 | reader.read_entry().unwrap(), 63 | Some(WalEntry::Clear { bucket: 0 }) 64 | ); 65 | assert_eq!( 66 | reader.read_entry().unwrap(), 67 | Some(WalEntry::Update { 68 | page_id: [0; 32], 69 | page_diff: PageDiff::default(), 70 | changed_nodes: vec![], 71 | elided_children: ElidedChildren::new(), 72 | bucket: 0, 73 | }) 74 | ); 75 | assert_eq!( 76 | reader.read_entry().unwrap(), 77 | Some(WalEntry::Clear { bucket: 1 }) 78 | ); 79 | assert_eq!( 80 | reader.read_entry().unwrap(), 81 | Some(WalEntry::Update { 82 | page_id: [1; 32], 83 | page_diff: { 84 | let mut diff = PageDiff::default(); 85 | diff.set_changed(0); 86 | diff 87 | }, 88 | changed_nodes: vec![[1; 32]], 89 | elided_children: ElidedChildren::from_bytes([1, 0, 0, 0, 0, 0, 0, 0]), 90 | bucket: 1, 91 | }) 92 | ); 93 | assert_eq!( 94 | reader.read_entry().unwrap(), 95 | Some(WalEntry::Update { 96 | page_id: 
[2; 32], 97 | page_diff: { 98 | let mut diff = PageDiff::default(); 99 | for i in 0..126 { 100 | diff.set_changed(i); 101 | } 102 | diff 103 | }, 104 | changed_nodes: (0..126).map(|x| [x; 32]).collect(), 105 | elided_children: ElidedChildren::from_bytes([2, 0, 0, 0, 0, 0, 0, 0]), 106 | bucket: 2, 107 | }) 108 | ); 109 | assert_eq!(reader.read_entry().unwrap(), None); 110 | } 111 | -------------------------------------------------------------------------------- /nomt/src/bitbox/writeout.rs: -------------------------------------------------------------------------------- 1 | //! The writeout logic for bitbox. 2 | 3 | // The logic for writeout is split into three parts: 4 | // - first we write out the wal blob to the WAL file and wait for the MANIFEST to be synced. 5 | // - then we write out the metabits and bucket pages to the HT file. 6 | // - finally, we truncate the WAL file. 7 | 8 | use std::{ 9 | fs::File, 10 | io::{Seek as _, SeekFrom, Write}, 11 | os::fd::AsRawFd as _, 12 | sync::Arc, 13 | }; 14 | 15 | use crate::io::{FatPage, IoCommand, IoHandle, IoKind}; 16 | 17 | pub(super) fn write_wal(mut wal_fd: &File, wal_blob: &[u8]) -> std::io::Result<()> { 18 | wal_fd.set_len(0)?; 19 | wal_fd.seek(SeekFrom::Start(0))?; 20 | wal_fd.write_all(wal_blob)?; 21 | wal_fd.sync_all()?; 22 | Ok(()) 23 | } 24 | 25 | /// Truncates the WAL file to zero length. 26 | /// 27 | /// Conditionally syncs the file to disk. 28 | pub(super) fn truncate_wal(mut wal_fd: &File, do_sync: bool) -> std::io::Result<()> { 29 | wal_fd.set_len(0)?; 30 | wal_fd.seek(SeekFrom::Start(0))?; 31 | if do_sync { 32 | wal_fd.sync_all()?; 33 | } 34 | Ok(()) 35 | } 36 | 37 | pub(super) fn write_ht( 38 | io_handle: IoHandle, 39 | ht_fd: &File, 40 | mut ht: Vec<(u64, Arc<FatPage>)>, 41 | ) -> std::io::Result<()> { 42 | let mut sent = 0; 43 | 44 | ht.sort_unstable_by_key(|item| item.0); 45 | for (pn, page) in ht { 46 | io_handle 47 | .send(IoCommand { 48 | kind: IoKind::WriteArc(ht_fd.as_raw_fd(), pn, page), 49 | user_data: 0, 50 | }) 51 | .unwrap(); 52 | sent += 1; 53 | } 54 | 55 | while sent > 0 { 56 | io_handle.recv().unwrap(); 57 | sent -= 1; 58 | } 59 | 60 | ht_fd.sync_all()?; 61 | 62 | Ok(()) 63 | } 64 | -------------------------------------------------------------------------------- /nomt/src/io/fsyncer.rs: -------------------------------------------------------------------------------- 1 | use parking_lot::{Condvar, Mutex}; 2 | use std::{fs::File, sync::Arc}; 3 | 4 | #[derive(Debug)] 5 | enum State { 6 | Idle, 7 | Started, 8 | Done(Result<(), std::io::Error>), 9 | HandleDead, 10 | } 11 | 12 | impl State { 13 | fn force_take_done(&mut self) -> Result<(), std::io::Error> { 14 | let s = std::mem::replace(self, State::Idle); 15 | if let State::Done(res) = s { 16 | res 17 | } else { 18 | panic!("force_take_done called on non-done state"); 19 | } 20 | } 21 | } 22 | 23 | struct Shared { 24 | cv: Condvar, 25 | s: Mutex<State>, 26 | } 27 | 28 | /// Fsyncer is a helper that allows one to fsync a file in a non-blocking manner. 29 | /// 30 | /// It spawns a thread that will fsync the file in the background. 31 | /// 32 | /// The expected usage is from two threads: the one that calls [`Self::fsync`] and the one that calls 33 | /// [`Self::wait`]. 34 | pub struct Fsyncer { 35 | shared: Arc<Shared>, 36 | } 37 | 38 | impl Fsyncer { 39 | /// Creates a new fsyncer with the given file descriptor and identifier.
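// A minimal usage sketch (illustrative only; the file name is made up):
//
//     let fd = Arc::new(File::create("some_file")?);
//     let fsyncer = Fsyncer::new("some_file", fd);
//     fsyncer.fsync();  // non-blocking: wakes the background thread
//     /* ... do other work ... */
//     fsyncer.wait()?;  // blocks until the fsync completes and yields its result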
40 | pub fn new(name: &'static str, fd: Arc) -> Self { 41 | let name = format!("nomt-fsyncer-{}", name); 42 | let shared = Arc::new(Shared { 43 | cv: Condvar::new(), 44 | s: Mutex::new(State::Idle), 45 | }); 46 | let _thread = std::thread::Builder::new() 47 | .name(name) 48 | .spawn({ 49 | let shared = shared.clone(); 50 | move || { 51 | worker(fd, shared); 52 | } 53 | }) 54 | .expect("failed to spawn fsyncer thread"); 55 | Fsyncer { shared } 56 | } 57 | 58 | /// Issues a fsync request. 59 | /// 60 | /// # Panics 61 | /// 62 | /// Panics if there is an outstanding fsync operation that hasn't been consumed by 63 | /// [`Self::wait()`] yet. 64 | /// 65 | /// Make sure to call [`Self::wait()`] to consume any previous fsync result before issuing a new 66 | /// request. 67 | pub fn fsync(&self) { 68 | let mut s_guard = self.shared.s.lock(); 69 | assert!(matches!(&*s_guard, State::Idle)); 70 | *s_guard = State::Started; 71 | self.shared.cv.notify_all(); 72 | } 73 | 74 | /// Waits for the fsync to complete and consumes the result. 75 | /// 76 | /// This blocks until a synchronization initiated by [`Self::fsync`] completes. If no fsync has been 77 | /// initiated yet, this will block until one is both started and completed. After consuming the result, 78 | /// subsequent calls will block until the next `fsync()` operation finishes. 79 | pub fn wait(&self) -> Result<(), std::io::Error> { 80 | let mut s_guard = self.shared.s.lock(); 81 | self.shared 82 | .cv 83 | .wait_while(&mut s_guard, |s| !matches!(s, State::Done(_))); 84 | s_guard.force_take_done() 85 | } 86 | } 87 | 88 | impl Drop for Fsyncer { 89 | fn drop(&mut self) { 90 | let mut s_guard = self.shared.s.lock(); 91 | *s_guard = State::HandleDead; 92 | self.shared.cv.notify_all(); 93 | } 94 | } 95 | 96 | fn worker(fd: Arc, shared: Arc) { 97 | let bomb = Bomb; 98 | 'outer: loop { 99 | let mut s_guard = shared.s.lock(); 100 | shared.cv.wait_while(&mut s_guard, |state| { 101 | !matches!(state, State::Started | State::HandleDead) 102 | }); 103 | if matches!(&*s_guard, State::HandleDead) { 104 | break 'outer; 105 | } 106 | assert!(matches!(&*s_guard, State::Started | State::Done(_))); 107 | drop(s_guard); 108 | 109 | let sync_result = fd.sync_all(); 110 | 111 | let mut s_guard = shared.s.lock(); 112 | if matches!(&*s_guard, State::HandleDead) { 113 | break 'outer; 114 | } 115 | *s_guard = State::Done(sync_result); 116 | shared.cv.notify_all(); 117 | } 118 | bomb.defuse(); 119 | 120 | struct Bomb; 121 | impl Bomb { 122 | fn defuse(self) { 123 | std::mem::forget(self); 124 | } 125 | } 126 | impl Drop for Bomb { 127 | fn drop(&mut self) { 128 | panic!("worker panicked"); 129 | } 130 | } 131 | } 132 | -------------------------------------------------------------------------------- /nomt/src/io/unix.rs: -------------------------------------------------------------------------------- 1 | use super::{CompleteIo, IoCommand, IoKind, IoKindResult, IoPacket, PagePool, PAGE_SIZE}; 2 | use crossbeam_channel::{Receiver, Sender}; 3 | use threadpool::ThreadPool; 4 | 5 | pub fn start_io_worker( 6 | page_pool: PagePool, 7 | io_workers_tp: &ThreadPool, 8 | io_workers: usize, 9 | ) -> Sender { 10 | let (command_tx, command_rx) = crossbeam_channel::unbounded(); 11 | 12 | for _ in 0..io_workers { 13 | spawn_worker_thread(page_pool.clone(), io_workers_tp, command_rx.clone()); 14 | } 15 | 16 | command_tx 17 | } 18 | 19 | fn spawn_worker_thread( 20 | page_pool: PagePool, 21 | io_workers_tp: &ThreadPool, 22 | command_rx: Receiver, 23 | ) { 24 | let work = move || loop { 25 
| let Ok(packet) = command_rx.recv() else { 26 | // Why the `drop` here? 27 | // 28 | // `command_rx` receives the IoPacket's which are ultimately parameterized by buffers. 29 | // Those buffers are allocated in the `page_pool`. If the `page_pool` is deallocated 30 | // before this worker thread is done, that's a use-after-free. 31 | // 32 | // So in other words, we plumb `page_pool` all the way here and drop it here only to 33 | // ensure safety. 34 | drop(page_pool); 35 | return; 36 | }; 37 | let complete = execute(packet.command); 38 | let _ = packet.completion_sender.send(complete); 39 | }; 40 | 41 | io_workers_tp.execute(work); 42 | } 43 | 44 | fn execute(mut command: IoCommand) -> CompleteIo { 45 | let result = loop { 46 | let res = match command.kind { 47 | IoKind::Read(fd, page_index, ref mut page) => unsafe { 48 | libc::pread( 49 | fd, 50 | page.as_mut_ptr() as *mut libc::c_void, 51 | PAGE_SIZE as libc::size_t, 52 | (page_index * PAGE_SIZE as u64) as libc::off_t, 53 | ) 54 | }, 55 | IoKind::Write(fd, page_index, ref page) => unsafe { 56 | libc::pwrite( 57 | fd, 58 | page.as_ptr() as *const libc::c_void, 59 | PAGE_SIZE as libc::size_t, 60 | (page_index * PAGE_SIZE as u64) as libc::off_t, 61 | ) 62 | }, 63 | IoKind::WriteArc(fd, page_index, ref page) => unsafe { 64 | let page: &[u8] = &*page; 65 | libc::pwrite( 66 | fd, 67 | page.as_ptr() as *const libc::c_void, 68 | PAGE_SIZE as libc::size_t, 69 | (page_index * PAGE_SIZE as u64) as libc::off_t, 70 | ) 71 | }, 72 | IoKind::WriteRaw(fd, page_index, ref mut page) => unsafe { 73 | libc::pwrite( 74 | fd, 75 | page.as_ptr() as *const libc::c_void, 76 | PAGE_SIZE as libc::size_t, 77 | (page_index * PAGE_SIZE as u64) as libc::off_t, 78 | ) 79 | }, 80 | }; 81 | match command.kind.get_result(res) { 82 | IoKindResult::Ok => break Ok(()), 83 | IoKindResult::Err => break Err(std::io::Error::last_os_error()), 84 | IoKindResult::Retry => (), 85 | } 86 | }; 87 | 88 | CompleteIo { command, result } 89 | } 90 | -------------------------------------------------------------------------------- /nomt/src/merkle/cache_prepopulate.rs: -------------------------------------------------------------------------------- 1 | //! Utility for prepopulating the first N layers of the cache. 2 | 3 | use std::io; 4 | 5 | use crate::{ 6 | io::IoHandle, 7 | page_cache::{PageCache, PageMut}, 8 | store::{PageLoad, PageLoader, Store}, 9 | }; 10 | 11 | use nomt_core::page_id::{ChildPageIndex, PageId, MAX_PAGE_DEPTH, NUM_CHILDREN, ROOT_PAGE_ID}; 12 | 13 | /// Prepopulate the given number of levels of the page tree into the page cache. 14 | /// 15 | /// This function blocks until the prepopulation has finished. 16 | pub fn prepopulate( 17 | io_handle: IoHandle, 18 | page_cache: &PageCache, 19 | store: &Store, 20 | levels: usize, 21 | ) -> io::Result<()> { 22 | let page_loader = store.page_loader(); 23 | let mut loads = Vec::new(); 24 | 25 | let levels = std::cmp::min(levels, MAX_PAGE_DEPTH); 26 | 27 | // dispatch all page loads recursively. 28 | dispatch_recursive(ROOT_PAGE_ID, &page_loader, &io_handle, &mut loads, levels)?; 29 | 30 | let mut completed = 0; 31 | 32 | // wait on I/O results. 33 | while completed < loads.len() { 34 | // UNWRAP: we don't expect the I/O pool to go down. fatal error. 35 | let complete_io = io_handle.recv().expect("I/O Pool Down"); 36 | complete_io.result?; 37 | let load_index = complete_io.command.user_data as usize; 38 | let load = &mut loads[load_index]; 39 | 40 | // UNWRAP: all submitted requests are of kind Read(FatPage). 
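        // A completed read resolves in one of three ways: the probed bucket held the
        // page (insert it into the cache), the probe missed (re-probe the next
        // candidate bucket), or no candidate buckets remain (the page is known empty).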
41 | if let Some((page, bucket)) = load.try_complete(complete_io.command.kind.unwrap_buf()) { 42 | completed += 1; 43 | page_cache.insert( 44 | load.page_id().clone(), 45 | PageMut::pristine_with_data(page).freeze(), 46 | bucket, 47 | ); 48 | } else { 49 | // misprobe. try again. 50 | if !page_loader.probe(load, &io_handle, complete_io.command.user_data) { 51 | // guaranteed empty. 52 | completed += 1; 53 | } 54 | } 55 | } 56 | 57 | Ok(()) 58 | } 59 | 60 | // dispatch page loads for all the children of the given page. 61 | fn dispatch_recursive( 62 | page_id: PageId, 63 | page_loader: &PageLoader, 64 | io_handle: &IoHandle, 65 | loads: &mut Vec, 66 | levels_remaining: usize, 67 | ) -> io::Result<()> { 68 | if levels_remaining == 0 { 69 | return Ok(()); 70 | } 71 | 72 | for child_index in 0..NUM_CHILDREN { 73 | // UNWRAP: all indices up to NUM_CHILDREN are allowed. 74 | let child_index = ChildPageIndex::new(child_index as u8).unwrap(); 75 | 76 | // UNWRAP: depth is not out of bounds and child index is valid. 77 | let child_page_id = page_id.child_page_id(child_index).unwrap(); 78 | 79 | let mut page_load = page_loader.start_load(child_page_id.clone()); 80 | 81 | let next_index = loads.len() as u64; 82 | if page_loader.probe(&mut page_load, io_handle, next_index) { 83 | // probe has been dispatched. 84 | loads.push(page_load); 85 | dispatch_recursive( 86 | child_page_id, 87 | page_loader, 88 | io_handle, 89 | loads, 90 | levels_remaining - 1, 91 | )?; 92 | } 93 | } 94 | 95 | Ok(()) 96 | } 97 | -------------------------------------------------------------------------------- /nomt/src/merkle/page_set.rs: -------------------------------------------------------------------------------- 1 | //! A set of pages that the page walker draws upon and which is filled by `Seek`ing. 2 | 3 | use nomt_core::page_id::PageId; 4 | use std::{collections::HashMap, sync::Arc}; 5 | 6 | use super::BucketInfo; 7 | use crate::{ 8 | io::PagePool, 9 | page_cache::{Page, PageMut}, 10 | page_diff::PageDiff, 11 | }; 12 | 13 | /// A page in the [`PageSet`] can have two different origins. 14 | #[derive(Clone)] 15 | pub enum PageOrigin { 16 | /// It could have been fetched from the hash table, thereby having an associated `BucketInfo`. 17 | Persisted(BucketInfo), 18 | /// It could have been reconstructed on the fly without being stored anywhere. 19 | /// It keeps track of the total number of leaves in child pages and which nodes 20 | /// in the page have been reconstructed. 21 | Reconstructed(u64, PageDiff), 22 | } 23 | 24 | impl PageOrigin { 25 | /// Extract `BucketInfo` from [`PageOrigin::Persisted`] variant. 26 | pub fn bucket_info(self) -> Option { 27 | match self { 28 | PageOrigin::Persisted(bucket_info) => Some(bucket_info), 29 | PageOrigin::Reconstructed(_, _) => None, 30 | } 31 | } 32 | 33 | /// Extract the number of leaves from [`PageOrigin::Reconstructed`] variant. 34 | pub fn leaves_counter(&self) -> Option { 35 | match self { 36 | PageOrigin::Reconstructed(counter, _) => Some(*counter), 37 | PageOrigin::Persisted(_) => None, 38 | } 39 | } 40 | 41 | /// Extract the [`PageDiff`] from [`PageOrigin::Reconstructed`] variant. 
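    ///
    /// Returns `None` for pages that were loaded from the hash table
    /// ([`PageOrigin::Persisted`]), since only reconstructed pages track which
    /// nodes were rebuilt.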
42 | pub fn page_diff(&self) -> Option<&PageDiff> { 43 | match self { 44 | PageOrigin::Reconstructed(_, page_diff) => Some(page_diff), 45 | PageOrigin::Persisted(_) => None, 46 | } 47 | } 48 | } 49 | 50 | pub struct PageSet { 51 | map: HashMap, 52 | warm_up_map: Option>>, 53 | page_pool: PagePool, 54 | } 55 | 56 | impl PageSet { 57 | pub fn new(page_pool: PagePool, warmed_up: Option) -> Self { 58 | PageSet { 59 | map: HashMap::new(), 60 | page_pool, 61 | warm_up_map: warmed_up.map(|x| x.0), 62 | } 63 | } 64 | 65 | /// Freeze this page-set and make a shareable version of it. This returns a frozen page set 66 | /// containing all insertions into this map. 67 | pub fn freeze(self) -> FrozenSharedPageSet { 68 | FrozenSharedPageSet(Arc::new(self.map)) 69 | } 70 | 71 | fn get_warmed_up(&self, page_id: &PageId) -> Option<(Page, PageOrigin)> { 72 | self.warm_up_map 73 | .as_ref() 74 | .and_then(|m| m.get(page_id)) 75 | .map(|(p, b)| (p.clone(), b.clone())) 76 | } 77 | } 78 | 79 | impl super::page_walker::PageSet for PageSet { 80 | fn fresh(&self, page_id: &PageId) -> PageMut { 81 | let page = PageMut::pristine_empty(&self.page_pool, &page_id); 82 | page 83 | } 84 | 85 | fn contains(&self, page_id: &PageId) -> bool { 86 | self.map.contains_key(&page_id) 87 | } 88 | 89 | fn get(&self, page_id: &PageId) -> Option<(Page, PageOrigin)> { 90 | self.map 91 | .get(&page_id) 92 | .map(|(p, bucket_info)| (p.clone(), bucket_info.clone())) 93 | .or_else(|| self.get_warmed_up(page_id)) 94 | } 95 | 96 | fn insert(&mut self, page_id: PageId, page: Page, page_origin: PageOrigin) { 97 | self.map.insert(page_id, (page, page_origin)); 98 | } 99 | } 100 | 101 | /// A frozen, shared page set. This is cheap to clone. 102 | #[derive(Clone)] 103 | pub struct FrozenSharedPageSet(Arc>); 104 | -------------------------------------------------------------------------------- /nomt/src/metrics.rs: -------------------------------------------------------------------------------- 1 | use std::sync::{ 2 | atomic::{AtomicU64, Ordering}, 3 | Arc, 4 | }; 5 | 6 | /// Metrics collector, if active, it provides Counters and Timers 7 | #[derive(Clone)] 8 | pub struct Metrics { 9 | metrics: Option>, 10 | } 11 | 12 | /// Metrics that can be collected during execution 13 | #[derive(PartialEq, Eq, Hash)] 14 | pub enum Metric { 15 | /// Counter of total page requests 16 | PageRequests, 17 | /// Counter of page requests cache misses over all page requests 18 | PageCacheMisses, 19 | /// Timer used to record average page fetch time 20 | PageFetchTime, 21 | /// Timer used to record average value fetch time during reads 22 | ValueFetchTime, 23 | } 24 | 25 | struct ActiveMetrics { 26 | page_requests: AtomicU64, 27 | page_cache_misses: AtomicU64, 28 | page_fetch_time: Timer, 29 | value_fetch_time: Timer, 30 | } 31 | 32 | impl Metrics { 33 | /// Returns the Metrics object, active or not based on the specified input 34 | pub fn new(active: bool) -> Self { 35 | Self { 36 | metrics: if active { 37 | Some(Arc::new(ActiveMetrics { 38 | page_requests: AtomicU64::new(0), 39 | page_cache_misses: AtomicU64::new(0), 40 | page_fetch_time: Timer::new(), 41 | value_fetch_time: Timer::new(), 42 | })) 43 | } else { 44 | None 45 | }, 46 | } 47 | } 48 | 49 | /// Increase the Counter specified by the input 50 | /// 51 | /// panics if the specified [`Metric`] is not a Counter 52 | pub fn count(&self, metric: Metric) { 53 | if let Some(ref metrics) = self.metrics { 54 | let counter = match metric { 55 | Metric::PageRequests => &metrics.page_requests, 56 | 
Metric::PageCacheMisses => &metrics.page_cache_misses, 57 | _ => panic!("Specified metric is not a Counter"), 58 | }; 59 | 60 | counter.fetch_add(1, Ordering::Relaxed); 61 | } 62 | } 63 | 64 | /// Returns a guard that, when dropped, will record the time passed since creation 65 | /// 66 | /// panics if the specified [`Metric`] is not a Timer 67 | pub fn record<'a>(&'a self, metric: Metric) -> Option { 68 | self.metrics.as_ref().and_then(|metrics| { 69 | let timer = match metric { 70 | Metric::PageFetchTime => &metrics.page_fetch_time, 71 | Metric::ValueFetchTime => &metrics.value_fetch_time, 72 | _ => panic!("Specified metric is not a Timer"), 73 | }; 74 | 75 | Some(timer.record()) 76 | }) 77 | } 78 | 79 | /// Print collected metrics to stdout 80 | pub fn print(&self) { 81 | if let Some(ref metrics) = self.metrics { 82 | println!("metrics"); 83 | 84 | let tot_page_requests = metrics.page_requests.load(Ordering::Relaxed); 85 | println!(" page requests {}", tot_page_requests); 86 | 87 | if tot_page_requests != 0 { 88 | let cache_misses = metrics.page_cache_misses.load(Ordering::Relaxed); 89 | let percentage_cache_misses = 90 | (cache_misses as f64 / tot_page_requests as f64) * 100.0; 91 | 92 | println!( 93 | " page cache misses {} - {:.2}% of page requests", 94 | cache_misses, percentage_cache_misses 95 | ); 96 | } 97 | 98 | if let Some(mean) = metrics.page_fetch_time.mean() { 99 | println!(" page fetch mean {}", pretty_display_ns(mean)); 100 | } 101 | 102 | if let Some(mean) = metrics.value_fetch_time.mean() { 103 | println!(" value fetch mean {}", pretty_display_ns(mean)); 104 | } 105 | } else { 106 | println!("Metrics collection was not activated") 107 | } 108 | } 109 | } 110 | 111 | fn pretty_display_ns(ns: u64) -> String { 112 | // preserve 3 sig figs at minimum. 113 | let (val, unit) = if ns > 100 * 1_000_000_000 { 114 | (ns / 1_000_000_000, "s") 115 | } else if ns > 100 * 1_000_000 { 116 | (ns / 1_000_000, "ms") 117 | } else if ns > 100 * 1_000 { 118 | (ns / 1_000, "us") 119 | } else { 120 | (ns, "ns") 121 | }; 122 | 123 | format!("{val} {unit}") 124 | } 125 | 126 | struct Timer { 127 | number_of_records: AtomicU64, 128 | sum: AtomicU64, 129 | } 130 | 131 | impl Timer { 132 | fn new() -> Self { 133 | Timer { 134 | number_of_records: AtomicU64::new(0), 135 | sum: AtomicU64::new(0), 136 | } 137 | } 138 | 139 | fn mean(&self) -> Option { 140 | let n = self.number_of_records.load(Ordering::Relaxed); 141 | let sum = self.sum.load(Ordering::Relaxed); 142 | sum.checked_div(n) 143 | } 144 | 145 | fn record<'a>(&'a self) -> impl Drop + 'a { 146 | struct TimerGuard<'a> { 147 | start: std::time::Instant, 148 | n: &'a AtomicU64, 149 | sum: &'a AtomicU64, 150 | } 151 | 152 | impl Drop for TimerGuard<'_> { 153 | fn drop(&mut self) { 154 | let elapsed = self.start.elapsed().as_nanos() as u64; 155 | self.n.fetch_add(1, Ordering::Relaxed); 156 | self.sum.fetch_add(elapsed, Ordering::Relaxed); 157 | } 158 | } 159 | 160 | TimerGuard { 161 | start: std::time::Instant::now(), 162 | n: &self.number_of_records, 163 | sum: &self.sum, 164 | } 165 | } 166 | } 167 | -------------------------------------------------------------------------------- /nomt/src/rollback/delta.rs: -------------------------------------------------------------------------------- 1 | use nomt_core::trie::KeyPath; 2 | use std::{ 3 | collections::HashMap, 4 | io::{Cursor, Read as _}, 5 | }; 6 | 7 | /// A delta that should be applied to reverse a commit. 
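///
/// The byte layout produced by [`Delta::encode`] is, in sketch form:
///
/// ```text
/// u32 (LE)        number of keys to erase
/// [u8; 32] * N    key paths that did not exist before the commit
/// u32 (LE)        number of keys to reinstate
/// then, for each reinstated key:
///   [u8; 32]      key path
///   u32 (LE)      prior value length
///   bytes         prior value
/// ```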
8 | #[derive(Debug, Clone)] 9 | pub struct Delta { 10 | /// This map contains the prior value for each key that was written by the commit this delta 11 | /// reverses. `None` indicates that the key did not exist before the commit. 12 | pub(crate) priors: HashMap>>, 13 | } 14 | 15 | impl Delta { 16 | #[cfg(test)] 17 | fn empty() -> Self { 18 | Self { 19 | priors: HashMap::new(), 20 | } 21 | } 22 | 23 | /// Encode the delta into a buffer. 24 | /// 25 | /// Returns the number of bytes written. 26 | pub(super) fn encode(&self) -> Vec { 27 | // The serialization format has the following layout. 28 | // 29 | // The keys are split into two groups and written as separate arrays. Those groups are: 30 | // 31 | // 1. erase: The keys that did not exist before the commit. 32 | // 2. reinstateThe keys that had prior values. 33 | // 34 | // The keys that did not exist are written first. The keys that had prior values are 35 | // written second. 36 | // 37 | // For each kind of key, we first write out the length of the array encoded as a u32. 38 | // This is followed by the keys themselves, written contiguously in little-endian order. 39 | // 40 | // The keys are written as 32-byte big-endian values. 41 | 42 | // Sort the keys into two groups. 43 | let mut to_erase = Vec::with_capacity(self.priors.len()); 44 | let mut to_reinstate = Vec::with_capacity(self.priors.len()); 45 | for (key, value) in self.priors.iter() { 46 | match value { 47 | None => to_erase.push(key), 48 | Some(value) => to_reinstate.push((key, value)), 49 | } 50 | } 51 | 52 | let to_erase_len = to_erase.len() as u32; 53 | let mut buf = Vec::with_capacity(4 + 32 * to_erase.len()); 54 | buf.extend_from_slice(&to_erase_len.to_le_bytes()); 55 | for key in to_erase { 56 | buf.extend_from_slice(&key[..]); 57 | } 58 | 59 | let to_reinstate_len = to_reinstate.len() as u32; 60 | buf.extend_from_slice(&to_reinstate_len.to_le_bytes()); 61 | for (key, value) in to_reinstate { 62 | buf.extend_from_slice(&key[..]); 63 | let value_len = value.len() as u32; 64 | buf.extend_from_slice(&value_len.to_le_bytes()); 65 | buf.extend_from_slice(value); 66 | } 67 | 68 | buf 69 | } 70 | 71 | /// Decodes the delta from a buffer. 72 | pub(super) fn decode(reader: &mut Cursor>) -> anyhow::Result { 73 | let mut priors = HashMap::new(); 74 | 75 | // Read the number of keys to erase. 76 | let mut buf = [0; 4]; 77 | reader.read_exact(&mut buf)?; 78 | let to_erase_len = u32::from_le_bytes(buf); 79 | // Read the keys to erase. 80 | for _ in 0..to_erase_len { 81 | let mut key_path = [0; 32]; 82 | reader.read_exact(&mut key_path)?; 83 | let preemted = priors.insert(key_path, None).is_some(); 84 | if preemted { 85 | anyhow::bail!("duplicate key path (erase): {:?}", key_path); 86 | } 87 | } 88 | 89 | // Read the number of keys to reinstate. 90 | reader.read_exact(&mut buf)?; 91 | let to_reinsate_len = u32::from_le_bytes(buf); 92 | // Read the keys to reinstate along with their values. 93 | for _ in 0..to_reinsate_len { 94 | // Read the key path. 95 | let mut key_path = [0; 32]; 96 | reader.read_exact(&mut key_path)?; 97 | // Read the value. 
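            // (length-prefixed: a u32 little-endian byte count followed by the bytes)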
98 | let mut value = Vec::new(); 99 | reader.read_exact(&mut buf)?; 100 | let value_len = u32::from_le_bytes(buf); 101 | value.resize(value_len as usize, 0); 102 | reader.read_exact(&mut value)?; 103 | let preempted = priors.insert(key_path, Some(value)).is_some(); 104 | if preempted { 105 | anyhow::bail!("duplicate key path (reinstate): {:?}", key_path); 106 | } 107 | } 108 | Ok(Delta { priors }) 109 | } 110 | } 111 | 112 | #[cfg(test)] 113 | mod tests { 114 | use super::*; 115 | 116 | #[test] 117 | fn delta_roundtrip() { 118 | let mut delta = Delta::empty(); 119 | delta.priors.insert([1; 32], Some(b"value1".to_vec())); 120 | delta.priors.insert([2; 32], None); 121 | delta.priors.insert([3; 32], Some(b"value3".to_vec())); 122 | 123 | let mut buf = delta.encode(); 124 | let mut cursor = Cursor::new(&mut buf); 125 | let delta2 = Delta::decode(&mut cursor).unwrap(); 126 | assert_eq!(delta.priors, delta2.priors); 127 | } 128 | 129 | #[test] 130 | fn delta_roundtrip_empty() { 131 | let delta = Delta::empty(); 132 | let mut buf = delta.encode(); 133 | let mut cursor = Cursor::new(&mut buf); 134 | let delta2 = Delta::decode(&mut cursor).unwrap(); 135 | assert_eq!(delta.priors, delta2.priors); 136 | } 137 | } 138 | -------------------------------------------------------------------------------- /nomt/src/seglog/segment_filename.rs: -------------------------------------------------------------------------------- 1 | use anyhow::{Context, Result}; 2 | 3 | pub fn format(prefix: &str, segment_id: u32) -> String { 4 | // The format string specifies a 10-digit number, so we pad with leading zeros from 5 | // the left. This assumes that segment_id is a 32-bit integer, which is confirmed by 6 | // the assert below. If you came here because it failed due to changing it to u64, 7 | // you will need to update the format string as well. 8 | assert_eq!(segment_id.to_le_bytes().len(), 4); 9 | format!("{prefix}.{segment_id:0>10}.log") 10 | } 11 | 12 | pub fn parse(prefix: &str, filename: &str) -> Result { 13 | // The filename of a segment file consists of a configurable prefix, a 10-digit segment ID, 14 | // and a ".log" suffix. 15 | // 16 | // Example: "prefix.0000000001.log". 
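    //
    // The steps below mirror that shape: strip the prefix, strip the ".log"
    // suffix, strip the separating dot, require exactly 10 decimal digits, and
    // parse them as a u32. Illustratively, parse("wal", "wal.0000000042.log")
    // yields 42.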
17 | // Extract the segment ID from the filename 18 | assert!(!prefix.is_empty()); 19 | let without_prefix = match filename.strip_prefix(prefix) { 20 | Some(s) => s, 21 | None => { 22 | return Err(anyhow::anyhow!( 23 | "Invalid segment filename format: missing prefix" 24 | )) 25 | } 26 | }; 27 | 28 | let without_suffix = match without_prefix.strip_suffix(".log") { 29 | Some(s) => s, 30 | None => { 31 | return Err(anyhow::anyhow!( 32 | "Invalid segment filename format: missing .log suffix" 33 | )) 34 | } 35 | }; 36 | 37 | let segment_id_str = match without_suffix.strip_prefix('.') { 38 | Some(s) => s, 39 | None => { 40 | return Err(anyhow::anyhow!( 41 | "Invalid segment filename format: missing dot separator" 42 | )) 43 | } 44 | }; 45 | 46 | // Check that the segment ID string has exactly 10 digits 47 | if segment_id_str.len() != 10 { 48 | return Err(anyhow::anyhow!( 49 | "Invalid segment filename format: segment ID must be exactly 10 digits" 50 | )); 51 | } 52 | 53 | // Parse the segment ID as a u32 54 | let segment_id = segment_id_str 55 | .parse::() 56 | .context("Failed to parse segment ID")?; 57 | 58 | Ok(segment_id) 59 | } 60 | 61 | #[cfg(test)] 62 | mod tests { 63 | use super::{format, parse}; 64 | 65 | #[test] 66 | fn test_filename_isomorphism() { 67 | let test_cases = vec![ 68 | ("prefix", 0), 69 | ("prefix", 1), 70 | ("prefix", 9999), 71 | ("prefix", u32::MAX), 72 | ("log", 42), 73 | ("segment", 1000000), 74 | ("very_long_prefix_name", 12345), 75 | ("a", 987654321), 76 | ]; 77 | 78 | for (prefix, id) in test_cases { 79 | let filename = format(prefix, id); 80 | let parsed_id = parse(prefix, &filename).unwrap(); 81 | assert_eq!( 82 | id, parsed_id, 83 | "Mismatch for prefix '{}' and id {}", 84 | prefix, id 85 | ); 86 | } 87 | } 88 | 89 | #[test] 90 | fn test_parse_segment_filename_edge_cases() { 91 | // Valid cases 92 | assert_eq!(parse("prefix", "prefix.0000000000.log").unwrap(), 0); 93 | assert_eq!(parse("prefix", "prefix.0000000001.log").unwrap(), 1); 94 | assert_eq!(parse("prefix", "prefix.4294967295.log").unwrap(), u32::MAX); 95 | assert_eq!(parse("a", "a.0000000042.log").unwrap(), 42); 96 | 97 | // Invalid cases 98 | assert!(parse("prefix", "prefix.00000000001.log").is_err()); // Too many digits 99 | assert!(parse("prefix", "prefix.000000001.log").is_err()); // Too few digits 100 | assert!(parse("prefix", "prefix.000000000a.log").is_err()); // Non-numeric ID 101 | assert!(parse("prefix", "prefix.0000000000").is_err()); // Missing .log suffix 102 | assert!(parse("prefix", "prefix0000000000.log").is_err()); // Missing dot after prefix 103 | assert!(parse("prefix", "wrongprefix.0000000000.log").is_err()); // Wrong prefix 104 | assert!(parse("prefix", ".0000000000.log").is_err()); // Missing prefix 105 | assert!(parse("prefix", "prefix..log").is_err()); // Missing ID 106 | assert!(parse("prefix", "prefix.0000000000.wrongsuffix").is_err()); // Wrong suffix 107 | 108 | // Adversarial cases 109 | assert!(parse("prefix", "prefix.0000000000.logx").is_err()); // Extra character after .log 110 | assert!(parse("prefix", "xprefix.0000000000.log").is_err()); // Extra character before prefix 111 | assert!(parse("prefix", "prefix.00000000001log").is_err()); // Missing dot before log 112 | assert!(parse("prefix", "prefix.0000000000.log.").is_err()); // Extra dot at the end 113 | assert!(parse("prefix", "prefix.4294967296.log").is_err()); // ID overflow (u32::MAX + 1) 114 | assert!(parse("prefix", "prefix.0x0000000A.log").is_err()); // Hexadecimal ID 115 | assert_eq!( 116 | 
parse("prefix.with.dots", "prefix.with.dots.0000000000.log").unwrap(), 117 | 0 118 | ); // Prefix with dots 119 | } 120 | } 121 | -------------------------------------------------------------------------------- /nomt/src/store/flock.rs: -------------------------------------------------------------------------------- 1 | //! This module provides a cross-platform advisory lock on a directory. 2 | 3 | use std::{ 4 | fs::{File, OpenOptions}, 5 | path::Path, 6 | }; 7 | 8 | /// Represents a cross-platform advisory lock on a directory. 9 | pub struct Flock { 10 | lock_fd: File, 11 | } 12 | 13 | impl Flock { 14 | pub fn lock(db_dir: &Path, lock_filename: &str) -> anyhow::Result { 15 | let lock_path = db_dir.join(lock_filename); 16 | 17 | let lock_fd = OpenOptions::new() 18 | .read(true) 19 | .write(true) 20 | .create(true) 21 | .open(lock_path)?; 22 | 23 | match crate::sys::unix::try_lock_exclusive(&lock_fd) { 24 | Ok(_) => Ok(Self { lock_fd }), 25 | Err(e) => { 26 | anyhow::bail!("Failed to lock directory: {e}"); 27 | } 28 | } 29 | } 30 | } 31 | 32 | impl Drop for Flock { 33 | fn drop(&mut self) { 34 | if let Err(e) = crate::sys::unix::unlock(&self.lock_fd) { 35 | eprintln!("Failed to unlock directory lock: {e}"); 36 | } 37 | } 38 | } 39 | -------------------------------------------------------------------------------- /nomt/src/store/page_loader.rs: -------------------------------------------------------------------------------- 1 | use crate::{bitbox, io::IoHandle}; 2 | use nomt_core::page_id::PageId; 3 | 4 | pub use bitbox::PageLoad; 5 | 6 | pub struct PageLoader { 7 | pub(super) inner: bitbox::PageLoader, 8 | } 9 | 10 | impl PageLoader { 11 | /// Create a new page load. 12 | pub fn start_load(&self, page_id: PageId) -> PageLoad { 13 | self.inner.start_load(page_id) 14 | } 15 | 16 | /// Advance the state of the given page load, blocking the current thread. 17 | /// 18 | /// Panics if the page load needs a completion or if the I/O pool is down. 19 | /// 20 | /// This returns `true` if the page request has been submitted and a completion will be 21 | /// coming. `false` means that the page is guaranteed to be fresh. 
22 | pub fn probe(&self, load: &mut PageLoad, io_handle: &IoHandle, user_data: u64) -> bool { 23 | self.inner.probe(load, io_handle, user_data) 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /nomt/src/store/sync.rs: -------------------------------------------------------------------------------- 1 | use nomt_core::page_id::PageId; 2 | 3 | use super::{ 4 | meta::{self, Meta}, 5 | DirtyPage, Shared, 6 | }; 7 | use crate::{beatree, bitbox, options::PanicOnSyncMode, page_cache::PageCache, rollback}; 8 | 9 | pub struct Sync { 10 | pub(crate) sync_seqn: u32, 11 | pub(crate) bitbox_num_pages: u32, 12 | pub(crate) bitbox_seed: [u8; 16], 13 | pub(crate) panic_on_sync: Option, 14 | } 15 | 16 | impl Sync { 17 | pub fn new( 18 | sync_seqn: u32, 19 | bitbox_num_pages: u32, 20 | bitbox_seed: [u8; 16], 21 | panic_on_sync: Option, 22 | ) -> Self { 23 | Self { 24 | sync_seqn, 25 | bitbox_num_pages, 26 | bitbox_seed, 27 | panic_on_sync, 28 | } 29 | } 30 | 31 | pub fn sync( 32 | &mut self, 33 | shared: &Shared, 34 | value_tx: impl IntoIterator + Send + 'static, 35 | bitbox: bitbox::DB, 36 | beatree: beatree::Tree, 37 | rollback: Option, 38 | page_cache: PageCache, 39 | updated_pages: impl IntoIterator + Send + 'static, 40 | ) -> anyhow::Result<()> { 41 | let sync_seqn = self.sync_seqn + 1; 42 | 43 | let mut bitbox_sync = bitbox.sync(); 44 | let mut beatree_sync = beatree.sync(); 45 | let mut rollback_sync = rollback.map(|rollback| rollback.sync()); 46 | 47 | bitbox_sync.begin_sync(sync_seqn, page_cache, updated_pages); 48 | beatree_sync.begin_sync(value_tx); 49 | let (rollback_start_live, rollback_end_live) = match rollback_sync { 50 | Some(ref mut rollback) => rollback.begin_sync(), 51 | None => (0, 0), 52 | }; 53 | 54 | bitbox_sync.wait_pre_meta()?; 55 | let beatree_meta_wd = beatree_sync.wait_pre_meta()?; 56 | 57 | if let Some(PanicOnSyncMode::PostWal) = self.panic_on_sync { 58 | panic!("panic_on_sync is true (post-wal)") 59 | } 60 | 61 | let new_meta = Meta { 62 | magic: meta::MAGIC, 63 | version: meta::VERSION, 64 | ln_freelist_pn: beatree_meta_wd.ln_freelist_pn, 65 | ln_bump: beatree_meta_wd.ln_bump, 66 | bbn_freelist_pn: beatree_meta_wd.bbn_freelist_pn, 67 | bbn_bump: beatree_meta_wd.bbn_bump, 68 | sync_seqn, 69 | bitbox_num_pages: self.bitbox_num_pages, 70 | bitbox_seed: self.bitbox_seed, 71 | rollback_start_live, 72 | rollback_end_live, 73 | }; 74 | Meta::write(&shared.io_pool.page_pool(), &shared.meta_fd, &new_meta)?; 75 | self.sync_seqn += 1; 76 | 77 | if let Some(PanicOnSyncMode::PostMeta) = self.panic_on_sync { 78 | panic!("panic_on_sync is true (post-meta)"); 79 | } 80 | 81 | if let Some(ref mut rollback) = rollback_sync { 82 | rollback.post_meta(); 83 | } 84 | 85 | bitbox_sync.post_meta(shared.io_pool.make_handle())?; 86 | beatree_sync.post_meta(); 87 | 88 | if let Some(ref rollback) = rollback_sync { 89 | rollback.wait_post_meta()?; 90 | } 91 | Ok(()) 92 | } 93 | } 94 | -------------------------------------------------------------------------------- /nomt/src/sys/linux.rs: -------------------------------------------------------------------------------- 1 | //! Linux-specific code. 2 | 3 | use super::unix::cvt_r; 4 | use std::fs::File; 5 | use std::os::fd::AsRawFd; 6 | 7 | /// Returns an instance of `FsCheck` for the given file. 8 | pub fn fs_check(file: &File) -> std::io::Result { 9 | unsafe { 10 | // SAFETY: unsafe because ffi call. This should be IO-safe because the file is passed 11 | // by reference. 
This should be memory-safe because the `statfs` struct is 12 | // zeroed and the `f_type` field should be set by the ffi call. 13 | let mut stat: libc::statfs = std::mem::zeroed(); 14 | cvt_r(|| libc::fstatfs(file.as_raw_fd(), &mut stat))?; 15 | Ok(FsCheck { stat }) 16 | } 17 | } 18 | 19 | /// A utility struct to get filesystem information at a given path. 20 | pub struct FsCheck { 21 | stat: libc::statfs, 22 | } 23 | 24 | impl FsCheck { 25 | /// Returns true if the filesystem is tmpfs. 26 | pub fn is_tmpfs(&self) -> bool { 27 | self.stat.f_type == libc::TMPFS_MAGIC 28 | } 29 | } 30 | 31 | /// fallocate changes the size of the file to the given length if it's less than the current size. 32 | /// If the file is larger than the given length, the file is not truncated. 33 | /// 34 | /// Doesn't work on tmpfs. 35 | pub fn falloc_zero_file(file: &File, len: u64) -> std::io::Result<()> { 36 | cvt_r(|| unsafe { 37 | // SAFETY: unsafe because ffi call. This should be IO-safe because the file is passed 38 | // by reference. 39 | libc::fallocate( 40 | file.as_raw_fd(), 41 | libc::FALLOC_FL_ZERO_RANGE, 42 | 0 as _, 43 | len as _, 44 | ) 45 | }) 46 | .map(drop) 47 | } 48 | -------------------------------------------------------------------------------- /nomt/src/sys/macos.rs: -------------------------------------------------------------------------------- 1 | //! macOS-specific code. 2 | -------------------------------------------------------------------------------- /nomt/src/sys/mod.rs: -------------------------------------------------------------------------------- 1 | //! Platform-specific code. 2 | //! 3 | //! At the moment we only target Linux and macOS. 4 | 5 | cfg_if::cfg_if! { 6 | if #[cfg(target_os = "linux")] { 7 | pub mod linux; 8 | pub mod unix; 9 | } else if #[cfg(target_os = "macos")] { 10 | pub mod macos; 11 | pub mod unix; 12 | } 13 | } 14 | -------------------------------------------------------------------------------- /nomt/src/sys/unix.rs: -------------------------------------------------------------------------------- 1 | //! Common Unix definitions. 2 | 3 | use std::{fs::File, os::fd::AsRawFd as _}; 4 | 5 | pub fn try_lock_exclusive(file: &File) -> std::io::Result<()> { 6 | cvt_r(|| unsafe { libc::flock(file.as_raw_fd(), libc::LOCK_EX | libc::LOCK_NB) }).map(drop) 7 | } 8 | 9 | pub fn unlock(file: &File) -> std::io::Result<()> { 10 | unsafe { cvt_r(|| libc::flock(file.as_raw_fd(), libc::LOCK_UN)).map(drop) } 11 | } 12 | 13 | pub(super) fn cvt_r(mut f: F) -> std::io::Result 14 | where 15 | F: FnMut() -> i32, 16 | { 17 | fn cvt(res: i32) -> std::io::Result { 18 | if res == -1 { 19 | Err(std::io::Error::last_os_error()) 20 | } else { 21 | Ok(res) 22 | } 23 | } 24 | 25 | loop { 26 | match cvt(f()) { 27 | Err(ref e) if e.kind() == std::io::ErrorKind::Interrupted => (), 28 | other => break other, 29 | } 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /nomt/src/task.rs: -------------------------------------------------------------------------------- 1 | pub type TaskResult = std::thread::Result; 2 | 3 | /// Spawn the given task within the given ThreadPool. 4 | /// Use the provided Sender to send the result of the task execution. 5 | /// 6 | /// The result will contain the effective result or the payload 7 | /// of the panic that occurred. 
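///
/// A minimal pairing with [`join_task`] (illustrative; `expensive_work` is a
/// placeholder):
///
/// ```ignore
/// let (tx, rx) = crossbeam_channel::bounded(1);
/// spawn_task(&thread_pool, || expensive_work(), tx);
/// // ... other work on the current thread ...
/// let result = join_task(&rx); // re-raises the panic if the task panicked
/// ```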
8 | pub fn spawn_task( 9 | thread_pool: &threadpool::ThreadPool, 10 | task: F, 11 | tx: crossbeam_channel::Sender>, 12 | ) where 13 | R: Send + 'static, 14 | F: FnOnce() -> R + Send + 'static, 15 | { 16 | thread_pool.execute(move || { 17 | let res = std::panic::catch_unwind(std::panic::AssertUnwindSafe(|| task())); 18 | let _ = tx.send(res); 19 | }); 20 | } 21 | 22 | /// Blocks waiting for completion of the task spawned with [`spawn_task`]. 23 | /// It requires the receiver associated to the sender used to spawn the task. 24 | /// 25 | /// Panics if the sender is dropped. 26 | pub fn join_task(receiver: &crossbeam_channel::Receiver>) -> R 27 | where 28 | R: Send + 'static, 29 | { 30 | // UNWRAP: The sender is not expected to be dropped by the spawned task. 31 | let res = receiver.recv().unwrap(); 32 | match res { 33 | Ok(res) => res, 34 | Err(err_payload) => std::panic::resume_unwind(err_payload), 35 | } 36 | } 37 | -------------------------------------------------------------------------------- /nomt/tests/add_remove.rs: -------------------------------------------------------------------------------- 1 | mod common; 2 | 3 | use common::Test; 4 | use hex_literal::hex; 5 | use nomt::trie::Node; 6 | 7 | #[test] 8 | fn add_remove_1000() { 9 | let mut accounts = 0; 10 | let mut t = Test::new("add_remove"); 11 | 12 | let expected_roots = [ 13 | hex!("0000000000000000000000000000000000000000000000000000000000000000"), 14 | hex!("4a7a6fe118037086a49ff10484f4d80b0a9f31f1060eeb1c9f0162634604b0d9"), 15 | hex!("7d5b013105d7b835225256f2233a458e1a158a53d20e0d3834886df89a26c27b"), 16 | hex!("1a290e07bcacfb58ddcd0b9da348c740ca1bf87b05ed96752a1503ed7c187b69"), 17 | hex!("5e9abfee6d927b084fed3e1306bbe65f0880d0b7de12522c38813014927f1336"), 18 | hex!("57b39e06b2ee98dccd882033eb4136f5376699128b421c83bdc7c6ca96168938"), 19 | hex!("7fd75809ef0e2133102eb5e31e47cb577149dcaebb42cddeb2fd6754256b365f"), 20 | hex!("7c00cb11ec8262385078613e7b7977e50b0751f8cb2384fdccc048eea02acb63"), 21 | hex!("516d6911c3b0a36c9227922ca0273a4aee44886201bd186f7ee7e538a769eaa5"), 22 | hex!("381b24719ff91b13d36cf0dd7622f391f4a461452ed7547a46a992ee4a4025aa"), 23 | hex!("207793e2ce76c1feb68c7259f883229f985706c8cc2fcf99f481b622a54ba375"), 24 | ]; 25 | 26 | let mut root = Node::default(); 27 | for i in 0..10 { 28 | let _ = t.read_id(0); 29 | for _ in 0..100 { 30 | common::set_balance(&mut t, accounts, 1000); 31 | accounts += 1; 32 | } 33 | { 34 | root = t.commit().0.into_inner(); 35 | } 36 | 37 | assert_eq!(root, common::expected_root(accounts)); 38 | assert_eq!(root, expected_roots[i + 1]); 39 | } 40 | 41 | assert_eq!(root, expected_roots[10]); 42 | 43 | for i in 0..10 { 44 | for _ in 0..100 { 45 | accounts -= 1; 46 | common::kill(&mut t, accounts); 47 | } 48 | { 49 | root = t.commit().0.into_inner(); 50 | } 51 | 52 | assert_eq!(root, common::expected_root(accounts)); 53 | assert_eq!(root, expected_roots[10 - i - 1]); 54 | } 55 | } 56 | -------------------------------------------------------------------------------- /nomt/tests/compute_root.rs: -------------------------------------------------------------------------------- 1 | mod common; 2 | 3 | use common::Test; 4 | use nomt::{hasher::Blake3Hasher, trie::NodeKind}; 5 | 6 | #[test] 7 | fn root_on_empty_db() { 8 | let t = Test::new("compute_root_empty"); 9 | let root = t.root(); 10 | assert_eq!( 11 | NodeKind::of::(&root.into_inner()), 12 | NodeKind::Terminator 13 | ); 14 | } 15 | 16 | #[test] 17 | fn root_on_leaf() { 18 | { 19 | let mut t = Test::new("compute_root_leaf"); 20 | t.write([1; 
32], Some(vec![1, 2, 3])); 21 | t.commit(); 22 | } 23 | 24 | let t = Test::new_with_params("compute_root_leaf", 1, 1, None, false); 25 | let root = t.root(); 26 | assert_eq!( 27 | NodeKind::of::(&root.into_inner()), 28 | NodeKind::Leaf 29 | ); 30 | } 31 | 32 | #[test] 33 | fn root_on_internal() { 34 | { 35 | let mut t = Test::new("compute_root_internal"); 36 | t.write([0; 32], Some(vec![1, 2, 3])); 37 | t.write([1; 32], Some(vec![1, 2, 3])); 38 | t.commit(); 39 | } 40 | 41 | let t = Test::new_with_params("compute_root_internal", 1, 1, None, false); 42 | let root = t.root(); 43 | assert_eq!( 44 | NodeKind::of::(&root.into_inner()), 45 | NodeKind::Internal 46 | ); 47 | } 48 | -------------------------------------------------------------------------------- /nomt/tests/exclusive_dir.rs: -------------------------------------------------------------------------------- 1 | //! Tests the directory lock behavior. 2 | 3 | use std::path::PathBuf; 4 | 5 | use nomt::{hasher::Blake3Hasher, Nomt, Options}; 6 | 7 | fn setup_nomt(path: &str, should_clean_up: bool) -> anyhow::Result> { 8 | let path = { 9 | let mut p = PathBuf::from("test"); 10 | p.push(path); 11 | p 12 | }; 13 | if should_clean_up && path.exists() { 14 | std::fs::remove_dir_all(&path)?; 15 | } 16 | let mut o = Options::new(); 17 | o.path(path); 18 | o.bitbox_seed([0; 16]); 19 | Nomt::open(o) 20 | } 21 | 22 | #[test] 23 | fn smoke() { 24 | let _nomt = setup_nomt("smoke", true).unwrap(); 25 | } 26 | 27 | #[test] 28 | fn dir_lock() { 29 | let _nomt_1 = setup_nomt("dir_lock", true).unwrap(); 30 | let nomt_2 = setup_nomt("dir_lock", false); 31 | assert!(matches!(nomt_2, Err(e) if e.to_string().contains("Resource temporarily unavailable"))); 32 | } 33 | 34 | #[test] 35 | fn dir_unlock() { 36 | let nomt_1 = setup_nomt("dir_unlock", true).unwrap(); 37 | drop(nomt_1); 38 | let _nomt_2 = setup_nomt("dir_unlock", false).unwrap(); 39 | } 40 | -------------------------------------------------------------------------------- /nomt/tests/extend_range_protocol.rs: -------------------------------------------------------------------------------- 1 | mod common; 2 | use common::Test; 3 | use std::path::Path; 4 | 5 | // nomt::beatree::branch::LEAF_NODE_BODY_SIZE is 6 | // expected to be 4096 and thus the merge threshold is 2047. 7 | // 8 | // This parameter makes it possible to define the following vector of 9 | // keys and values whose size, when inserted into the database, will result 10 | // in the expected set of leaves. Each line adheres to the half full 11 | // requirement, and the first element of the next row does not fit 12 | // in the previous leaf, requiring a new one. The last row does not 13 | // need to meet the half full requirement, as it may be the rightmost leaf. 14 | #[rustfmt::skip] 15 | const KEYS_AND_VALUE_SIZES: [(u8, usize); 16] =[ 16 | // leaf 1 17 | (1, 1100), (2, 1000), (3, 1000), 18 | // leaf 2 19 | (4, 900), (5, 900), (7, 900), (8, 900), 20 | // leaf 3 21 | (10, 1200), (11, 1100), (13, 700), 22 | // leaf 4 23 | (15, 1300), (16, 1100), (17, 700), 24 | // leaf 5 25 | (18, 1100), (19, 1000), (20, 500), 26 | ]; 27 | 28 | // 2 update workers will be used and the first half of `to_delete` items 29 | // which fall under the same set of leaves are assigned to the first worker 30 | // and all the remaining keys to the next worker. 
This makes possible 31 | // to expect the type of communication between the two workers 32 | fn insert_delete_and_read(name: impl AsRef, to_delete: Vec) { 33 | let mut t = Test::new_with_params(name, 2, 64_000, None, true); 34 | 35 | // insert values 36 | for (k, value_size) in KEYS_AND_VALUE_SIZES.clone() { 37 | t.write(key(k), Some(vec![k; value_size])); 38 | } 39 | t.commit(); 40 | 41 | // delete values 42 | for k in to_delete.clone() { 43 | t.write(key(k), None); 44 | } 45 | t.commit(); 46 | 47 | // read values 48 | for (k, value_size) in KEYS_AND_VALUE_SIZES.clone() { 49 | if to_delete.contains(&k) { 50 | let res = t.read(key(k)); 51 | assert_eq!(None, res); 52 | } else { 53 | let value = vec![k; value_size]; 54 | let res = t.read(key(k)); 55 | assert_eq!(Some(value), res); 56 | } 57 | } 58 | } 59 | 60 | fn key(id: u8) -> [u8; 32] { 61 | let mut key = [0; 32]; 62 | key[0] = id; 63 | key 64 | } 65 | 66 | #[test] 67 | fn extend_range_protocol_underfull_to_degenerate_split() { 68 | insert_delete_and_read("underfull_to_degenerate_split", vec![7, 8, 13]) 69 | } 70 | 71 | #[test] 72 | fn extend_range_protocol_final_unchanged_range() { 73 | insert_delete_and_read("final_unchanged_range", vec![7, 8, 10, 11, 13]) 74 | } 75 | 76 | #[test] 77 | fn extend_range_protocol_unchanged_range_to_changed() { 78 | insert_delete_and_read("unchanged_range_to_changed", vec![7, 8, 10, 11, 13, 20]) 79 | } 80 | 81 | #[test] 82 | fn extend_range_protocol_remove_cutoff() { 83 | insert_delete_and_read( 84 | "remove_cutoff", 85 | vec![7, 8, 10, 11, 13, 15, 16, 17, 18, 19, 20], 86 | ); 87 | } 88 | -------------------------------------------------------------------------------- /nomt/tests/fill_and_empty.rs: -------------------------------------------------------------------------------- 1 | mod common; 2 | use common::Test; 3 | use rand::{prelude::SliceRandom, Rng, SeedableRng}; 4 | use std::time::{SystemTime, UNIX_EPOCH}; 5 | 6 | fn seed() -> [u8; 16] { 7 | SystemTime::now() 8 | .duration_since(UNIX_EPOCH) 9 | .expect("no time?") 10 | .as_nanos() 11 | .to_le_bytes()[0..16] 12 | .try_into() 13 | .unwrap() 14 | } 15 | 16 | fn fill_and_empty(seed: [u8; 16], commit_concurrency: usize) { 17 | let mut rng = rand_pcg::Lcg64Xsh32::from_seed(seed); 18 | 19 | let db_size = 1 << 12; 20 | let commit_size = db_size / 16; 21 | 22 | let mut items = std::collections::BTreeSet::new(); 23 | while items.len() < db_size as usize { 24 | items.insert(rand_key(&mut rng)); 25 | } 26 | let mut items: Vec<_> = items.into_iter().collect(); 27 | items.shuffle(&mut rng); 28 | 29 | let mut to_delete: Vec = (0..db_size as usize).collect(); 30 | to_delete.shuffle(&mut rng); 31 | 32 | let mut t = Test::new_with_params( 33 | format!("fill_and_empty_{}", commit_concurrency), // name 34 | commit_concurrency, 35 | 15000, // hashtable_buckets 36 | None, // panic_on_sync 37 | true, // cleanup_dir 38 | ); 39 | 40 | // inserting all the values 41 | let mut to_check = vec![]; 42 | for i in 0..db_size { 43 | let key = items[i]; 44 | let value = vec![i as u8; 400]; 45 | 46 | to_check.push((key, value.clone())); 47 | t.write(key, Some(value)); 48 | 49 | if (i + 1) % commit_size == 0 { 50 | t.commit(); 51 | // check for presence 52 | for (key, value) in to_check.drain(..) 
{ 53 | assert_eq!(t.read(key), Some(value)); 54 | } 55 | } 56 | } 57 | 58 | // deleting all the values in different order 59 | let mut to_check = vec![]; 60 | for i in 0..db_size { 61 | let key = items[to_delete[i]]; 62 | 63 | to_check.push(key); 64 | t.write(key, None); 65 | 66 | if (i + 1) % commit_size == 0 { 67 | t.commit(); 68 | // check for absence 69 | for key in to_check.drain(..) { 70 | assert_eq!(t.read(key), None); 71 | } 72 | } 73 | } 74 | 75 | assert!(t.commit().0.is_empty()); 76 | } 77 | 78 | fn rand_key(rng: &mut impl Rng) -> [u8; 32] { 79 | let mut key = [0; 32]; 80 | rng.fill(&mut key[..]); 81 | key 82 | } 83 | 84 | #[test] 85 | fn fill_and_empty_1_commit_worker() { 86 | let seed = seed(); 87 | let test_result = std::panic::catch_unwind(|| { 88 | fill_and_empty(seed, 1); 89 | }); 90 | if let Err(cause) = test_result { 91 | eprintln!( 92 | "fill_and_empty_1_commit_worker failed with seed: {:?}", 93 | seed 94 | ); 95 | std::panic::resume_unwind(cause); 96 | } 97 | } 98 | 99 | #[test] 100 | fn fill_and_empty_64_commit_worker() { 101 | let seed = seed(); 102 | let test_result = std::panic::catch_unwind(|| { 103 | fill_and_empty(seed, 64); 104 | }); 105 | if let Err(cause) = test_result { 106 | eprintln!( 107 | "fill_and_empty_64_commit_worker failed with seed: {:?}", 108 | seed 109 | ); 110 | std::panic::resume_unwind(cause); 111 | } 112 | } 113 | -------------------------------------------------------------------------------- /nomt/tests/large_values.rs: -------------------------------------------------------------------------------- 1 | mod common; 2 | 3 | use common::Test; 4 | 5 | #[test] 6 | fn large_values() { 7 | let mut t = Test::new("large_values"); 8 | 9 | let large1 = vec![1; 4096 * 128]; 10 | let large2 = vec![2; 4096 * 80 - 1245]; 11 | 12 | t.write_id(0, Some(large1.clone())); 13 | t.write_id(1, Some(large2.clone())); 14 | let _ = t.commit(); 15 | assert_eq!(&*t.read_id(0).unwrap(), &large1); 16 | assert_eq!(&*t.read_id(1).unwrap(), &large2); 17 | t.write_id(1, None); 18 | let _ = t.commit(); 19 | assert_eq!(&*t.read_id(0).unwrap(), &large1); 20 | assert!(t.read_id(1).is_none()); 21 | } 22 | -------------------------------------------------------------------------------- /nomt/tests/last_layer_trie.rs: -------------------------------------------------------------------------------- 1 | mod common; 2 | 3 | use common::Test; 4 | 5 | #[test] 6 | fn last_layer_trie() { 7 | let mut t = Test::new_with_params( 8 | "last_layer_trie", // name 9 | 1, // commit_concurrency 10 | 10_000, // hashtable_buckets 11 | None, // panic_on_sync 12 | true, // cleanup_dir 13 | ); 14 | 15 | let key1 = [170; 32]; 16 | let mut key2 = key1.clone(); 17 | key2[31] = 171; 18 | 19 | // write two leaf nodes at the last layer of the trie 20 | t.write(key1, Some(vec![1; 128])); 21 | t.write(key2, Some(vec![2; 128])); 22 | t.commit(); 23 | assert_eq!(t.read(key1), Some(vec![1; 128])); 24 | assert_eq!(t.read(key2), Some(vec![2; 128])); 25 | 26 | // modify two leaf nodes at the last layer of the trie 27 | t.write(key1, Some(vec![3; 100])); 28 | t.write(key2, Some(vec![4; 100])); 29 | t.commit(); 30 | assert_eq!(t.read(key1), Some(vec![3; 100])); 31 | assert_eq!(t.read(key2), Some(vec![4; 100])); 32 | 33 | // delete two leaf nodes at the last layer of the trie 34 | t.write(key1, None); 35 | t.write(key2, None); 36 | t.commit(); 37 | assert_eq!(t.read(key1), None); 38 | assert_eq!(t.read(key2), None); 39 | } 40 | -------------------------------------------------------------------------------- 
/nomt/tests/overlay.rs: -------------------------------------------------------------------------------- 1 | mod common; 2 | 3 | use common::Test; 4 | 5 | fn expected_root(items: Vec<([u8; 32], Vec)>) -> nomt_core::trie::Node { 6 | nomt_core::update::build_trie::( 7 | 0, 8 | items 9 | .into_iter() 10 | .map(|(k, v)| (k, *blake3::hash(&v).as_bytes())), 11 | |_| {}, 12 | ) 13 | } 14 | 15 | #[test] 16 | fn overlay_multiple_forks() { 17 | let mut test = Test::new("overlay_multiple_forks"); 18 | 19 | let overlay_a = test.update().0; 20 | let overlay_b1 = { 21 | test.start_overlay_session([&overlay_a]); 22 | test.write([1; 32], Some(vec![1, 2, 3])); 23 | test.update().0 24 | }; 25 | let overlay_b2 = { 26 | test.start_overlay_session([&overlay_a]); 27 | test.write([1; 32], Some(vec![4, 5, 6])); 28 | test.update().0 29 | }; 30 | 31 | { 32 | test.start_overlay_session([&overlay_b1, &overlay_a]); 33 | assert_eq!(test.read([1; 32]), Some(vec![1, 2, 3])); 34 | } 35 | 36 | { 37 | test.start_overlay_session([&overlay_b2, &overlay_a]); 38 | assert_eq!(test.read([1; 32]), Some(vec![4, 5, 6])); 39 | } 40 | } 41 | 42 | #[test] 43 | fn overlay_root_calculation() { 44 | let mut test = Test::new("overlay_root_calculation"); 45 | test.write([1; 32], Some(vec![1, 2, 3])); 46 | let overlay_a = test.update().0; 47 | 48 | assert_eq!( 49 | overlay_a.root().into_inner(), 50 | expected_root(vec![([1; 32], vec![1, 2, 3])]), 51 | ); 52 | 53 | test.start_overlay_session([&overlay_a]); 54 | test.write([2; 32], Some(vec![4, 5, 6])); 55 | let overlay_b = test.update().0; 56 | 57 | assert_eq!( 58 | overlay_b.root().into_inner(), 59 | expected_root(vec![([1; 32], vec![1, 2, 3]), ([2; 32], vec![4, 5, 6])]), 60 | ); 61 | 62 | test.start_overlay_session([&overlay_b, &overlay_a]); 63 | test.write([1; 32], Some(vec![7, 8, 9])); 64 | test.write([3; 32], Some(vec![0, 1, 0])); 65 | let overlay_c = test.update().0; 66 | 67 | assert_eq!( 68 | overlay_c.root().into_inner(), 69 | expected_root(vec![ 70 | ([1; 32], vec![7, 8, 9]), 71 | ([2; 32], vec![4, 5, 6]), 72 | ([3; 32], vec![0, 1, 0]) 73 | ]), 74 | ); 75 | } 76 | 77 | #[test] 78 | #[should_panic] 79 | fn overlays_must_be_committed_in_order() { 80 | let mut test = Test::new("overlays_committed_in_order"); 81 | let overlay_a = test.update().0; 82 | test.start_overlay_session([&overlay_a]); 83 | let overlay_b = test.update().0; 84 | 85 | test.commit_overlay(overlay_b); 86 | } 87 | 88 | #[test] 89 | #[should_panic] 90 | fn overlay_competing_committed() { 91 | let mut test = Test::new("overlays_competing_committed"); 92 | let overlay_a = test.update().0; 93 | test.start_overlay_session([&overlay_a]); 94 | let overlay_b1 = test.update().0; 95 | test.start_overlay_session([&overlay_a]); 96 | let overlay_b2 = test.update().0; 97 | 98 | test.commit_overlay(overlay_a); 99 | test.commit_overlay(overlay_b1); 100 | 101 | test.commit_overlay(overlay_b2); 102 | } 103 | 104 | #[test] 105 | fn overlay_commit_in_order_works() { 106 | let mut test = Test::new("overlays_commit_in_order_works"); 107 | let overlay_a = test.update().0; 108 | test.start_overlay_session([&overlay_a]); 109 | let overlay_b = test.update().0; 110 | 111 | test.commit_overlay(overlay_a); 112 | test.commit_overlay(overlay_b); 113 | } 114 | 115 | #[test] 116 | fn overlay_changes_land_on_disk_when_committed() { 117 | { 118 | let mut test = Test::new("overlay_changes_land_on_disk"); 119 | test.write([1; 32], Some(vec![1, 2, 3])); 120 | test.write([2; 32], Some(vec![4, 5, 6])); 121 | test.write([3; 32], Some(vec![7, 8, 9])); 122 | 
123 | let overlay = test.update().0; 124 | test.commit_overlay(overlay); 125 | } 126 | 127 | let mut test = Test::new_with_params( 128 | "overlay_changes_land_on_disk", 129 | /* commit_concurrency */ 1, 130 | /* hashtable_buckets */ 1, 131 | /* panic_on_sync */ None, 132 | /* cleanup_dir */ false, 133 | ); 134 | 135 | assert_eq!(test.read([1; 32]), Some(vec![1, 2, 3])); 136 | assert_eq!(test.read([2; 32]), Some(vec![4, 5, 6])); 137 | assert_eq!(test.read([3; 32]), Some(vec![7, 8, 9])); 138 | } 139 | 140 | #[test] 141 | fn overlay_uncommitted_not_on_disk() { 142 | { 143 | let mut test = Test::new("overlay_uncommitted_not_on_disk"); 144 | test.write([1; 32], Some(vec![1, 2, 3])); 145 | test.write([2; 32], Some(vec![4, 5, 6])); 146 | test.write([3; 32], Some(vec![7, 8, 9])); 147 | 148 | let _overlay = test.update().0; 149 | } 150 | 151 | let mut test = Test::new_with_params( 152 | "overlay_uncommitted_not_on_disk", 153 | /* commit_concurrency */ 1, 154 | /* hashtable_buckets */ 1, 155 | /* panic_on_sync */ None, 156 | /* cleanup_dir */ false, 157 | ); 158 | 159 | assert_eq!(test.read([1; 32]), None); 160 | assert_eq!(test.read([2; 32]), None); 161 | assert_eq!(test.read([3; 32]), None); 162 | } 163 | -------------------------------------------------------------------------------- /nomt/tests/prev_root_check.rs: -------------------------------------------------------------------------------- 1 | use nomt::{hasher::Blake3Hasher, KeyReadWrite, Nomt, Options, SessionParams}; 2 | use std::path::PathBuf; 3 | 4 | /// Setup a NOMT with the given path, rollback enabled, and the given commit concurrency. 5 | /// 6 | /// It's important that tests that run in parallel don't use the same path. 7 | fn setup_nomt(path: &str) -> Nomt { 8 | let path = { 9 | let mut p = PathBuf::from("test"); 10 | p.push(path); 11 | p 12 | }; 13 | if path.exists() { 14 | std::fs::remove_dir_all(&path).unwrap(); 15 | } 16 | let mut o = Options::new(); 17 | o.path(path); 18 | o.commit_concurrency(1); 19 | Nomt::open(o).unwrap() 20 | } 21 | 22 | #[test] 23 | fn test_prev_root_commits() { 24 | let nomt = setup_nomt("prev_root_commits"); 25 | let session1 = nomt.begin_session(SessionParams::default()); 26 | let finished1 = session1 27 | .finish(vec![([1; 32], KeyReadWrite::Write(Some(vec![1, 2, 3])))]) 28 | .unwrap(); 29 | 30 | let session2 = nomt.begin_session(SessionParams::default()); 31 | let finished2 = session2 32 | .finish(vec![([1; 32], KeyReadWrite::Write(Some(vec![1, 2, 3])))]) 33 | .unwrap(); 34 | 35 | finished1.commit(&nomt).unwrap(); 36 | 37 | finished2.commit(&nomt).unwrap_err(); 38 | } 39 | 40 | #[test] 41 | fn test_prev_root_overlay_invalidated() { 42 | let nomt = setup_nomt("prev_root_overlay_invalidated"); 43 | let session1 = nomt.begin_session(SessionParams::default()); 44 | let finished1 = session1 45 | .finish(vec![([1; 32], KeyReadWrite::Write(Some(vec![1, 2, 3])))]) 46 | .unwrap(); 47 | let overlay1 = finished1.into_overlay(); 48 | 49 | let session2 = nomt.begin_session(SessionParams::default()); 50 | let finished2 = session2 51 | .finish(vec![([1; 32], KeyReadWrite::Write(Some(vec![1, 2, 3])))]) 52 | .unwrap(); 53 | 54 | finished2.commit(&nomt).unwrap(); 55 | 56 | overlay1.commit(&nomt).unwrap_err(); 57 | } 58 | 59 | #[test] 60 | fn test_prev_root_overlay_invalidates_session() { 61 | let nomt = setup_nomt("prev_root_overlays"); 62 | let session1 = nomt.begin_session(SessionParams::default()); 63 | let finished1 = session1 64 | .finish(vec![([1; 32], KeyReadWrite::Write(Some(vec![1, 2, 3])))]) 65 | .unwrap(); 
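    // Committing `overlay1` advances the root, so `finished2` (produced against
    // the old root) must fail its prev-root check below.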
66 | let overlay1 = finished1.into_overlay(); 67 | 68 | let session2 = nomt.begin_session(SessionParams::default()); 69 | let finished2 = session2 70 | .finish(vec![([1; 32], KeyReadWrite::Write(Some(vec![1, 2, 3])))]) 71 | .unwrap(); 72 | 73 | overlay1.commit(&nomt).unwrap(); 74 | 75 | finished2.commit(&nomt).unwrap_err(); 76 | } 77 | -------------------------------------------------------------------------------- /nomt/tests/wal.rs: -------------------------------------------------------------------------------- 1 | mod common; 2 | 3 | use common::Test; 4 | use nomt::PanicOnSyncMode; 5 | 6 | #[test] 7 | fn wal_recovery_test_post_meta_swap() { 8 | // Initialize the db with panic on sync equals true. 9 | let mut t = Test::new_with_params( 10 | "wal_add_remove_1000", 11 | 1, // commit_concurrency, 12 | 1000000, // hashtable_buckets, 13 | Some(PanicOnSyncMode::PostMeta), // panic_on_sync 14 | true, // clean 15 | ); 16 | 17 | common::set_balance(&mut t, 0, 1000); 18 | common::set_balance(&mut t, 1, 2000); 19 | common::set_balance(&mut t, 2, 3000); 20 | 21 | let r = std::panic::catch_unwind(std::panic::AssertUnwindSafe(|| { 22 | t.commit(); 23 | })); 24 | assert!(r.is_err()); 25 | drop(t); 26 | 27 | // Re-open the db without cleaning the DB dir and without panic on sync. 28 | let mut t = Test::new_with_params( 29 | "wal_add_remove_1000", 30 | 1, // commit_concurrency, 31 | 1000000, // hashtable_buckets, 32 | None, // panic_on_sync 33 | false, // clean 34 | ); 35 | assert_eq!(common::read_balance(&mut t, 0), Some(1000)); 36 | assert_eq!(common::read_balance(&mut t, 1), Some(2000)); 37 | assert_eq!(common::read_balance(&mut t, 2), Some(3000)); 38 | } 39 | 40 | #[test] 41 | fn wal_recovery_test_pre_meta_swap() { 42 | // Initialize the db with panic on sync equals true. 43 | let mut t = Test::new_with_params( 44 | "wal_pre_meta_swap", 45 | 1, // commit_concurrency, 46 | 1000000, // hashtable_buckets, 47 | Some(PanicOnSyncMode::PostWal), // panic_on_sync 48 | true, // clean 49 | ); 50 | 51 | for i in 0..1000 { 52 | common::set_balance(&mut t, i, 1000); 53 | } 54 | 55 | let r = std::panic::catch_unwind(std::panic::AssertUnwindSafe(|| { 56 | t.commit(); 57 | })); 58 | assert!(r.is_err()); 59 | drop(t); 60 | 61 | // Re-open the db without cleaning the DB dir and without panic on sync. 62 | let mut t = Test::new_with_params( 63 | "wal_pre_meta_swap", 64 | 1, // commit_concurrency, 65 | 1000000, // hashtable_buckets, 66 | None, // panic_on_sync 67 | false, // clean 68 | ); 69 | 70 | // DB should open cleanly and not have any incomplete changes; the WAL is too new and will be 71 | // discarded. 72 | for i in 0..1000 { 73 | assert_eq!(common::read_balance(&mut t, i), None); 74 | } 75 | } 76 | -------------------------------------------------------------------------------- /nomt/tests/witness_check.rs: -------------------------------------------------------------------------------- 1 | mod common; 2 | 3 | use common::Test; 4 | use nomt::{hasher::Blake3Hasher, proof, trie::LeafData}; 5 | 6 | #[test] 7 | fn produced_witness_validity() { 8 | let mut accounts = 0; 9 | let mut t = Test::new("witness_validity"); 10 | 11 | let (prev_root, _) = { 12 | for _ in 0..10 { 13 | common::set_balance(&mut t, accounts, 1000); 14 | accounts += 1; 15 | } 16 | t.commit() 17 | }; 18 | 19 | let (new_root, witness) = { 20 | // read all existing accounts. 21 | for i in 0..accounts { 22 | t.read_id(i); 23 | } 24 | 25 | // read some nonexistent accounts. 
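        // (accounts 100..105 were never created, so these reads will be witnessed
        // as proofs of non-existence)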
26 |         for i in 100..105 {
27 |             t.read_id(i);
28 |         }
29 | 
30 |         // kill half the existing ones.
31 |         for i in 0..5 {
32 |             common::kill(&mut t, i);
33 |         }
34 | 
35 |         // and add 5 more.
36 |         for _ in 0..5 {
37 |             common::set_balance(&mut t, accounts, 1000);
38 |             accounts += 1;
39 |         }
40 |         t.commit()
41 |     };
42 | 
43 |     assert_eq!(witness.operations.reads.len(), 15); // 10 existing + 5 nonexisting
44 |     assert_eq!(witness.operations.writes.len(), 10); // 5 deletes + 5 inserts
45 | 
46 |     let mut updates = Vec::new();
47 |     for (i, witnessed_path) in witness.path_proofs.iter().enumerate() {
48 |         let verified = witnessed_path
49 |             .inner
50 |             .verify::<Blake3Hasher>(&witnessed_path.path.path(), prev_root.into_inner())
51 |             .unwrap();
52 |         for read in witness
53 |             .operations
54 |             .reads
55 |             .iter()
56 |             .skip_while(|r| r.path_index != i)
57 |             .take_while(|r| r.path_index == i)
58 |         {
59 |             match read.value {
60 |                 None => assert!(verified.confirm_nonexistence(&read.key).unwrap()),
61 |                 Some(ref v) => {
62 |                     let leaf = LeafData {
63 |                         key_path: read.key,
64 |                         value_hash: *v,
65 |                     };
66 |                     assert!(verified.confirm_value(&leaf).unwrap());
67 |                 }
68 |             }
69 |         }
70 | 
71 |         let mut write_ops = Vec::new();
72 |         for write in witness
73 |             .operations
74 |             .writes
75 |             .iter()
76 |             .skip_while(|r| r.path_index != i)
77 |             .take_while(|r| r.path_index == i)
78 |         {
79 |             write_ops.push((write.key, write.value.clone()));
80 |         }
81 | 
82 |         if !write_ops.is_empty() {
83 |             updates.push(proof::PathUpdate {
84 |                 inner: verified,
85 |                 ops: write_ops,
86 |             });
87 |         }
88 |     }
89 | 
90 |     assert_eq!(
91 |         proof::verify_update::<Blake3Hasher>(prev_root.into_inner(), &updates).unwrap(),
92 |         new_root.into_inner(),
93 |     );
94 | }
95 | 
96 | #[test]
97 | fn empty_witness() {
98 |     let mut accounts = 0;
99 |     let mut t = Test::new("empty_witness");
100 | 
101 |     let (prev_root, _) = {
102 |         for _ in 0..10 {
103 |             common::set_balance(&mut t, accounts, 1000);
104 |             accounts += 1;
105 |         }
106 |         t.commit()
107 |     };
108 | 
109 |     // Create a commit with no operations performed
110 |     let (new_root, witness) = t.commit();
111 | 
112 |     // The roots should be identical since no changes were made
113 |     assert_eq!(prev_root, new_root);
114 | 
115 |     // The witness should be empty
116 |     assert_eq!(witness.operations.reads.len(), 0);
117 |     assert_eq!(witness.operations.writes.len(), 0);
118 |     assert_eq!(witness.path_proofs.len(), 0);
119 | 
120 |     // Verify that an empty update produces the same root
121 |     let updates: Vec<proof::PathUpdate> = Vec::new();
122 |     assert_eq!(
123 |         proof::verify_update::<Blake3Hasher>(prev_root.into_inner(), &updates).unwrap(),
124 |         new_root.into_inner(),
125 |     );
126 | }
127 | 
128 | #[test]
129 | fn test_verify_update_with_identical_paths() {
130 |     use nomt::{
131 |         hasher::Blake3Hasher,
132 |         proof::{verify_update, PathUpdate},
133 |         trie::ValueHash,
134 |     };
135 | 
136 |     let account0 = 0;
137 | 
138 |     // Create a simple trie, create an update witness.
139 |     let mut t = Test::new("identical_paths_test");
140 |     common::set_balance(&mut t, account0, 1000);
141 |     let (root, _) = t.commit();
142 |     t.read_id(account0);
143 |     let (_, witness) = t.commit();
144 | 
145 |     // Using that witness extract and verify the proof.
146 |     let witnessed_path = &witness.path_proofs[0];
147 |     let verified_proof = witnessed_path
148 |         .inner
149 |         .verify::<Blake3Hasher>(&witnessed_path.path.path(), root.into_inner())
150 |         .unwrap();
151 | 
152 |     // Create two identical PathUpdate objects
153 |     let mut updates = Vec::new();
154 | 
155 |     // First update
156 |     let value1 = ValueHash::default();
157 |     let ops1 = vec![([0; 32], Some(value1))];
158 |     updates.push(PathUpdate {
159 |         inner: verified_proof.clone(),
160 |         ops: ops1,
161 |     });
162 | 
163 |     // Second update with identical path
164 |     let value2 = ValueHash::default();
165 |     let ops2 = vec![([1; 32], Some(value2))];
166 |     updates.push(PathUpdate {
167 |         inner: verified_proof, // Using the same verified proof
168 |         ops: ops2,
169 |     });
170 | 
171 |     // Try to verify the update. We expect an error due to identical paths, because that violates
172 |     // the requirement of ascending keys.
173 |     verify_update::<Blake3Hasher>(root.into_inner(), &updates).unwrap_err();
174 | }
175 | 
--------------------------------------------------------------------------------
/torture/Cargo.toml:
--------------------------------------------------------------------------------
1 | [package]
2 | name = "torture"
3 | version = "0.1.0"
4 | edition = "2021"
5 | 
6 | [dependencies]
7 | nix.workspace = true
8 | libc.workspace = true
9 | anyhow.workspace = true
10 | cfg-if.workspace = true
11 | serde.workspace = true
12 | bincode.workspace = true
13 | nomt = { path = "../nomt" }
14 | tokio.workspace = true
15 | tokio-util.workspace = true
16 | tokio-stream.workspace = true
17 | futures.workspace = true
18 | tempfile.workspace = true
19 | rand.workspace = true
20 | rand_pcg.workspace = true
21 | rand_distr.workspace = true
22 | imbl.workspace = true
23 | tokio-serde.workspace = true
24 | tracing.workspace = true
25 | tracing-subscriber.workspace = true
26 | hex.workspace = true
27 | futures-util.workspace = true
28 | clap.workspace = true
29 | trickfs = { path = "../trickfs" }
30 | which.workspace = true
31 | ruint.workspace = true
32 | 
--------------------------------------------------------------------------------
/torture/src/logging.rs:
--------------------------------------------------------------------------------
1 | use std::io::{self, IsTerminal as _};
2 | use std::path::Path;
3 | 
4 | use tracing::level_filters::LevelFilter;
5 | use tracing::{span, Level};
6 | use tracing_subscriber::fmt::MakeWriter;
7 | use tracing_subscriber::{fmt, EnvFilter};
8 | 
9 | const ENV_NAME_COMMON: &str = "TORTURE_ALL_LOG";
10 | const ENV_NAME_AGENT: &str = "TORTURE_AGENT_LOG";
11 | const ENV_NAME_SUPERVISOR: &str = "TORTURE_SUPERVISOR_LOG";
12 | 
13 | enum Kind {
14 |     Agent,
15 |     Supervisor,
16 | }
17 | 
18 | fn istty() -> bool {
19 |     io::stdout().is_terminal() && io::stderr().is_terminal()
20 | }
21 | 
22 | /// Creates an env filter for the agent or supervisor (depending on the `kind`
23 | /// argument).
24 | ///
25 | /// This function tries to read the most specific environment variable first, then falls back to
26 | /// the common one ([`ENV_NAME_COMMON`]).
27 | fn env_filter(kind: Kind) -> EnvFilter {
28 |     let specific_env_name = match kind {
29 |         Kind::Agent => ENV_NAME_AGENT,
30 |         Kind::Supervisor => ENV_NAME_SUPERVISOR,
31 |     };
32 | 
33 |     return try_parse_env(specific_env_name).unwrap_or_else(|| {
34 |         try_parse_env(ENV_NAME_COMMON).unwrap_or_else(|| {
35 |             EnvFilter::builder()
36 |                 .with_default_directive(LevelFilter::INFO.into())
37 |                 .parse("")
38 |                 .unwrap()
39 |         })
40 |     });
41 | 
42 |     fn try_parse_env(var_name: &str) -> Option<EnvFilter> {
43 |         match std::env::var(var_name) {
44 |             Ok(env) => Some(
45 |                 EnvFilter::builder()
46 |                     .with_default_directive(LevelFilter::INFO.into())
47 |                     .parse(env)
48 |                     .unwrap(),
49 |             ),
50 |             Err(std::env::VarError::NotPresent) => {
51 |                 return None;
52 |             }
53 |             Err(std::env::VarError::NotUnicode(_)) => {
54 |                 panic!("Environment variable {} is not unicode", var_name);
55 |             }
56 |         }
57 |     }
58 | }
59 | 
60 | fn create_subscriber<W>(kind: Kind, writer: W, ansi: bool) -> impl tracing::Subscriber
61 | where
62 |     W: for<'writer> MakeWriter<'writer> + 'static + Sync + Send,
63 | {
64 |     let format = fmt::format()
65 |         .with_level(true)
66 |         .with_target(false)
67 |         .with_thread_ids(false)
68 |         .with_thread_names(false)
69 |         .compact()
70 |         .with_timer(fmt::time::SystemTime::default());
71 | 
72 |     fmt::Subscriber::builder()
73 |         .with_env_filter(env_filter(kind))
74 |         .with_writer(writer)
75 |         .with_ansi(ansi)
76 |         .event_format(format)
77 |         .finish()
78 | }
79 | 
80 | pub fn init_supervisor() {
81 |     let subscriber = create_subscriber(Kind::Supervisor, io::stdout, istty());
82 |     tracing::subscriber::set_global_default(subscriber)
83 |         .expect("Failed to set supervisor subscriber");
84 | }
85 | 
86 | pub fn workload_subscriber(workload_dir: &impl AsRef<Path>) -> impl tracing::Subscriber {
87 |     let log_file = std::fs::File::options()
88 |         .create(true)
89 |         .append(true)
90 |         .open(workload_dir.as_ref().join("log.txt"))
91 |         .expect("Failed to create log file");
92 |     create_subscriber(Kind::Supervisor, log_file, false)
93 | }
94 | 
95 | pub fn init_agent(agent_id: &str, workload_dir: &impl AsRef<Path>) {
96 |     let log_file = std::fs::File::options()
97 |         .create(false)
98 |         .append(true)
99 |         .open(workload_dir.as_ref().join("log.txt"))
100 |         .expect("Log file is expected to be created by the supervisor");
101 |     let subscriber = create_subscriber(Kind::Agent, log_file, false);
102 | 
103 |     // Set the agent global subscriber
104 |     tracing::subscriber::set_global_default(subscriber).expect("Failed to set agent subscriber");
105 | 
106 |     let pid = std::process::id();
107 |     let span = span!(Level::INFO, "agent", agent_id, pid);
108 |     let _enter = span.enter();
109 |     // We intentionally `forget` the guard so the span remains open
110 |     // for the lifetime of the entire agent process if desired.
111 |     std::mem::forget(_enter);
112 | }
113 | 
--------------------------------------------------------------------------------
/torture/src/main.rs:
--------------------------------------------------------------------------------
1 | use anyhow::Result;
2 | use tokio::net::UnixStream;
3 | 
4 | mod agent;
5 | mod logging;
6 | mod message;
7 | mod panic;
8 | mod spawn;
9 | mod supervisor;
10 | 
11 | #[tokio::main]
12 | async fn main() -> Result<()> {
13 |     if let Some(chan) = spawn::am_spawned() {
14 |         let chan = UnixStream::from_std(chan)?;
15 |         agent::run(chan).await?;
16 |     } else {
17 |         supervisor::run().await?;
18 |     }
19 |     Ok(())
20 | }
21 | 
--------------------------------------------------------------------------------
/torture/src/panic.rs:
--------------------------------------------------------------------------------
1 | /// Panics are caught with `std::panic::catch_unwind`, which returns a `std::thread::Result`;
2 | /// the `Err` variant contains a `Box<dyn Any + Send>` error from which it is possible
3 | /// to extract an error message. These utilities help handle those panic messages.
4 | use std::any::Any;
5 | 
6 | /// Attempts to create a `String` with the given context and downcasts
7 | /// the error to look for a message within it. If no message is found,
8 | /// the `String` will contain only the context.
9 | pub fn panic_to_string(context: &str, err: Box<dyn Any + Send>) -> String {
10 |     if let Some(err) = err.downcast_ref::<&str>() {
11 |         return format!("{}: {}", context, err);
12 |     }
13 |     if let Some(err) = err.downcast_ref::<String>() {
14 |         return format!("{}: {}", context, err);
15 |     }
16 |     format!("{} (no message)", context)
17 | }
18 | 
19 | /// Creates an `anyhow::Result::Err(..)` from a context and an error
20 | /// possibly containing a message.
21 | pub fn panic_to_err<T>(context: &str, err: Box<dyn Any + Send>) -> anyhow::Result<T> {
22 |     Err(anyhow::anyhow!("{}", panic_to_string(context, err)))
23 | }
24 | 
--------------------------------------------------------------------------------
/torture/src/spawn.rs:
--------------------------------------------------------------------------------
1 | // A low-level module for spawning a child process and figuring out if we are the parent or the
2 | // child using the same binary.
3 | //
4 | // The parent spawns a child process and passes a socket to it. The socket is passed to the child
5 | // via a predefined file descriptor. The child then uses this file descriptor to communicate with
6 | // the parent.
7 | //
8 | // A process launched from the common binary can check whether it is a child by checking whether
9 | // [`CANARY_SOCKET_FD`] is valid.
10 | //
11 | // The main goal of this module is to tuck away the low-level machinery like working with libc and
12 | // nix into a single place.
13 | 
14 | use anyhow::Result;
15 | use cfg_if::cfg_if;
16 | use std::{
17 |     os::{
18 |         fd::{AsRawFd as _, FromRawFd as _, RawFd},
19 |         unix::net::UnixStream,
20 |     },
21 |     path::PathBuf,
22 |     sync::atomic::{AtomicBool, Ordering},
23 | };
24 | use tokio::process::{Child, Command};
25 | use tracing::trace;
26 | 
27 | /// A special file descriptor that is used to pass a socket to the child process.
28 | ///
29 | /// We pick a high number to avoid conflicts with other file descriptors.
30 | const CANARY_SOCKET_FD: RawFd = 1000;
31 | 
32 | /// Check whether the given file descriptor is valid.
33 | fn is_valid_fd(fd: RawFd) -> bool {
34 |     unsafe { libc::fcntl(fd, libc::F_GETFD) != -1 }
35 | }
36 | 
37 | /// Check whether the file descriptor is set to non-blocking mode.
38 | fn is_nonblocking(fd: RawFd) -> bool {
39 |     unsafe { libc::fcntl(fd, libc::F_GETFL) & libc::O_NONBLOCK == libc::O_NONBLOCK }
40 | }
41 | 
42 | /// Check if the file descriptor corresponds to a Unix domain socket.
43 | /// In our case, we're verifying that the socket type is SOCK_STREAM.
44 | fn is_unix_socket(fd: RawFd) -> bool {
45 |     let mut sock_type: libc::c_int = 0;
46 |     let mut type_len = std::mem::size_of::<libc::c_int>() as libc::socklen_t;
47 |     unsafe {
48 |         libc::getsockopt(
49 |             fd,
50 |             libc::SOL_SOCKET,
51 |             libc::SO_TYPE,
52 |             &mut sock_type as *mut _ as *mut _,
53 |             &mut type_len,
54 |         ) == 0
55 |             && sock_type == libc::SOCK_STREAM
56 |     }
57 | }
58 | 
59 | /// Checks for evidence that this process is a child of a parent process that spawned it.
60 | ///
61 | /// Returns a UnixStream if the process is a child, otherwise returns None.
62 | ///
63 | /// Panics if called more than once.
64 | pub fn am_spawned() -> Option<UnixStream> {
65 |     static CALLED: AtomicBool = AtomicBool::new(false);
66 |     if CALLED.swap(true, Ordering::SeqCst) {
67 |         // This function should not be called more than once to protect against multiple ownership
68 |         // of the file descriptor.
69 |         panic!();
70 |     }
71 | 
72 |     if !is_valid_fd(CANARY_SOCKET_FD) {
73 |         return None;
74 |     }
75 | 
76 |     if !is_unix_socket(CANARY_SOCKET_FD) {
77 |         panic!("not unix socket");
78 |     }
79 | 
80 |     if !is_nonblocking(CANARY_SOCKET_FD) {
81 |         panic!("non blocking");
82 |     }
83 | 
84 |     let stream = unsafe {
85 |         // SAFETY:
86 |         // - The file descriptor is valid (checked above with fcntl)
87 |         // - We verified it's actually a Unix domain socket (checked with getsockopt)
88 |         // - This code can only run once due to the CALLED atomic bool, ensuring we have exclusive
89 |         //   ownership, passing it down into the UnixStream instance.
90 |         // - No other code could have taken ownership as this is the first access (CALLED was false)
91 |         UnixStream::from_raw_fd(CANARY_SOCKET_FD)
92 |     };
93 |     Some(stream)
94 | }
95 | 
96 | pub fn spawn_child(workload_dir_path: PathBuf) -> Result<(Child, UnixStream)> {
97 |     let (sock1, sock2) = UnixStream::pair()?;
98 | 
99 |     // Those sockets are going to be used in tokio and as such they should both be set to
100 |     // non-blocking mode.
101 |     sock1.set_nonblocking(true)?;
102 |     sock2.set_nonblocking(true)?;
103 | 
104 |     let child = spawn_child_with_sock(sock2.as_raw_fd(), workload_dir_path)?;
105 |     drop(sock2); // Close the parent's copy of the child's end of the socket.
106 | 
107 |     Ok((child, sock1))
108 | }
109 | 
110 | fn spawn_child_with_sock(socket_fd: RawFd, workload_dir_path: PathBuf) -> Result<Child> {
111 |     trace!(?socket_fd, "Spawning child process");
112 | 
113 |     // Prepare argv for the child process.
114 |     //
115 |     // Contains only the program binary path and a null terminator.
116 |     cfg_if! {
117 |         if #[cfg(target_os = "linux")] {
118 |             // Nothing beats the simplicity of /proc/self/exe on Linux.
119 |             let program = std::ffi::OsString::from("/proc/self/exe");
120 |         } else {
121 |             let program = std::env::current_exe()?;
122 |         }
123 |     }
124 | 
125 |     let out_file = std::fs::File::options()
126 |         .create(false)
127 |         .append(true)
128 |         .open(workload_dir_path.join("log.txt"))
129 |         .expect("Log file is expected to be created by the supervisor");
130 |     let mut cmd = Command::new(program);
131 |     cmd.stdout(out_file.try_clone().unwrap());
132 |     cmd.stderr(out_file);
133 |     // Override the PGID of the spawned process. The motivation for this is ^C handling. To handle
134 |     // ^C the shell will send the SIGINT to all processes in the process group.
We are handling 135 | // SIGINT manually in the supervisor process. 136 | cmd.process_group(0); 137 | unsafe { 138 | cmd.pre_exec(move || { 139 | // Duplicate the socket_fd to the CANARY_SOCKET_FD. 140 | // Close the original socket_fd in the child process. 141 | libc::dup2(socket_fd, CANARY_SOCKET_FD); 142 | libc::close(socket_fd); 143 | Ok(()) 144 | }); 145 | } 146 | let child = cmd.spawn()?; 147 | 148 | let pid = child 149 | .id() 150 | .map(|pid| pid.to_string()) 151 | .unwrap_or_else(|| "".to_string()); 152 | trace!("spawned child process, pid={pid}"); 153 | Ok(child) 154 | } 155 | -------------------------------------------------------------------------------- /torture/src/supervisor/cli.rs: -------------------------------------------------------------------------------- 1 | use clap::{Args, Parser, Subcommand}; 2 | 3 | #[derive(Parser, Debug)] 4 | pub struct Cli { 5 | #[command(subcommand)] 6 | pub command: Commands, 7 | } 8 | 9 | #[derive(Subcommand, Debug)] 10 | pub enum Commands { 11 | /// Execute swarm testing. Multiple workloads will be executed at the same 12 | /// time, enabling and disabling different nomt features. 13 | Swarm(SwarmParams), 14 | /// Execute a single workload given a seed. 15 | Run(RunParams), 16 | } 17 | 18 | #[derive(Clone, Debug, Args)] 19 | pub struct SwarmParams { 20 | /// The maximum number of failures before the supervisor stops. 21 | /// 22 | /// If not provided, the supervisor will stop after the first failure. 23 | #[arg(short, long, default_value_t = 1)] 24 | pub flag_limit: usize, 25 | 26 | /// Folder that will be used as the working directory by the Supervisor. 27 | /// It will contain all workload folders. 28 | #[arg(long = "workdir")] 29 | pub workdir: Option, 30 | 31 | /// The maximum percentage of total disk space that torture will occupy. 32 | #[clap(value_parser=clap::value_parser!(u8).range(1..=100))] 33 | #[arg(long, default_value_t = 70)] 34 | pub max_disk: u8, 35 | 36 | /// The maximum percentage of total memory that torture will occupy. 37 | #[clap(value_parser=clap::value_parser!(u8).range(1..=100))] 38 | #[arg(long, default_value_t = 70)] 39 | pub max_memory: u8, 40 | } 41 | 42 | #[derive(Clone, Debug, Args)] 43 | pub struct RunParams { 44 | /// The 8-byte seed to use for the random number generator. 45 | pub seed: u64, 46 | 47 | /// Amount of disk space in bytes assigned to the workload. [Default: 20GiB] 48 | #[arg(short = 'd', long, default_value_t = 20 * 1024 * 1024 * 1024)] 49 | pub assigned_disk: u64, 50 | 51 | /// Amount of memory in bytes assigned to the workload. [Default: 3GiB] 52 | #[arg(short = 'm' ,long, default_value_t = 3 * 1024 * 1024 * 1024)] 53 | pub assigned_memory: u64, 54 | 55 | /// Folder that will be used as the working directory by the Supervisor. 56 | /// It will contain the folder of the workload that it is being executed. 57 | #[arg(long = "workdir")] 58 | pub workdir: Option, 59 | 60 | /// Check whether the entire state is up to date as expected. 61 | /// 62 | /// This applies after every rollback. 
63 | #[arg(long = "ensure_snapshot", default_value = "false")] 64 | pub ensure_snapshot: bool, 65 | } 66 | -------------------------------------------------------------------------------- /torture/src/supervisor/controller.rs: -------------------------------------------------------------------------------- 1 | use crate::message::{InitOutcome, OpenOutcome}; 2 | 3 | use super::{comms, config::WorkloadConfiguration}; 4 | use anyhow::Result; 5 | use std::{ 6 | path::PathBuf, 7 | sync::atomic::{AtomicBool, AtomicUsize, Ordering}, 8 | }; 9 | use tokio::{net::UnixStream, process::Child}; 10 | 11 | /// A controller is responsible for overseeing a single agent process and handle its lifecycle. 12 | pub struct SpawnedAgentController { 13 | child: Child, 14 | rr: comms::RequestResponse, 15 | torn_down: AtomicBool, 16 | agent_number: usize, 17 | } 18 | 19 | // This is a safe-guard to ensure that the [`SpawnedAgentController::teardown`] is called 20 | // properly. 21 | impl Drop for SpawnedAgentController { 22 | fn drop(&mut self) { 23 | if self.torn_down.load(Ordering::Relaxed) { 24 | // The controller was torn down properly, disarm. 25 | return; 26 | } 27 | if std::thread::panicking() { 28 | // The controller was not torn down properly, but we are panicking. 29 | eprintln!("controller was not torn down properly"); 30 | return; 31 | } 32 | panic!("controller was not torn down properly"); 33 | } 34 | } 35 | 36 | impl SpawnedAgentController { 37 | pub async fn init( 38 | &mut self, 39 | workdir: String, 40 | workload_id: u64, 41 | trickfs: bool, 42 | ) -> Result { 43 | let id = format!("agent-{}-{}", workload_id, self.agent_number); 44 | let response = self 45 | .rr 46 | .send_request(crate::message::ToAgent::Init(crate::message::InitPayload { 47 | id, 48 | workdir, 49 | trickfs, 50 | })) 51 | .await?; 52 | match response { 53 | crate::message::ToSupervisor::InitResponse(outcome) => return Ok(outcome), 54 | _ => { 55 | panic!("expected init, unexpected response: {:?}", response); 56 | } 57 | } 58 | } 59 | 60 | pub async fn open(&self, config: &WorkloadConfiguration) -> Result { 61 | let rollback = if config.is_rollback_enable() { 62 | Some(config.max_rollback_commits) 63 | } else { 64 | None 65 | }; 66 | 67 | let response = self 68 | .rr 69 | .send_request(crate::message::ToAgent::Open(crate::message::OpenPayload { 70 | bitbox_seed: config.bitbox_seed, 71 | rollback, 72 | commit_concurrency: config.commit_concurrency, 73 | io_workers: config.io_workers, 74 | hashtable_buckets: config.hashtable_buckets, 75 | warm_up: config.warm_up, 76 | preallocate_ht: config.preallocate_ht, 77 | page_cache_size: config.page_cache_size, 78 | leaf_cache_size: config.leaf_cache_size, 79 | prepopulate_page_cache: config.prepopulate_page_cache, 80 | page_cache_upper_levels: config.page_cache_upper_levels, 81 | })) 82 | .await?; 83 | match response { 84 | crate::message::ToSupervisor::OpenResponse(outcome) => return Ok(outcome), 85 | _ => { 86 | panic!("expected open, unexpected response: {:?}", response); 87 | } 88 | } 89 | } 90 | 91 | /// Kills the process, shuts down the comms, and cleans up the resources. 92 | /// 93 | /// This returns only when the process is dead and the resources are cleaned up. 94 | /// 95 | /// The controller must be torn down manually. Dropping the controller is disallowed. This is 96 | /// done to control precisely when the agent process is killed. 
97 |     pub async fn teardown(mut self) {
98 |         self.torn_down.store(true, Ordering::Relaxed);
99 |         let _ = self.child.kill().await;
100 |     }
101 | 
102 |     /// Resolves when the agent process exits.
103 |     pub async fn died(&mut self) {
104 |         let _ = self.child.wait().await;
105 |     }
106 | 
107 |     pub fn rr(&self) -> &comms::RequestResponse {
108 |         &self.rr
109 |     }
110 | 
111 |     /// Returns the PID of the agent process.
112 |     ///
113 |     /// Returns `None` if the agent is torn down.
114 |     pub fn pid(&self) -> Option<u32> {
115 |         if self.torn_down.load(Ordering::Relaxed) {
116 |             None
117 |         } else {
118 |             self.child.id()
119 |         }
120 |     }
121 | }
122 | 
123 | /// Spawns an agent process, creating a controller.
124 | ///
125 | /// The controller is placed in the `place` argument. `place` must be `None` when calling this
126 | /// function.
127 | pub async fn spawn_agent_into(
128 |     place: &mut Option<SpawnedAgentController>,
129 |     output_path: PathBuf,
130 | ) -> Result<()> {
131 |     assert!(place.is_none(), "the controller must be empty");
132 | 
133 |     let (child, sock) = crate::spawn::spawn_child(output_path)?;
134 | 
135 |     let stream = UnixStream::from_std(sock)?;
136 | 
137 |     let (rr, task) = comms::run(stream);
138 |     let _ = tokio::spawn(task);
139 | 
140 |     // Assign a unique ID to the agent.
141 |     static AGENT_COUNT: AtomicUsize = AtomicUsize::new(0);
142 |     let agent_number = AGENT_COUNT.fetch_add(1, Ordering::Relaxed);
143 | 
144 |     *place = Some(SpawnedAgentController {
145 |         agent_number,
146 |         child,
147 |         rr,
148 |         torn_down: AtomicBool::new(false),
149 |     });
150 |     Ok(())
151 | }
152 | 
--------------------------------------------------------------------------------
/torture/src/supervisor/pbt.rs:
--------------------------------------------------------------------------------
1 | //! Collection of process backtraces.
2 | //!
3 | //! This uses the grug-brain developer approach: just invoke LLDB or GDB to get the backtrace.
4 | 
5 | use futures::future::join3;
6 | use std::{path::Path, time::Duration};
7 | use tokio::{
8 |     fs,
9 |     io::{AsyncRead, AsyncReadExt as _},
10 |     process::Command,
11 |     time::timeout,
12 | };
13 | use which::which;
14 | 
15 | pub async fn collect_process_backtrace(filename: &Path, pid: u32) -> anyhow::Result<()> {
16 |     // Determine which debugger tool to use.
17 |     let command_str = if which("lldb").is_ok() {
18 |         lldb(pid)
19 |     } else if which("gdb").is_ok() {
20 |         gdb(pid)
21 |     } else {
22 |         anyhow::bail!("no lldb or gdb in PATH")
23 |     };
24 | 
25 |     // Run the debugger command using a shell.
26 |     // Spawning it via `sh -c` gives us a Child handle to wait on and kill.
27 |     let mut child = Command::new("sh")
28 |         .arg("-c")
29 |         .arg(&command_str)
30 |         .stdout(std::process::Stdio::piped())
31 |         .stderr(std::process::Stdio::piped())
32 |         .spawn()?;
33 | 
34 |     let mut stdout_pipe = child.stdout.take().expect("stdout pipe");
35 |     let mut stderr_pipe = child.stderr.take().expect("stderr pipe");
36 | 
37 |     async fn read_pipe(pipe: &mut (impl AsyncRead + Unpin)) -> anyhow::Result<String> {
38 |         let mut reader = tokio::io::BufReader::new(pipe);
39 |         let mut buf = Vec::new();
40 |         reader.read_to_end(&mut buf).await?;
41 |         let stdout = String::from_utf8(buf)?;
42 |         Ok(stdout)
43 |     }
44 | 
45 |     let stdout_fut = read_pipe(&mut stdout_pipe);
46 |     let stderr_fut = read_pipe(&mut stderr_pipe);
47 | 
48 |     let (exit_code, stdout, stderr) = match timeout(
49 |         Duration::from_secs(5),
50 |         join3(child.wait(), stdout_fut, stderr_fut),
51 |     )
52 |     .await
53 |     {
54 |         Ok(v) => v,
55 |         Err(_) => {
56 |             // Timed out.
57 |             //
58 |             // Make a best-effort attempt at killing the child process.
59 |             //
60 |             // FIXME: Ideally we kill not just the child process but the entire process group.
61 |             tokio::spawn(async move { child.kill().await });
62 |             anyhow::bail!("Debugger command timed out after 5 seconds");
63 |         }
64 |     };
65 | 
66 |     let exit_code = exit_code?;
67 |     let stderr = stderr?;
68 |     let stdout = stdout?;
69 | 
70 |     if !exit_code.success() {
71 |         anyhow::bail!("command '{}' failed: {}", command_str, stderr);
72 |     }
73 | 
74 |     // Write the backtrace into the file specified by filename.
75 |     fs::write(&filename, &stdout).await?;
76 | 
77 |     Ok(())
78 | }
79 | 
80 | /// Generate the lldb command for obtaining the backtrace.
81 | fn lldb(pid: u32) -> String {
82 |     format!(
83 |         "lldb -p {} -o \"thread backtrace all\" -o \"detach\" -o \"quit\"",
84 |         pid
85 |     )
86 | }
87 | 
88 | /// Generate the gdb command for obtaining the backtrace.
89 | fn gdb(pid: u32) -> String {
90 |     format!(
91 |         "gdb -p {} -batch -ex \"thread apply all bt\" -ex \"detach\" -ex \"quit\"",
92 |         pid
93 |     )
94 | }
95 | 
--------------------------------------------------------------------------------
/torture/src/supervisor/swarm.rs:
--------------------------------------------------------------------------------
1 | use rand::Rng;
2 | 
3 | pub enum SwarmFeatures {
4 |     /// Toggle on and off trickfs returning ENOSPC.
5 |     ///
6 |     /// Will be used only when the assigned memory is smaller than
7 |     /// `TRICKFS_MEMORY_THRESHOLD`.
8 |     TrickfsENOSPC,
9 |     /// Toggle on and off trickfs injecting latencies into every response.
10 |     ///
11 |     /// Will be used only when the assigned memory is smaller than
12 |     /// `TRICKFS_MEMORY_THRESHOLD`.
13 |     TrickfsLatencyInjection,
14 |     /// Ensure that the changeset was correctly applied.
15 |     EnsureChangeset,
16 |     /// Randomly sample the state after every crash or rollback to check the
17 |     /// correctness of the state of the database.
18 |     SampleSnapshot,
19 |     /// Whether merkle page fetches should be warmed up while sessions are ongoing.
20 |     WarmUp,
21 |     /// Whether to preallocate the hashtable file.
22 |     PreallocateHt,
23 |     /// Whether each commit should perform a bunch of reads before applying a changeset.
24 |     Read,
25 |     /// Whether rollback should be performed.
26 |     Rollback,
27 |     /// Whether rollback crash should be exercised.
28 |     RollbackCrash,
29 |     /// Whether commit crash should be exercised.
30 |     CommitCrash,
31 |     /// Whether to prepopulate the upper levels of the page cache on startup.
32 |     PrepopulatePageCache,
33 |     /// Whether new keys should be inserted during commits.
34 |     NewKeys,
35 |     /// Whether keys should be deleted during commits.
36 |     DeleteKeys,
37 |     /// Whether keys should be updated during commits.
38 |     UpdateKeys,
39 |     /// Whether inserted values should be overflow ones.
40 |     OverflowValues,
41 | }
42 | 
43 | pub fn new_features_set(rng: &mut rand_pcg::Pcg64) -> Vec<SwarmFeatures> {
44 |     let mut features = vec![
45 |         SwarmFeatures::EnsureChangeset,
46 |         SwarmFeatures::SampleSnapshot,
47 |         SwarmFeatures::WarmUp,
48 |         SwarmFeatures::PreallocateHt,
49 |         SwarmFeatures::Read,
50 |         SwarmFeatures::Rollback,
51 |         SwarmFeatures::RollbackCrash,
52 |         SwarmFeatures::CommitCrash,
53 |         SwarmFeatures::PrepopulatePageCache,
54 |         SwarmFeatures::NewKeys,
55 |         SwarmFeatures::DeleteKeys,
56 |         SwarmFeatures::UpdateKeys,
57 |         SwarmFeatures::OverflowValues,
58 |     ];
59 | 
60 |     // Feature-removal mechanism: toss a coin for almost every feature.
61 |     for idx in (0..features.len()).rev() {
62 |         if rng.gen_bool(0.5) {
63 |             features.remove(idx);
64 |         }
65 |     }
66 | 
67 |     // Trickfs-related features are treated a little differently.
68 |     // Trickfs relies entirely on memory, thus its features get exercised
69 |     // less often; in particular, they follow a biased coin toss with
70 |     // `p = 0.052` being the probability of being added to the set of features.
71 |     //
72 |     // The probability of using Trickfs is 10% (= p*p + 2 * (p * (1-p))).
73 |     let p = 0.052;
74 |     if rng.gen_bool(p) {
75 |         features.push(SwarmFeatures::TrickfsLatencyInjection);
76 |     }
77 |     if rng.gen_bool(p) {
78 |         features.push(SwarmFeatures::TrickfsENOSPC);
79 |     }
80 | 
81 |     features
82 | }
83 | 
--------------------------------------------------------------------------------
/trickfs/Cargo.toml:
--------------------------------------------------------------------------------
1 | [package]
2 | name = "trickfs"
3 | version = "0.1.0"
4 | authors.workspace = true
5 | homepage.workspace = true
6 | repository.workspace = true
7 | edition.workspace = true
8 | license.workspace = true
9 | 
10 | [dependencies]
11 | fuser.workspace = true
12 | libc.workspace = true
13 | log.workspace = true
14 | tempfile.workspace = true
15 | rand.workspace = true
16 | rand_pcg.workspace = true
17 | rand_distr.workspace = true
18 | 
19 | [dev-dependencies]
20 | env_logger.workspace = true
21 | 
--------------------------------------------------------------------------------
/trickfs/README.md:
--------------------------------------------------------------------------------
1 | # trickfs
2 | 
3 | A FUSE filesystem useful for failure injection.
4 | 
5 | # Using trickfs
6 | 
7 | Typically you would not need to run trickfs directly, because it should be used as a dependency
8 | in other projects. However, if you want to test the filesystem, you can do so by running the
9 | following command:
10 | 
11 | ```sh
12 | cargo run --release --bin trickmnt
13 | ```
14 | 
15 | # Building
16 | 
17 | Building the project requires fuse3 and fuse to be available. On Ubuntu, you can install them with
18 | the following commands:
19 | 
20 | ```sh
21 | sudo apt update
22 | sudo apt install libfuse3-dev libfuse-dev
23 | ```
24 | 
25 | On macOS you may need to install macFUSE:
26 | 
27 | ```sh
28 | brew install macfuse
29 | ```
30 | 
--------------------------------------------------------------------------------
/trickfs/src/latency.rs:
--------------------------------------------------------------------------------
1 | use std::{
2 |     collections::VecDeque,
3 |     sync::mpsc::{self, Receiver, RecvTimeoutError, Sender},
4 |     time::{Duration, Instant},
5 | };
6 | 
7 | use rand::SeedableRng;
8 | use rand_distr::Distribution;
9 | 
10 | /// Max possible delay, in micros, used as injected latency.
11 | const MAX_LATENCY_MICROS: u64 = 1000;
12 | type Reply = Box<dyn FnOnce() + Send>;
13 | 
14 | /// An injector of latencies.
15 | ///
16 | /// This allows replies to be scheduled after a certain delay.
17 | /// Delays are randomly chosen following a Pareto distribution.
18 | /// 80% of the delay will be below 20% of `MAX_LATENCY_MICROS`.
19 | pub struct LatencyInjector {
20 |     rng: rand_pcg::Pcg64,
21 |     distr: rand_distr::Pareto<f64>,
22 |     tx: Sender<(Reply, Duration)>,
23 | }
24 | 
25 | impl LatencyInjector {
26 |     pub fn new(seed: u64) -> Self {
27 |         let (tx, rx) = mpsc::channel();
28 |         let _ = std::thread::spawn(|| scheduler(rx));
29 |         Self {
30 |             rng: rand_pcg::Pcg64::seed_from_u64(seed),
31 |             distr: rand_distr::Pareto::new(1.0, 1.16).unwrap(),
32 |             tx,
33 |         }
34 |     }
35 | 
36 |     pub fn schedule_reply(&mut self, reply: Reply) {
37 |         // Shift and scale, values above 100.0 (0.05%) are clipped to MAX_LATENCY_MICROS.
38 |         let f = f64::min((self.distr.sample(&mut self.rng) - 1.0) / 100.0, 1.0);
39 |         let micros = (f * MAX_LATENCY_MICROS as f64).round() as u64;
40 |         let delay = Duration::from_micros(micros);
41 |         self.tx.send((reply, delay)).unwrap();
42 |     }
43 | }
44 | 
45 | /// Task used to execute every scheduled reply.
46 | fn scheduler(rx: Receiver<(Reply, Duration)>) {
47 |     let mut scheduled: VecDeque<(Reply, Instant)> = VecDeque::new();
48 |     loop {
49 |         let (_, deadline) = match scheduled.front() {
50 |             Some((reply, deadline)) => (reply, deadline),
51 |             None => {
52 |                 // Nothing scheduled, wait for next reply.
53 |                 match rx.recv() {
54 |                     Ok((reply, delay)) => {
55 |                         schedule_new_reply(&mut scheduled, reply, delay);
56 |                     }
57 |                     Err(_) => break,
58 |                 }
59 |                 continue;
60 |             }
61 |         };
62 | 
63 |         // Wait for a new reply to be scheduled or until we reach the deadline
64 |         // of the first reply in the queue.
65 |         let timeout = deadline.saturating_duration_since(std::time::Instant::now());
66 |         match rx.recv_timeout(timeout) {
67 |             Ok((reply, delay)) => schedule_new_reply(&mut scheduled, reply, delay),
68 |             Err(RecvTimeoutError::Timeout) => {
69 |                 let (reply, _) = scheduled.pop_front().unwrap();
70 |                 reply();
71 |             }
72 |             Err(RecvTimeoutError::Disconnected) => break,
73 |         };
74 |     }
75 | 
76 |     // Answer all pending replies.
77 |     for (reply, _) in scheduled {
78 |         reply();
79 |     }
80 | }
81 | 
82 | /// Insert the reply into the scheduled queue, which is kept ordered by deadline.
83 | fn schedule_new_reply(scheduled: &mut VecDeque<(Reply, Instant)>, reply: Reply, delay: Duration) {
84 |     let deadline = std::time::Instant::now() + delay.clone();
85 |     // If two replies happen to have the same deadline, then they will be kept in FIFO order.
86 |     let idx = match scheduled.binary_search_by_key(&deadline, |(_, d)| *d) {
87 |         Ok(idx) => idx + 1,
88 |         Err(idx) => idx,
89 |     };
90 |     scheduled.insert(idx, (reply, deadline));
91 | }
92 | 
--------------------------------------------------------------------------------
/trickfs/trickmnt/Cargo.toml:
--------------------------------------------------------------------------------
1 | [package]
2 | name = "trickmnt"
3 | version = "0.1.0"
4 | authors.workspace = true
5 | homepage.workspace = true
6 | repository.workspace = true
7 | edition.workspace = true
8 | license.workspace = true
9 | 
10 | [dependencies]
11 | trickfs = { path = ".."
} 12 | clap = { version = "4.3.5", features = ["derive"] } 13 | env_logger = "0.11.6" 14 | log = "0.4.22" 15 | anyhow = "1.0.95" 16 | -------------------------------------------------------------------------------- /trickfs/trickmnt/src/main.rs: -------------------------------------------------------------------------------- 1 | use clap::Parser; 2 | 3 | #[derive(Parser, Debug)] 4 | #[command(author, version, about, long_about = None)] 5 | struct Args { 6 | /// Path to the directory where trickfs will be mounted 7 | #[arg(short, long, default_value = "/tmp/trick")] 8 | mountpoint: String, 9 | } 10 | 11 | fn waitline() { 12 | log::info!("press return to stop..."); 13 | let _ = std::io::stdin().read_line(&mut String::new()); 14 | } 15 | 16 | fn main() -> anyhow::Result<()> { 17 | env_logger::builder() 18 | .filter_level(log::LevelFilter::Info) 19 | .init(); 20 | 21 | let args = Args::parse(); 22 | 23 | let handle = trickfs::spawn_trick(args.mountpoint, 0).unwrap(); 24 | waitline(); 25 | drop(handle); 26 | 27 | Ok(()) 28 | } 29 | --------------------------------------------------------------------------------