├── .cargo
    └── config.toml
├── .github
    └── workflows
    │   └── ci.yml
├── .gitignore
├── Cargo.lock
├── Cargo.toml
├── LICENSE
├── NEWS
├── README.md
├── benchmarks
    └── async.rs
├── bigtest
    ├── Cargo.toml
    └── src
    │   └── main.rs
├── justfile
├── rust-toolchain.toml
├── src
    ├── data_blocks
    │   ├── block.rs
    │   ├── builder.rs
    │   └── mod.rs
    ├── database.rs
    ├── disk.rs
    ├── index_blocks.rs
    ├── iterate.rs
    ├── level.rs
    ├── level_logger.rs
    ├── lib.rs
    ├── logic.rs
    ├── manifest.rs
    ├── memtable.rs
    ├── params.rs
    ├── sorted_table
    │   ├── builder.rs
    │   ├── iterator.rs
    │   ├── mod.rs
    │   └── tests.rs
    ├── tasks.rs
    ├── values
    │   ├── batch.rs
    │   ├── index.rs
    │   ├── mod.rs
    │   └── tests.rs
    ├── wal
    │   ├── mod.rs
    │   ├── reader.rs
    │   ├── tests.rs
    │   └── writer.rs
    └── write_batch.rs
├── sync
    ├── Cargo.toml
    ├── justfile
    ├── src
    │   ├── database.rs
    │   ├── iterate.rs
    │   └── lib.rs
    └── tests
    │   ├── basic.rs
    │   └── reopen.rs
└── tests
    ├── basic.rs
    └── reopen.rs


/.cargo/config.toml:
--------------------------------------------------------------------------------
1 | [net]
2 | git-fetch-with-cli = true
3 | 


--------------------------------------------------------------------------------
/.github/workflows/ci.yml:
--------------------------------------------------------------------------------
  1 | name: CI
  2 | 
  3 | on:
  4 |   push:
  5 |     branches: [ main ]
  6 |   pull_request:
  7 |     branches: [ main ]
  8 | 
  9 | env:
 10 |   CARGO_TERM_COLOR: always
 11 | 
 12 | jobs:
 13 |   test:
 14 |     runs-on: ubuntu-24.04
 15 |     steps:
 16 |     - uses: actions/checkout@v4
 17 |     - name: Install rustc and clippy nightly
 18 |       uses: dtolnay/rust-toolchain@stable
 19 |       with:
 20 |          toolchain: nightly-2025-05-26
 21 |          components: cargo, rustc, clippy, rustfmt
 22 |     - name: Install Just
 23 |       uses: taiki-e/install-action@v2
 24 |       with:
 25 |         tool: just
 26 |     - name: "Test: Async"
 27 |       run: just async-tests
 28 |       timeout-minutes: 10
 29 |     - name: "Test: Tokio-Uring"
 30 |       run: just tokio-uring-tests
 31 |     - name: "Test: Monoio"
 32 |       run: just monoio-tests
 33 |     - name: "Tests: Sync"
 34 |       run: just sync-tests
 35 |       timeout-minutes: 10
 36 |     - name: "Tests: No compression"
 37 |       run: just no-compression-tests
 38 |       timeout-minutes: 10
 39 |     - name: "Tests: Wisckey"
 40 |       run: just wisckey-tests
 41 |     - name: "Tests: Wisckey with no compression"
 42 |       run: just wisckey-no-compression-tests
 43 |       timeout-minutes: 10
 44 |     - name: "Tests: Sync Wisckey"
 45 |       run: just wisckey-sync-tests
 46 |     - name: "Test: Wisckey and Tokio-Uring"
 47 |       run: just tokio-uring-wisckey-tests
 48 |     - name: "Test: Wisckey and Monoio"
 49 |       run: just monoio-wisckey-tests
 50 |       timeout-minutes: 10
 51 |   big-test:
 52 |     runs-on: ubuntu-24.04
 53 |     steps:
 54 |     - uses: actions/checkout@v4
 55 |     - name: Install rustc and clippy nightly
 56 |       uses: dtolnay/rust-toolchain@stable
 57 |       with:
 58 |          toolchain: nightly
 59 |          components: cargo, rustc, clippy, rustfmt
 60 |     - name: Install Just
 61 |       uses: taiki-e/install-action@v2
 62 |       with:
 63 |         tool: just
 64 |     - name: Insert many entries 
 65 |       run: just bigtest-many
 66 |     - name: Insert large entries 
 67 |       run: just bigtest-large
 68 |   lint:
 69 |     runs-on: ubuntu-24.04
 70 |     steps:
 71 |     - uses: actions/checkout@v4
 72 |     - name: Install rustc and clippy nightly
 73 |       uses: dtolnay/rust-toolchain@stable
 74 |       with:
 75 |          toolchain: nightly
 76 |          components: cargo, rustc, clippy, rustfmt
 77 |     - name: Install Just
 78 |       uses: taiki-e/install-action@v2
 79 |       with:
 80 |         tool: just
 81 |     - name: "Lint Checks: Tokio (with sync FS)"
 82 |       run: just async-lint
 83 |     - name: "Lint Checks: Tokio-Uring"
 84 |       run: just tokio-uring-lint
 85 |     - name: "Lint Checks: Monoio "
 86 |       run: just monoio-lint
 87 |     - name: "Lint Checks: Synchronous API"
 88 |       run: just sync-lint
 89 |     - name: "Lint Checks: Wisckey"
 90 |       run: just wisckey-lint
 91 |     - name: "Lint Checks: Wisckey with no compression"
 92 |       run: just wisckey-no-compression-lint
 93 |     - name: "Lint Checks: Wisckey and Tokio-Uring"
 94 |       run: just tokio-uring-wisckey-lint
 95 |     - name: "Lint Checks: Wisckey and Monoio"
 96 |       run: just monoio-wisckey-lint
 97 |     - name: "Formatting Checks"
 98 |       run: just check-formatting
 99 |     - name: Check for unused dependencies
100 |       run: |
101 |          cargo install cargo-machete
102 |          cargo machete
103 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | target
2 | *.lsm
3 | 


--------------------------------------------------------------------------------
/Cargo.toml:
--------------------------------------------------------------------------------
 1 | [package]
 2 | name = "lsm"
 3 | version = "0.5.0-dev"
 4 | authors = ["Kai Mast <kai@kaimast.com>"]
 5 | edition = "2024"
 6 | repository = "https://github.com/kaimast/lsm-rs"
 7 | description = "An implementation of log-structured merge trees in pure Rust"
 8 | license = "MIT"
 9 | readme = "README.md"
10 | keywords = ["storage", "database", "async"]
11 | rust-version = "1.88"
12 | 
13 | [dependencies]
14 | lru = "0.14"
15 | parking_lot = "0.12"
16 | memmap2 = "0.9"
17 | byte-slice-cast = "1"
18 | zerocopy = { version="0.8", features=["derive"] }
19 | log = "0.4"
20 | futures = "0.3"
21 | snap = { version="1", optional=true }
22 | async-trait = "0.1"
23 | cfg-if = "1"
24 | tracing = { version="0.1", default-features=false, features=["attributes"] }
25 | csv = "1"
26 | tokio-condvar = { version="0.3", features=["parking_lot"] }
27 | tokio-uring = { version="0.5", optional=true }
28 | bloomfilter = { version="3", optional=true }
29 | monoio = { version="0.2", optional=true, features=["sync"] }
30 | kioto-uring-executor = { version="0.3.0-dev", optional=true, default-features=false, features=["macros", "tokio-uring"] }
31 | bitvec = { version="1", optional=true }
32 | 
33 | [dependencies.tokio]
34 | version="1"
35 | default-features=false
36 | features=["io-util", "sync", "macros", "tracing", "time"]
37 | 
38 | [dev-dependencies]
39 | clap = { version="4", features=["derive"] }
40 | env_logger = "0.11"
41 | tempfile = "3"
42 | tracing-tracy = "0.11"
43 | tokio = { version="1", default-features=false, features=["rt-multi-thread"] }
44 | tracing-subscriber = { version="0.3", default-features=false }
45 | rand = "0.9"
46 | 
47 | [lib]
48 | path = "src/lib.rs"
49 | 
50 | [features]
51 | default = ["snappy-compression", "bloom-filters"]
52 | monoio = ["dep:monoio", "_async-io"]
53 | snappy-compression = ["dep:snap"]
54 | wisckey = ["dep:bitvec"]
55 | bloom-filters = ["dep:bloomfilter"]
56 | tokio-uring = ["dep:tokio-uring", "dep:kioto-uring-executor", "_async-io"]
57 | _async-io = []
58 | 
59 | [[test]]
60 | name = "basic"
61 | path = "tests/basic.rs"
62 | 
63 | [[test]]
64 | name = "reopen"
65 | path = "tests/reopen.rs"
66 | 
67 | [[example]]
68 | name = "lsm-benchmark"
69 | path = "benchmarks/async.rs"
70 | 
71 | [workspace]
72 | members = ["sync", "bigtest"]
73 | 
74 | [patch.crates-io]
75 | kioto-uring-executor = { git = "https://github.com/kaimast/kioto-uring-executor" }
76 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2024 Kai Mast
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/NEWS:
--------------------------------------------------------------------------------
 1 | UNRELEASED (0.5):
 2 |     - Bump MSRV to 1.88
 3 |     - More extensive testing
 4 |     - Add support for monoio
 5 |     - Codebase moved from Rust 2021 to 2024
 6 |     - Updated bloomfilter to 3
 7 | 
 8 | 0.4.1:
 9 |     - Fix an error during compaction
10 |     - Ensure everything has been written to the write-ahead log before shutting down
11 | 
12 | 0.4.0:
13 |     - Move sync API into a separate lsm-sync crate
14 |     - Removed KvTrait. The crate now only accepts and returns bytes
15 |     - Get operations now return a reference to the data without copying
16 |     - Leverage zerocopy wherever possible to reduce serialization cost
17 |     - Update tokio-uring and kioto-uring-executor dependencies
18 | 
19 | 0.3.0:
20 |     - Write-Ahead logging moved to a dedicated thread (or async task)
21 |     - Support for io_uring
22 |     - Allow iterating in reverse order
23 |     - Add bloom filter support
24 |     - Various performance improvements
25 |     - Use tokio-condvar in more places
26 | 
27 | 0.2.0:
28 |     - Support for tracing to benchmark the library
29 |     - Removed custom Condition Variable implementation
30 |     - Databases can be reopened
31 |     - WiscKey now uses a more efficient "folding"-based implementation
32 |     - Allow disabling key-value separation using the "wisckey" feature flag
33 |     - Implemented proper garbage collection for the value log
34 |     - The Write-Ahead-Log is properly truncated after writes are flushed to L0
35 | 
36 | 0.1.0:
37 |     - Basic key-value store functionality
38 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # Modular, Asynchronous Implementation of a Log-Structured Merge Tree
  2 | 
  3 | [![ci-badge](https://github.com/kaimast/lsm-rs/actions/workflows/ci.yml/badge.svg)](https://github.com/kaimast/lsm-rs/actions)
  4 | [![license-badge](https://img.shields.io/crates/l/lsm)](https://github.com/kaimast/lsm-rs/blob/main/LICENSE)
  5 | [![crates-badge](https://img.shields.io/crates/v/lsm)](https://crates.io/crates/lsm)
  6 | 
  7 | **Note: While this implementation is used by us and has not caused major problems, we do not yet recommend it for production environments.**
  8 |  Please use the [leveldb](https://github.com/skade/leveldb) or [rocksdb](https://github.com/rust-rocksdb/rust-rocksdb) crate for this purpose.
  9 | 
 10 | This implementation does *not* aim to reimplement LevelDB. The major differences are:
 11 | * *Separation of keys and values*: Values can be stored separately to increase compaction speed as outlined in the [WiscKey](https://www.usenix.org/system/files/conference/fast16/fast16-papers-lu.pdf) paper
 12 | * *Concurrent compaction*: Multiple threads can compact at the same time for higher write throughput
 13 | * *Async-support*: All API calls are exposed as async functions
 14 | * *io_uring-support*: For async file system access on Linux. Optional and still considered experimental.
 15 | * *Bloom filters* for faster lookups
 16 | 
 17 | ## Supported Platforms and Architectures
 18 | Currently, the code is only tested with Linux on x86 machines, but it should run on most systems supported by the Rust compiler.
 19 | 
 20 | ## On-Disk Format
 21 | LSM stores data using [zerocopy](https://github.com/google/zerocopy) to achieve high performance.
 22 | The implementation does not account for endianness so on-disk formats are not portable.
 23 | Replication across machines should be handled at a different layer of the system.
 24 | However, we may add a converter tool in the future or an `endianess` feature flag if needed.
 25 | 
 26 | ## Planned Features
 27 | * FLSM: Like [PebblesDB](https://github.com/utsaslab/pebblesdb) LSM-rs will fragment the keyspace to reduce write amplification and increase compaction speed
 28 | * Custom sorting functions
 29 | * More modularity and configuration options
 30 | 
 31 | ## Feature Flags
 32 | * `snappy-compression`: Use the [snappy format](https://docs.rs/snap/1.0.5/snap/) to compress data on disk *(enabled by default)*
 33 | * `bloom-filters`: Add bloom filters to data blocks for more efficient searching. *(enabled by default)*
 34 | * `monoio`: Use `monoio` as the async I/O runtime instead of `tokio`. Note, this will not spawn any additional OS threads. *(disabled by default)*
 35 | * `tokio-uring`: Use `tokio_uring` as async runtime (using `kioto-uring-executor`) instead of regular `tokio`. *(disabled by default)*
 36 | * `wisckey`: Store keys and values separately. This usually results in higher throughput with slightly higher CPU-usage. *(disabled by default)*
 37 | 
 38 | ## Synchronous API
 39 | This crate exposes an async API intended to be used with Tokio or a similar runtime.
 40 | Alternatively, you can use the lsm-sync crate included in this repo, which internally uses Tokio but exposes a synchronous API.
 41 | 
 42 | ## Sort Order
 43 | You need to serialize your data in a way that its byte representation maintains the same ordering as the unserialized data.
 44 | For example, you may want to use [big endian](https://en.wikipedia.org/wiki/Endianness) encoding so that numerical values are ordered correctly.
 45 | 
 46 | ## Usage
 47 | 
 48 | You can create or open a new database instance as shown below.
 49 | ```rust
 50 | use lsm::{Database, Params};
 51 | 
 52 | // Set options here, such as the location of the database files
 53 | let params = Params {
 54 |     db_path,
 55 |     ..Default::default()
 56 | };
 57 | 
 58 | // Instantiate database
 59 | let database = Database::new_with_params(SM, params)
 60 |     .await
 61 |     .expect("Failed to create database instance");
 62 | ```
 63 | 
 64 | To write to the database use the `put` call. Note that the crate only supports
 65 | writing byte vectors. (De-)serialization is supposed to happen at another layer.
 66 | ```rust
 67 | let key = String::from("mykey").into_bytes();
 68 | let value = String::from("hello world").into_bytes();
 69 | 
 70 | database.put(key, value).await.expect("Writing to database failed");
 71 | ```
 72 | 
 73 | When reading, LSM will return a reference to the data to avoid copying.
 74 | ```rust
 75 | let value_ref = database.get(&key).await.expect("Reading failed").expect("Key not found");
 76 | 
 77 | // Returns a slice to the data
 78 | let data: &[u8] = value_ref.get_value();
 79 | 
 80 | // Assuming the put from above worked, this will print "hello world"
 81 | println!("{}", std::str::from_utf8(data).unwrap());
 82 | ```
 83 | 
 84 | Please refer to the tests for more examples of how to use the crate.
 85 | 
 86 | ## Tests
 87 | This library ships with several tests. We provide a [justfile](https://github.com/casey/just) for convenience:
 88 | 
 89 | ```sh
 90 | just test #runs all tests for all configurations
 91 | just lint #runs cargo clippy
 92 | ```
 93 | 
 94 | ## Notes on io-uring
 95 | Currently, the io-uring feature relies on [kioto-uring-executor](https://github.com/kaimast/kioto-uring-executor), a simplistic multi-threaded wrapper around `tokio-uring`.
 96 | Eventually `tokio-uring` will [support multiple threads natively](https://github.com/tokio-rs/tokio-uring/issues/258) and this workaround will be removed.
 97 | 
 98 | I would also like to add support for more mature io_uring runtimes such as [glommio](https://github.com/DataDog/glommio) but only have limited time to work on this crate. Help is very welcome.
 99 | 
100 | ## Similar Crates
101 | This is an incomplete list of crates that provide similar functionality. Please reach out if you know of others to add.
102 | 
103 | ### LSM trees
104 | * [rust-rocksdb](https://github.com/rust-rocksdb/rust-rocksdb): Rust bindings for RocksDB
105 | * [leveldb](https://github.com/skade/leveldb): Rust bindings for LevelDB
106 | * [wickdb](https://github.com/Fullstop000/wickdb): Rust re-implementation of vanilla LevelDB
107 | * [agatedb](https://github.com/tikv/agatedb): A WiscKey implementation in Rust for TiKV
108 | 
109 | ### Other Key-Value Stores
110 | These differ significantly in their approach but also provide a key-value store abstraction
111 | * [redb](https://github.com/cberner/redb)
112 | 
113 | 


--------------------------------------------------------------------------------
/benchmarks/async.rs:
--------------------------------------------------------------------------------
 1 | use clap::Parser;
 2 | 
 3 | use tempfile::{Builder, TempDir};
 4 | 
 5 | use tracing_subscriber::prelude::*;
 6 | use tracing_tracy::TracyLayer;
 7 | 
 8 | use lsm::{Database, Params, StartMode, WriteOptions};
 9 | 
10 | #[derive(Parser)] // Command-line options of the async benchmark (plain `//` comments on purpose: clap derive would turn `///` into help text)
11 | #[clap(author, version, about, long_about = None)]
12 | struct Args {
13 |     #[clap(long)]
14 |     enable_tracing: bool, // when set, bench_init installs a Tracy tracing layer
15 |     #[clap(long)]
16 |     log_level_stats: Option<String>, // forwarded unchanged to `Params::log_level_stats`
17 |     #[clap(long, default_value = "100000")]
18 |     num_entries: usize, // number of entries that main writes and then reads back
19 | }
20 | /// Sets up optional tracing and logging, then creates a database inside a new temporary directory.
21 | async fn bench_init(args: &Args) -> (TempDir, Database) {
22 |     if args.enable_tracing {
23 |         tracing_subscriber::registry()
24 |             .with(TracyLayer::default())
25 |             .init();
26 |     }
27 | 
28 |     let _ = env_logger::builder().is_test(true).try_init(); // the result of try_init is deliberately discarded
29 |     let tmp_dir = Builder::new()
30 |         .prefix("lsm-async-benchmark-")
31 |         .tempdir()
32 |         .unwrap();
33 | 
34 |     let mut db_path = tmp_dir.path().to_path_buf(); // the database lives at <tmp_dir>/storage.lsm
35 |     db_path.push("storage.lsm");
36 | 
37 |     let params = Params {
38 |         db_path,
39 |         log_level_stats: args.log_level_stats.clone(),
40 |         ..Default::default() // every other parameter keeps its default value
41 |     };
42 |     const SM: StartMode = StartMode::CreateOrOverride;
43 | 
44 |     let database = Database::new_with_params(SM, params)
45 |         .await
46 |         .expect("Failed to create database instance");
47 | 
48 |     (tmp_dir, database) // the TempDir handle is handed to the caller together with the database
49 | }
50 | /// Writes `num_entries` key-value pairs, then reads every key back and asserts the stored value.
51 | #[cfg_attr(feature = "tokio-uring", kioto_uring_executor::main)] // the entry-point macro is chosen by feature flag
52 | #[cfg_attr(feature = "monoio", monoio::main)]
53 | #[cfg_attr(not(feature = "_async-io"), tokio::main)] // used when `_async-io` is not enabled
54 | async fn main() {
55 |     let args = Args::parse();
56 | 
57 |     let (_tmpdir, database) = bench_init(&args).await; // `_tmpdir` is a named binding, so the handle lives until main returns
58 | 
59 |     log::info!("Starting read/write benchmark");
60 | 
61 |     let options = WriteOptions { sync: false };
62 | 
63 |     log::debug!("Writing {} entries", args.num_entries);
64 | 
65 |     for pos in 0..args.num_entries {
66 |         let key = format!("{pos}").into_bytes(); // keys are the decimal loop index as bytes
67 |         let value = format!("some_string_{pos}").into_bytes();
68 |         database.put_opts(key, value, &options).await.unwrap();
69 |     }
70 | 
71 |     log::debug!("Reading {} entries", args.num_entries);
72 | 
73 |     for pos in 0..args.num_entries {
74 |         let key = format!("{pos}").into_bytes();
75 |         let expected = format!("some_string_{pos}").into_bytes(); // same value the write loop stored for this key
76 | 
77 |         assert_eq!(
78 |             database.get(&key).await.unwrap().unwrap().get_value(), // first unwrap: the Result, second: the Option
79 |             expected,
80 |         );
81 |     }
82 | 
83 |     database.stop().await.unwrap();
84 |     log::info!("Done");
85 | }
86 | 


--------------------------------------------------------------------------------
/bigtest/Cargo.toml:
--------------------------------------------------------------------------------
 1 | [package]
 2 | name = "lsm-bigtest"
 3 | version = "0.5.0-dev"
 4 | edition = "2024"
 5 | authors = ["Kai Mast <kai@kaimast.com>"]
 6 | license = "MIT"
 7 | description = "Runs a longer test with lots of data"
 8 | readme = "../README.md"
 9 | 
10 | [dependencies]
11 | kioto-uring-executor = { version="0.3.0-dev", default-features=false, features=["macros"] }
12 | clap = { version="4", features=["derive"] }
13 | env_logger = "0.11"
14 | tempfile = "3"
15 | rand = "0.8"
16 | 
17 | [dependencies.lsm]
18 | path = ".."
19 | features = ["tokio-uring"]
20 | 


--------------------------------------------------------------------------------
/bigtest/src/main.rs:
--------------------------------------------------------------------------------
  1 | use std::sync::Arc;
  2 | 
  3 | use clap::Parser;
  4 | use rand::Rng;
  5 | 
  6 | use lsm::{Database, Params, StartMode};
  7 | 
  8 | #[derive(Parser)] // Command-line options of the big test (plain `//` comments on purpose: clap derive would turn `///` into help text)
  9 | struct Args {
 10 |     #[clap(long, short = 'n', default_value_t = 100_000)]
 11 |     #[clap(help = "The number of insertions per thread")]
 12 |     num_insertions: usize,
 13 | 
 14 |     #[clap(long, short = 't', default_value_t = 10)]
 15 |     num_threads: usize, // number of insertion tasks that main spawns
 16 | 
 17 |     #[clap(long, default_value_t = 1_000_000)]
 18 |     key_range: usize, // key indices are sampled from 0..key_range
 19 | 
 20 |     #[clap(long, default_value_t = 1024)]
 21 |     entry_size: usize, // length of each randomly filled value
 22 | 
 23 |     #[clap(long, default_value = "/tmp")]
 24 |     #[clap(
 25 |         help = "Where to create the temporary working directory? Note, this is the parent directory of the directory not the directory itself.
 26 | It is recommended to use a tmpfs to not wear out a physical disk"
 27 |     )]
 28 |     workdir_location: String,
 29 | }
 30 | /// Stress test: creates an empty database and fills it from several concurrently running insertion tasks.
 31 | #[kioto_uring_executor::main]
 32 | async fn main() {
 33 |     env_logger::init();
 34 | 
 35 |     let args = Args::parse();
 36 | 
 37 |     if args.num_insertions == 0 {
 38 |         panic!("Need to insert at least one entry");
 39 |     }
 40 | 
 41 |     if args.key_range == 0 { // the tasks below sample key indices from 0..key_range
 42 |         panic!("Key range cannot be zero");
 43 |     }
 44 | 
 45 |     println!("Creating working directory and empty database");
 46 |     let tmp_dir = tempfile::Builder::new()
 47 |         .prefix("lsm-bigtest-")
 48 |         .tempdir_in(args.workdir_location)
 49 |         .expect("Failed to create working directory");
 50 | 
 51 |     let mut db_path = tmp_dir.path().to_path_buf(); // the database lives at <tmp_dir>/storage.lsm
 52 |     db_path.push("storage.lsm");
 53 | 
 54 |     let params = Params {
 55 |         db_path,
 56 |         ..Default::default()
 57 |     };
 58 | 
 59 |     let database = Arc::new( // shared handle; every task receives its own clone of the Arc
 60 |         Database::new_with_params(StartMode::CreateOrOverride, params)
 61 |             .await
 62 |             .expect("Failed to create database instance"),
 63 |     );
 64 | 
 65 |     println!(
 66 |         "Inserting a total of {} entries of size {} across {} threads",
 67 |         args.num_insertions * args.num_threads,
 68 |         args.entry_size,
 69 |         args.num_threads
 70 |     );
 71 | 
 72 |     let tasks: Vec<_> = (1..=args.num_threads) // indices start at 1; they only label the progress output
 73 |         .map(|idx| {
 74 |             let database = database.clone();
 75 |             kioto_uring_executor::spawn_with(move || {
 76 |                 let mut rng = rand::thread_rng(); // created inside the spawned closure, i.e. once per task
 77 |                 Box::pin(async move {
 78 |                     for count in 1..=args.num_insertions {
 79 |                         let key_idx = rng.gen_range(0..args.key_range);
 80 |                         let key = format!("key{key_idx}").as_bytes().to_vec();
 81 | 
 82 |                         let mut value = vec![0; args.entry_size];
 83 |                         rng.fill(value.as_mut_slice()); // overwrite the zeroed buffer with random data
 84 | 
 85 |                         database.put(key, value).await.expect("Insert failed");
 86 | 
 87 |                         if count % 10_000 == 0 { // progress report every 10,000 insertions
 88 |                             println!(
 89 |                                 "Thread #{idx} inserted {count} entries so far ({}%)",
 90 |                                 (count as f64) * 100.0 / (args.num_insertions as f64)
 91 |                             );
 92 |                         }
 93 |                     }
 94 |                     println!("Thread #{idx} is done");
 95 |                 })
 96 |             })
 97 |         })
 98 |         .collect();
 99 | 
100 |     for task in tasks { // wait for every insertion task before returning
101 |         task.join().await;
102 |     }
103 | }
104 | 


--------------------------------------------------------------------------------
/justfile:
--------------------------------------------------------------------------------
  1 | LOG_LEVEL := "debug"
  2 | 
  3 | # Common prefix for lints
  4 | CLIPPY := "cargo clippy --no-default-features --tests"
  5 | 
  6 | all: tests lint
  7 | 
  8 | tests: sync-tests async-tests no-compression-tests \
  9 |        tokio-uring-tests wisckey-tests \
 10 |        wisckey-no-compression-tests wisckey-sync-tests \
 11 |        monoio-tests monoio-wisckey-tests
 12 | 
 13 | sync-tests:
 14 |     cd sync && just default-tests
 15 | 
 16 | async-tests:
 17 |     env RUST_BACKTRACE=1 RUST_LOG={{LOG_LEVEL}} cargo test --no-default-features
 18 | 
 19 | tokio-uring-tests:
 20 |     env RUST_BACKTRACE=1 RUST_LOG={{LOG_LEVEL}} cargo test --no-default-features --features=tokio-uring,bloom-filters -- --test-threads=1
 21 | 
 22 | monoio-tests:
 23 |     env RUST_BACKTRACE=1 RUST_LOG={{LOG_LEVEL}} cargo test --no-default-features --features=monoio,bloom-filters -- --test-threads=1
 24 | 
 25 | monoio-wisckey-tests:
 26 |     env RUST_BACKTRACE=1 RUST_LOG={{LOG_LEVEL}} cargo test --no-default-features --features=monoio,wisckey,bloom-filters -- --test-threads=1
 27 | 
 28 | tokio-uring-wisckey-tests:
 29 |     env RUST_BACKTRACE=1 RUST_LOG={{LOG_LEVEL}} cargo test --no-default-features --features=tokio-uring,wisckey,bloom-filters -- --test-threads=1
 30 | 
 31 | no-compression-tests:
 32 |     env RUST_BACKTRACE=1 RUST_LOG={{LOG_LEVEL}} cargo test --no-default-features
 33 | 
 34 | wisckey-tests:
 35 |     env RUST_BACKTRACE=1 RUST_LOG={{LOG_LEVEL}} cargo test --no-default-features --features=snappy-compression,wisckey
 36 | 
 37 | wisckey-no-compression-tests:
 38 |     env RUST_BACKTRACE=1 RUST_LOG={{LOG_LEVEL}} cargo test --no-default-features --features=wisckey
 39 | 
 40 | wisckey-sync-tests:
 41 |     cd sync && just wisckey-tests
 42 | 
 43 | lint: sync-lint async-lint wisckey-lint \
 44 |       wisckey-no-compression-lint tokio-uring-lint \
 45 |       tokio-uring-wisckey-lint monoio-lint monoio-wisckey-lint \
 46 |       bigtest-lint
 47 | 
 48 | fix-formatting:
 49 |     cargo fmt
 50 |     cd sync && just fix-formatting
 51 |     cd bigtest && cargo fmt
 52 | 
 53 | check-formatting:
 54 |     cargo fmt --check
 55 |     cd sync && just check-formatting
 56 | 
 57 | clean:
 58 |     rm -rf target/
 59 | 
 60 | update-dependencies:
 61 |     cargo update
 62 |     cd sync && cargo update
 63 | 
 64 | udeps:
 65 |     cargo udeps --all-targets --release
 66 |     cd sync && just udeps
 67 | 
 68 | sync-lint:
 69 |     cd sync && just lint
 70 | 
 71 | async-lint:
 72 |     {{CLIPPY}} -- -D warnings
 73 | 
 74 | tokio-uring-lint:
 75 |     {{CLIPPY}} --features=tokio-uring,bloom-filters -- -D warnings
 76 | 
 77 | monoio-lint:
 78 |     {{CLIPPY}} --features=monoio,bloom-filters -- -D warnings
 79 | 
 80 | monoio-wisckey-lint:
 81 |     {{CLIPPY}} --features=monoio,wisckey,bloom-filters -- -D warnings
 82 | 
 83 | wisckey-lint:
 84 |     {{CLIPPY}} --features=snappy-compression,wisckey -- -D warnings
 85 | 
 86 | wisckey-no-compression-lint:
 87 |     {{CLIPPY}} --features=wisckey -- -D warnings
 88 | 
 89 | tokio-uring-wisckey-lint:
 90 |     {{CLIPPY}} --features=tokio-uring,snappy-compression,wisckey -- -D warnings
 91 | 
 92 | bigtest-lint:
 93 |     {{CLIPPY}} --package=lsm-bigtest
 94 | 
 95 | bigtest-many:
 96 |     cargo run --release --package=lsm-bigtest -- -n100000 --entry-size=1024
 97 | 
 98 | bigtest-large:
 99 |     cargo run --release --package=lsm-bigtest -- -n100 --entry-size=100000
100 | 
101 | 


--------------------------------------------------------------------------------
/rust-toolchain.toml:
--------------------------------------------------------------------------------
1 | [toolchain]
2 | channel="nightly-2025-05-26"
3 | components=["cargo", "rustc", "clippy", "rustfmt"]
4 | 


--------------------------------------------------------------------------------
/src/data_blocks/block.rs:
--------------------------------------------------------------------------------
  1 | use std::cmp::Ordering;
  2 | use std::sync::Arc;
  3 | 
  4 | use crate::Key;
  5 | 
  6 | use super::{DataEntry, SearchResult};
  7 | 
  8 | use zerocopy::{FromBytes, Immutable, IntoBytes, KnownLayout};
  9 | 
 10 | #[cfg(feature = "bloom-filters")]
 11 | use bloomfilter::Bloom;
 12 | 
 13 | #[cfg(feature = "wisckey")]
 14 | use crate::values::{ValueBatchId, ValueOffset};
 15 | 
 16 | #[cfg(feature = "bloom-filters")]
 17 | // TODO: change the size of this depending on max_key_block_length
 18 | pub(super) const BLOOM_LENGTH: usize = 1024; // together with BLOOM_HEADER_SIZE this sizes the bloom filter array in DataBlockHeader
 19 | 
 20 | #[cfg(feature = "bloom-filters")]
 21 | pub(super) const BLOOM_ITEM_COUNT: usize = 1024; // NOTE(review): presumably the expected number of items per filter; not used in this file's visible part, confirm in the builder
 22 | 
 23 | #[cfg(feature = "bloom-filters")]
 24 | /// Taken from https://github.com/jedisct1/rust-bloom-filter/blob/6b93b922be474998514b696dc84333d6c04ed991/src/bitmap.rs#L5
 25 | pub(super) const BLOOM_HEADER_SIZE: usize = 1 + 8 + 4 + 32; // = 45 bytes
 26 | 
 27 | /**
 28 |  * Layout of a data block on disk
 29 |  * (items 1-3 are the fields of this fixed-size, packed header)
 30 |  * 1. 4 bytes marking where the restart list starts
 31 |  * 2. 4 bytes indicating the number of entries in this block
 32 |  * 3. BLOOM_LENGTH + BLOOM_HEADER_SIZE (1024 + 45) bytes for the bloom filter (if enabled)
 33 |  * 4. Sequence of variable-length entries
 34 |  * 5. Variable length restart list (each entry is 4 bytes; so we don't need length information)
 35 |  */
 36 | #[derive(IntoBytes, Immutable, KnownLayout, FromBytes)]
 37 | #[repr(C, packed)]
 38 | pub(super) struct DataBlockHeader {
 39 |     pub(super) restart_list_start: u32,
 40 |     pub(super) number_of_entries: u32,
 41 |     #[cfg(feature = "bloom-filters")]
 42 |     pub(super) bloom_filter: [u8; BLOOM_LENGTH + BLOOM_HEADER_SIZE], // serialized filter; parsed with Bloom::from_bytes in DataBlock::new_from_data
 43 | }
 44 | 
 45 | /**
 46 |  * For WiscKey an entry contains:
 47 |  * (header fields are listed in declaration order; the struct is `repr(C, packed)`)
 48 |  * Header:
 49 |  *  - Key prefix len (4 bytes)
 50 |  *  - Key suffix len (4 bytes)
 51 |  *  - Entry type (1 byte)
 52 |  *  - Sequence number (8 bytes)
 53 |  *  - Value reference (batch id and offset)
 54 |  *
 55 |  * Content (not part of the header):
 56 |  *  - Variable length key suffix
 57 |  *
 58 |  * When not using WiscKey an entry is variable length and contains the following
 59 |  *
 60 |  * Header:
 61 |  *  - Key prefix len (4 bytes)
 62 |  *  - Key suffix len (4 bytes)
 63 |  *  - Entry type (1 byte)
 64 |  *  - Sequence number (8 bytes)
 65 |  *  - Value length (8 bytes)
 66 |  *
 67 |  * Content (not part of the header):
 68 |  *  - Variable length key suffix
 69 |  *  - Variable length value
 70 |  */
 71 | #[derive(IntoBytes, Immutable, FromBytes, KnownLayout)]
 72 | #[repr(C, packed)]
 73 | pub(super) struct EntryHeader {
 74 |     pub(super) prefix_len: u32,
 75 |     pub(super) suffix_len: u32,
 76 |     pub(super) entry_type: u8,
 77 |     pub(super) seq_number: u64,
 78 |     #[cfg(feature = "wisckey")]
 79 |     pub(super) value_batch: ValueBatchId,
 80 |     #[cfg(feature = "wisckey")]
 81 |     pub(super) value_offset: ValueOffset,
 82 |     #[cfg(not(feature = "wisckey"))]
 83 |     pub(super) value_length: u64,
 84 | }
 85 | 
 86 | //TODO support data block layouts without prefixed keys
 87 | pub struct DataBlock {
 88 |     pub(super) restart_list_start: usize, // byte offset in `data` where the restart list begins
 89 |     pub(super) num_entries: u32,
 90 |     pub(super) restart_interval: u32, // a restart point is stored every this many entries
 91 |     pub(super) data: Vec<u8>, // the whole serialized block, header and restart list included
 92 |     #[cfg(feature = "bloom-filters")]
 93 |     pub(super) bloom_filter: Bloom<[u8]>,
 94 | }
 95 | 
 96 | impl DataBlock {
 97 |     pub fn new_from_data(data: Vec<u8>, restart_interval: u32) -> Self {
 98 |         assert!(!data.is_empty(), "No data?");
 99 | 
100 |         // The fixed-size header sits at the very start of the buffer
101 |         let header_bytes = &data[..Self::header_length()];
102 |         let header = DataBlockHeader::ref_from_bytes(header_bytes).unwrap();
103 |         #[cfg(feature = "bloom-filters")]
104 |         let bloom_filter = Bloom::from_bytes(header.bloom_filter.as_slice().to_vec())
105 |             .expect("Failed to load bloom filter");
106 | 
107 |         log::trace!("Created new data block from existing data");
108 |         Self {
109 |             restart_list_start: header.restart_list_start as usize,
110 |             num_entries: header.number_of_entries,
111 |             restart_interval,
112 |             data,
113 |             #[cfg(feature = "bloom-filters")]
114 |             bloom_filter,
115 |         }
116 |     }
117 | 
118 |     fn header_length() -> usize {
119 |         std::mem::size_of::<DataBlockHeader>() // size in bytes of the packed block header
120 |     }
121 | 
122 |     /// Decode the entry starting `offset` bytes after the block header (must be valid!)
123 |     /// Returns the full key, rebuilt with the help of `previous_key`, and the entry itself
124 |     #[tracing::instrument(skip(self_ptr, previous_key))]
125 |     pub fn get_entry_at_offset(
126 |         self_ptr: Arc<DataBlock>,
127 |         offset: u32,
128 |         previous_key: &[u8],
129 |     ) -> (Key, DataEntry) {
130 |         let header_len = std::mem::size_of::<EntryHeader>();
131 | 
132 |         // From here on `offset` is absolute, i.e. it also covers the block header
133 |         let offset = (offset as usize) + Self::header_length();
134 |         let suffix_start = offset + header_len;
135 | 
136 |         if suffix_start > self_ptr.restart_list_start {
137 |             panic!("Invalid offset {offset}");
138 |         }
139 | 
140 |         let header = EntryHeader::ref_from_bytes(&self_ptr.data[offset..suffix_start])
141 |             .expect("Failed to read entry header");
142 | 
143 |         let prefix_len = header.prefix_len as usize;
144 |         let suffix_end = suffix_start + (header.suffix_len as usize);
145 | 
146 |         // The key is the prefix shared with the previous key plus the stored suffix
147 |         let shared_prefix = &previous_key[..prefix_len];
148 |         let suffix = &self_ptr.data[suffix_start..suffix_end];
149 |         let kdata = [shared_prefix, suffix].concat();
150 | 
151 |         // Without WiscKey the value is stored inline, directly after the key suffix
152 |         let entry_end = suffix_end;
153 |         #[cfg(not(feature = "wisckey"))]
154 |         let entry_end = entry_end + (header.value_length as usize);
155 | 
156 |         // Report the end of this entry relative to the block header again
157 |         let next_offset = (entry_end - Self::header_length()) as u32;
158 | 
159 |         let entry = DataEntry {
160 |             block: self_ptr,
161 |             offset,
162 |             next_offset,
163 |         };
164 | 
165 |         (kdata, entry)
166 |     }
167 | 
168 |     /// Number of entries stored in this data block (as recorded in the block header)
169 |     pub fn get_num_entries(&self) -> u32 {
170 |         self.num_entries
171 |     }
172 | 
173 |     /// Get the entry at the specified index
174 |     /// (the index is in entries not bytes)
175 |     #[tracing::instrument(skip(self_ptr))]
176 |     pub fn get_entry_at_index(self_ptr: &Arc<Self>, index: u32) -> (Key, DataEntry) {
177 |         // Decoding has to begin at a restart point, where the key is stored in full;
178 |         // `index / restart_interval` selects the closest one at or before `index`
179 |         let restart_pos = index / self_ptr.restart_interval;
180 |         let first_idx = restart_pos * self_ptr.restart_interval;
181 | 
182 |         let restart_offset = self_ptr.get_restart_offset(restart_pos);
183 |         let mut current = Self::get_entry_at_offset(self_ptr.clone(), restart_offset, &[]);
184 | 
185 |         // Walk forward entry by entry, rebuilding each key from its predecessor
186 |         for _ in first_idx..index {
187 |             let (key, entry) = &current;
188 |             current = Self::get_entry_at_offset(self_ptr.clone(), entry.get_next_offset(), key);
189 |         }
190 | 
191 |         current
192 |     }
193 | 
194 |     /// Size in bytes of the entry section, i.e. the block minus header and restart list
195 |     pub fn byte_len(&self) -> u32 {
196 |         let total_len = self.data.len();
197 |         let restart_list_bytes = total_len - self.restart_list_start;
198 |         (total_len - Self::header_length() - restart_list_bytes) as u32
199 |     }
200 | 
201 |     #[inline(always)]
202 |     fn restart_list_len(&self) -> usize {
203 |         let offset_len = std::mem::size_of::<u32>();
204 |         let rl_len = self.data.len() - self.restart_list_start;
205 |         // The list consists of whole u32 offsets only; the result is the number of restart points
206 |         assert!(rl_len % offset_len == 0);
207 |         rl_len / offset_len
208 |     }
209 | 
210 |     /// Get the byte offset of a restart entry (relative to the end of the block header)
211 |     #[inline(always)]
212 |     fn get_restart_offset(&self, pos: u32) -> u32 {
213 |         let offset_len = std::mem::size_of::<u32>();
214 |         let slot_start = self.restart_list_start + (pos as usize) * offset_len;
215 |         let slot = &self.data[slot_start..slot_start + offset_len];
216 |         // The stored value counts from the start of the block, so the header is subtracted
217 |         u32::read_from_bytes(slot).unwrap() - Self::header_length() as u32
218 |     }
219 | 
220 |     #[tracing::instrument(skip(self_ptr, key))]
221 |     fn binary_search(self_ptr: &Arc<Self>, key: &[u8]) -> SearchResult {
222 |         let rl_len = self_ptr.restart_list_len();
223 |         // NOTE(review): assumes at least one restart point; `rl_len - 1` underflows otherwise
224 |         let mut start: u32 = 0;
225 |         let mut end = (rl_len as u32) - 1;
226 | 
227 |         // Binary search over restart indices until `start` and `end` are at most one apart
228 |         while end - start > 1 {
229 |             let mid = start + (end - start) / 2;
230 | 
231 |             // We always perform the search at the restart positions for efficiency
232 |             let offset = self_ptr.get_restart_offset(mid);
233 |             let (this_key, entry) = Self::get_entry_at_offset(self_ptr.clone(), offset, &[]);
234 | 
235 |             match this_key.as_slice().cmp(key) {
236 |                 Ordering::Equal => {
237 |                     // Exact match
238 |                     return SearchResult::ExactMatch(entry);
239 |                 }
240 |                 Ordering::Less => {
241 |                     // continue with right half
242 |                     start = mid;
243 |                 }
244 |                 Ordering::Greater => {
245 |                     // continue with left half
246 |                     end = mid;
247 |                 }
248 |             }
249 |         }
250 |         // `start` remains a restart index, while `end` is converted to a byte offset below.
251 |         // There is no restart entry past the last one, so in that case the
252 |         // sequential search has to run until the end of the entries
253 |         let end = if end + 1 == rl_len as u32 {
254 |             self_ptr.byte_len()
255 |         } else {
256 |             self_ptr.get_restart_offset(end)
257 |         };
258 | 
259 |         SearchResult::Range(start, end)
260 |     }
261 | 
262 |     /// Look up the entry stored for exactly this key
263 |     /// Returns None if the block does not contain the key
264 |     #[tracing::instrument(skip(self_ptr, key))]
265 |     pub fn get_by_key(self_ptr: &Arc<Self>, key: &[u8]) -> Option<DataEntry> {
266 |         // A negative bloom filter answer ends the lookup without any search
267 |         #[cfg(feature = "bloom-filters")]
268 |         if !self_ptr.bloom_filter.check(key) {
269 |             return None;
270 |         }
271 | 
272 |         // Narrow the search down to a range of entries first
273 |         let (start, end) = match Self::binary_search(self_ptr, key) {
274 |             SearchResult::ExactMatch(entry) => return Some(entry),
275 |             SearchResult::Range(start, end) => (start, end),
276 |         };
277 | 
278 |         // Then scan that range entry by entry. The scan starts at a restart
279 |         // point, so no previous key is needed to decode the first entry.
280 |         let mut prev_key = Vec::new();
281 |         let mut cursor = self_ptr.get_restart_offset(start);
282 | 
283 |         while cursor < end {
284 |             let (candidate, entry) = Self::get_entry_at_offset(self_ptr.clone(), cursor, &prev_key);
285 |             if key == candidate {
286 |                 return Some(entry);
287 |             }
288 | 
289 |             cursor = entry.get_next_offset();
290 |             prev_key = candidate;
291 |         }
292 | 
293 |         None
294 |     }
295 | }
296 | 


--------------------------------------------------------------------------------
/src/data_blocks/builder.rs:
--------------------------------------------------------------------------------
  1 | use cfg_if::cfg_if;
  2 | 
  3 | use std::sync::Arc;
  4 | 
  5 | use crate::manifest::SeqNumber;
  6 | use crate::{Error, disk};
  7 | 
  8 | use zerocopy::IntoBytes;
  9 | 
 10 | use super::block::{DataBlockHeader, EntryHeader};
 11 | use super::{DataBlock, DataBlockId, DataBlocks, PrefixedKey};
 12 | 
 13 | #[cfg(feature = "bloom-filters")]
 14 | use bloomfilter::Bloom;
 15 | 
 16 | #[cfg(feature = "bloom-filters")]
 17 | use super::block::{BLOOM_HEADER_SIZE, BLOOM_ITEM_COUNT, BLOOM_LENGTH};
 18 | 
 19 | #[cfg(feature = "wisckey")]
 20 | use crate::data_blocks::ValueId;
 21 | 
 22 | pub struct DataBlockBuilder {
 23 |     data_blocks: Arc<DataBlocks>,
 24 |     data: Vec<u8>, // serialized block so far; begins with space reserved for the header
 25 | 
 26 |     /// The position/index of the next entry
 27 |     /// This is also the current number of entries in this block builder
 28 |     position: u32,
 29 | 
 30 |     /// The restart list holds the byte offsets of all entries whose key is stored in full
 31 |     /// This enables using binary search in get() instead of seeking linearly
 32 |     restart_list: Vec<u32>,
 33 | 
 34 |     #[cfg(feature = "bloom-filters")]
 35 |     bloom_filter: Bloom<[u8]>,
 36 | }
 37 | 
 38 | impl DataBlockBuilder {
 39 |     #[tracing::instrument(skip(data_blocks))]
 40 |     pub(super) fn new(data_blocks: Arc<DataBlocks>) -> Self {
 41 |         #[cfg(feature = "bloom-filters")]
 42 |         let bloom_filter =
 43 |             Bloom::new(BLOOM_LENGTH, BLOOM_ITEM_COUNT).expect("Failed to create bloom filter");
 44 |         Self {
 45 |             data_blocks,
 46 |             // Zeroed placeholder for the header; finish() overwrites it
 47 |             data: vec![0u8; std::mem::size_of::<DataBlockHeader>()],
 48 |             position: 0,
 49 |             restart_list: Vec::new(),
 50 |             #[cfg(feature = "bloom-filters")]
 51 |             bloom_filter,
 52 |         }
 53 |     }
 54 | 
 55 |     /// Append an entry; at a restart position `key` must hold the full key (prefix length 0)
 56 |     pub fn add_entry(
 57 |         &mut self,
 58 |         key: PrefixedKey,
 59 |         full_key: &[u8],
 60 |         seq_number: SeqNumber,
 61 |         entry_type: u8,
 62 |         #[cfg(not(feature = "wisckey"))] entry_data: &[u8],
 63 |         #[cfg(feature = "wisckey")] value_ref: ValueId,
 64 |     ) {
 65 |         if self.position % self.data_blocks.params.block_restart_interval == 0 {
 66 |             assert!(key.prefix_len == 0);
 67 |             self.restart_list.push(self.data.len() as u32);
 68 |         }
 69 | 
 70 |         cfg_if! {
 71 |             if #[cfg(feature="bloom-filters")] {
 72 |                 self.bloom_filter.set(full_key);
 73 |             } else {
 74 |                 let _ = full_key;
 75 |             }
 76 |         }
 77 | 
 78 |         let PrefixedKey { prefix_len, suffix } = key;
 79 |         let header = EntryHeader {
 80 |             prefix_len,
 81 |             suffix_len: suffix.len() as u32,
 82 |             seq_number,
 83 |             entry_type,
 84 |             #[cfg(feature = "wisckey")]
 85 |             value_batch: value_ref.0,
 86 |             #[cfg(feature = "wisckey")]
 87 |             value_offset: value_ref.1,
 88 |             #[cfg(not(feature = "wisckey"))]
 89 |             value_length: entry_data.len() as u64,
 90 |         };
 91 | 
 92 |         // Entry layout: header, key suffix and (without WiscKey) the value
 93 |         self.data.extend_from_slice(header.as_bytes());
 94 |         self.data.extend_from_slice(&suffix);
 95 |         #[cfg(not(feature = "wisckey"))]
 96 |         self.data.extend_from_slice(entry_data);
 97 |         self.position += 1;
 98 |     }
 99 | 
100 |     /// Finish building and return the id of the new data block
101 |     ///
102 |     /// This will return Ok(None) if the builder did not have any entries
103 |     /// An error might be generated if we failed to write to disk
104 |     #[tracing::instrument(skip(self))]
105 |     pub async fn finish(mut self) -> Result<Option<DataBlockId>, Error> {
106 |         if self.position == 0 {
107 |             return Ok(None);
108 |         }
109 | 
110 |         let identifier = self
111 |             .data_blocks
112 |             .manifest
113 |             .generate_next_data_block_id()
114 |             .await;
115 | 
116 |         #[cfg(feature = "bloom-filters")]
117 |         let bloom_filter: &[u8; BLOOM_LENGTH + BLOOM_HEADER_SIZE] =
118 |             self.bloom_filter.as_slice().try_into().unwrap();
119 | 
120 |         let header = DataBlockHeader {
121 |             #[cfg(feature = "bloom-filters")]
122 |             bloom_filter: *bloom_filter,
123 |             number_of_entries: self.position,
124 |             restart_list_start: self.data.len() as u32,
125 |         };
126 | 
127 |         // Write the header into the space reserved at the start of the buffer
128 |         self.data[..std::mem::size_of::<DataBlockHeader>()].copy_from_slice(header.as_bytes());
129 | 
130 |         // Append the restart list after the entries (its start was recorded in the header above)
131 |         for restart_offset in self.restart_list.drain(..) {
132 |             self.data.extend_from_slice(restart_offset.as_bytes());
133 |         }
134 | 
135 |         let block = Arc::new(DataBlock {
136 |             data: self.data,
137 |             num_entries: header.number_of_entries,
138 |             restart_interval: self.data_blocks.params.block_restart_interval,
139 |             restart_list_start: header.restart_list_start as usize,
140 |             #[cfg(feature = "bloom-filters")]
141 |             bloom_filter: self.bloom_filter,
142 |         });
143 |         let shard_id = DataBlocks::block_to_shard_id(identifier);
144 | 
145 |         // Store on disk before grabbing the cache lock
146 |         let block_data = &block.data;
147 |         let fpath = self.data_blocks.get_file_path(&identifier);
148 | 
149 |         disk::write(&fpath, block_data).await.map_err(|err| {
150 |             Error::from_io_error(format!("Failed to write data block at `{fpath:?}`"), err)
151 |         })?;
152 | 
153 |         self.data_blocks.block_caches[shard_id]
154 |             .lock()
155 |             .put(identifier, block);
156 | 
157 |         Ok(Some(identifier))
158 |     }
159 | 
160 |     /// Current size of the block in bytes (header and entries; finish() adds the restart list)
161 |     pub fn current_size(&self) -> usize {
162 |         self.data.len()
163 |     }
164 | }
165 | 


--------------------------------------------------------------------------------
/src/data_blocks/mod.rs:
--------------------------------------------------------------------------------
  1 | /// Data blocks hold the actual contents of a sorted table
  2 | /// (In the case of WiscKey the content is only the key and the value reference)
  3 | use std::num::NonZeroUsize;
  4 | use std::sync::Arc;
  5 | 
  6 | use parking_lot::Mutex;
  7 | 
  8 | use lru::LruCache;
  9 | 
 10 | use zerocopy::FromBytes;
 11 | 
 12 | use crate::Params;
 13 | use crate::manifest::Manifest;
 14 | use crate::{WriteOp, disk};
 15 | 
 16 | mod builder;
 17 | pub use builder::DataBlockBuilder;
 18 | 
 19 | mod block;
 20 | pub use block::DataBlock;
 21 | 
 22 | use block::EntryHeader;
 23 | 
 24 | #[cfg(feature = "wisckey")]
 25 | use crate::values::ValueId;
 26 | 
 27 | pub type DataBlockId = u64;
 28 | 
 29 | /// The minimum valid data block identifier
 30 | pub const MIN_DATA_BLOCK_ID: DataBlockId = 1;
 31 | /// Number of shards the block cache is split into (each shard has its own lock)
 32 | const NUM_SHARDS: NonZeroUsize = NonZeroUsize::new(64).unwrap();
 33 | 
 34 | #[derive(Debug)]
 35 | pub struct PrefixedKey {
 36 |     prefix_len: u32, // number of leading bytes shared with the previous key in the block
 37 |     suffix: Vec<u8>, // the remaining bytes of the key
 38 | }
 39 | 
 40 | impl PrefixedKey {
 41 |     /// Create a key that shares its first `prefix_len` bytes with the preceding key
 42 |     pub fn new(prefix_len: usize, suffix: Vec<u8>) -> Self {
 43 |         // The entry header stores this length as a u32
 44 |         let prefix_len = prefix_len as u32;
 45 |         Self { prefix_len, suffix }
 46 |     }
 47 | }
 48 | 
 49 | type BlockShard = LruCache<DataBlockId, Arc<DataBlock>>; // one LRU shard of the block cache
 50 | 
 51 | pub enum DataEntryType {
 52 |     Put,    // the entry's type byte is WriteOp::PUT_OP
 53 |     Delete, // the entry's type byte is WriteOp::DELETE_OP
 54 | }
 55 | 
 56 | #[derive(Clone)]
 57 | pub struct DataEntry {
 58 |     /// The block containing this entry
 59 |     block: Arc<DataBlock>,
 60 | 
 61 |     /// The byte offset of this entry's header in the block's buffer (block header included)
 62 |     offset: usize,
 63 | 
 64 |     /// The end of this entry, relative to the end of the block header
 65 |     next_offset: u32,
 66 | }
 67 | 
 68 | enum SearchResult {
 69 |     ExactMatch(DataEntry), // a restart entry has exactly the searched key
 70 |     Range(u32, u32), // (restart index to start scanning at, byte offset to stop at)
 71 | }
 72 | 
 73 | impl DataEntry {
 74 |     /// Borrow this entry's header straight from the block's buffer
 75 |     fn get_header(&self) -> &EntryHeader {
 76 |         let header_end = self.offset + std::mem::size_of::<EntryHeader>();
 77 |         let header_data = &self.block.data[self.offset..header_end];
 78 |         EntryHeader::ref_from_bytes(header_data).expect("Failed to read entry header")
 79 |     }
 80 | 
 81 |     /// The sequence number recorded in this entry's header
 82 |     pub fn get_sequence_number(&self) -> u64 {
 83 |         self.get_header().seq_number
 84 |     }
 85 | 
 86 |     /// Offset at which the following entry starts, relative to the end of the block header
 87 |     pub fn get_next_offset(&self) -> u32 {
 88 |         self.next_offset
 89 |     }
 90 | 
 91 |     /// Whether this entry is a put or a delete (panics on any other type byte)
 92 |     pub fn get_type(&self) -> DataEntryType {
 93 |         match self.get_header().entry_type {
 94 |             WriteOp::PUT_OP => DataEntryType::Put,
 95 |             WriteOp::DELETE_OP => DataEntryType::Delete,
 96 |             _ => panic!("Unknown data entry type"),
 97 |         }
 98 |     }
 99 | 
100 |     /// The value stored inline in the block; None if the entry is a delete
101 |     #[cfg(not(feature = "wisckey"))]
102 |     pub fn get_value(&self) -> Option<&[u8]> {
103 |         let header = self.get_header();
104 | 
105 |         match header.entry_type {
106 |             WriteOp::PUT_OP => {
107 |                 // The value directly follows the header and the key suffix
108 |                 let start =
109 |                     self.offset + std::mem::size_of::<EntryHeader>() + (header.suffix_len as usize);
110 |                 let end = start + (header.value_length as usize);
111 |                 Some(&self.block.data[start..end])
112 |             }
113 |             WriteOp::DELETE_OP => None,
114 |             _ => panic!("Unknown write op"),
115 |         }
116 |     }
117 | 
118 |     /// Identifier of the entry's value (batch id and offset); None if the entry is a delete
119 |     #[cfg(feature = "wisckey")]
120 |     pub fn get_value_id(&self) -> Option<ValueId> {
121 |         let header = self.get_header();
122 | 
123 |         match header.entry_type {
124 |             WriteOp::PUT_OP => Some((header.value_batch, header.value_offset)),
125 |             WriteOp::DELETE_OP => None,
126 |             _ => panic!("Unknown write op"),
127 |         }
128 |     }
129 | }
130 | 
131 | /// Keeps track of all in-memory data blocks
132 | pub struct DataBlocks {
133 |     params: Arc<Params>,
134 |     block_caches: Vec<Mutex<BlockShard>>, // NUM_SHARDS caches; a block's shard is `id % NUM_SHARDS`
135 |     manifest: Arc<Manifest>, // hands out the ids for newly built blocks
136 | }
137 | 
138 | impl DataBlocks {
139 |     /// Set up the (initially empty) sharded block cache
140 |     pub fn new(params: Arc<Params>, manifest: Arc<Manifest>) -> Self {
141 |         // Half of the open-file budget goes to data blocks, spread evenly over the shards
142 |         let max_data_files = NonZeroUsize::new(params.max_open_files / 2)
143 |             .expect("Max open files needs to be greater than 2");
144 |         let shard_size = NonZeroUsize::new(max_data_files.get() / NUM_SHARDS)
145 |             .expect("Not enough open files to support the number of shards");
146 | 
147 |         let block_caches = (0..NUM_SHARDS.get())
148 |             .map(|_| Mutex::new(BlockShard::new(shard_size)))
149 |             .collect();
150 | 
151 |         Self {
152 |             params,
153 |             block_caches,
154 |             manifest,
155 |         }
156 |     }
157 | 
158 |     #[inline]
159 |     fn block_to_shard_id(block_id: DataBlockId) -> usize {
160 |         (block_id as usize) % NUM_SHARDS
161 |     }
162 | 
163 |     /// The path of the file that holds (or will hold)
164 |     /// the block with the given id
165 |     #[inline]
166 |     fn get_file_path(&self, block_id: &DataBlockId) -> std::path::PathBuf {
167 |         self.params.db_path.join(format!("key{block_id:08}.data"))
168 |     }
169 | 
170 |     /// Start creation of a new block
171 |     #[tracing::instrument(skip(self_ptr))]
172 |     pub fn build_block(self_ptr: Arc<DataBlocks>) -> DataBlockBuilder {
173 |         DataBlockBuilder::new(self_ptr)
174 |     }
175 | 
176 |     /// Get a block by its id
177 |     /// Will either return the block from cache or load it from disk
178 |     #[tracing::instrument(skip(self))]
179 |     pub async fn get_block(&self, id: &DataBlockId) -> Arc<DataBlock> {
180 |         let cache = &self.block_caches[Self::block_to_shard_id(*id)];
181 | 
182 |         // The lock guard is a temporary and is released again right after the lookup
183 |         let cached = cache.lock().get(id).cloned();
184 |         if let Some(block) = cached {
185 |             return block;
186 |         }
187 | 
188 |         // Do not hold the lock while loading from disk for better concurrency
189 |         // Worst case this means we load the same block multiple times...
190 |         let fpath = self.get_file_path(id);
191 |         log::trace!("Loading data block from disk at {fpath:?}");
192 |         let data = match disk::read(&fpath, 0).await {
193 |             Ok(data) => data,
194 |             Err(err) => panic!("Failed to load data block from disk at {fpath:?}: {err}"),
195 |         };
196 |         let restart_interval = self.params.block_restart_interval;
197 |         let block = Arc::new(DataBlock::new_from_data(data, restart_interval));
198 | 
199 |         cache.lock().put(*id, block.clone());
200 |         log::trace!("Stored new block in cache");
201 |         block
202 |     }
203 | }
204 | 
205 | #[cfg(test)]
206 | mod tests {
207 |     use super::*;
208 |     use tempfile::tempdir;
209 | 
210 |     #[cfg(feature = "tokio-uring")]
211 |     use kioto_uring_executor::test as async_test;
212 | 
213 |     #[cfg(feature = "monoio")]
214 |     use monoio::test as async_test;
215 | 
216 |     #[cfg(not(feature = "_async-io"))]
217 |     use tokio::test as async_test;
218 | 
219 |     #[cfg(feature = "wisckey")]
220 |     #[async_test]
221 |     async fn store_and_load() {
222 |         let dir = tempdir().unwrap();
223 |         let params = Arc::new(Params {
224 |             db_path: dir.path().to_path_buf(),
225 |             ..Default::default()
226 |         });
227 | 
228 |         let manifest = Arc::new(Manifest::new(params.clone()).await);
229 | 
230 |         let data_blocks = Arc::new(DataBlocks::new(params.clone(), manifest));
231 |         let mut builder = DataBlocks::build_block(data_blocks.clone());
232 |         // The first entry of a block sits at a restart point, so it carries the whole key
233 |         let key1 = PrefixedKey {
234 |             prefix_len: 0,
235 |             suffix: vec![5],
236 |         };
237 |         let seq1 = 14234524;
238 |         let val1 = (4, 2);
239 |         builder.add_entry(key1, &[5], seq1, WriteOp::PUT_OP, val1);
240 |         // The second entry shares its first byte with the previous key
241 |         let key2 = PrefixedKey {
242 |             prefix_len: 1,
243 |             suffix: vec![2],
244 |         };
245 |         let seq2 = 424234;
246 |         let val2 = (4, 5);
247 |         builder.add_entry(key2, &[5, 2], seq2, WriteOp::PUT_OP, val2);
248 | 
249 |         let id = builder.finish().await.unwrap().unwrap();
250 |         let data_block1 = data_blocks.get_block(&id).await;
251 |         let data_block2 = Arc::new(DataBlock::new_from_data(
252 |             data_block1.data.clone(),
253 |             params.block_restart_interval,
254 |         ));
255 |         // Walk the entries of the re-parsed block in order
256 |         let prev_key = vec![];
257 |         let (key, entry) = DataBlock::get_entry_at_offset(data_block2.clone(), 0, &prev_key);
258 | 
259 |         assert_eq!(key, vec![5]);
260 |         assert_eq!(entry.get_value_id(), Some(val1));
261 | 
262 |         let (key, entry) =
263 |             DataBlock::get_entry_at_offset(data_block2.clone(), entry.get_next_offset(), &key);
264 | 
265 |         assert_eq!(key, vec![5, 2]);
266 |         assert_eq!(entry.get_value_id(), Some(val2));
267 |         assert_eq!(entry.get_next_offset(), data_block2.byte_len());
268 |     }
269 | 
270 |     #[cfg(not(feature = "wisckey"))]
271 |     #[async_test]
272 |     async fn store_and_load() {
273 |         let dir = tempdir().unwrap();
274 |         let params = Arc::new(Params {
275 |             db_path: dir.path().to_path_buf(),
276 |             ..Default::default()
277 |         });
278 | 
279 |         let manifest = Arc::new(Manifest::new(params.clone()).await);
280 | 
281 |         let data_blocks = Arc::new(DataBlocks::new(params.clone(), manifest));
282 |         let mut builder = DataBlocks::build_block(data_blocks.clone());
283 |         // The first entry of a block sits at a restart point, so it carries the whole key
284 |         let key1 = PrefixedKey {
285 |             prefix_len: 0,
286 |             suffix: vec![5],
287 |         };
288 |         let seq1 = 14234524;
289 |         let val1 = vec![4, 2];
290 |         builder.add_entry(key1, &[5u8], seq1, WriteOp::PUT_OP, &val1);
291 |         // The second entry shares its first byte with the previous key
292 |         let key2 = PrefixedKey {
293 |             prefix_len: 1,
294 |             suffix: vec![2],
295 |         };
296 |         let seq2 = 424234;
297 |         let val2 = vec![24, 50];
298 |         builder.add_entry(key2, &[5u8, 2u8], seq2, WriteOp::PUT_OP, &val2);
299 | 
300 |         let id = builder.finish().await.unwrap().unwrap();
301 |         let data_block1 = data_blocks.get_block(&id).await;
302 |         let data_block2 = Arc::new(DataBlock::new_from_data(
303 |             data_block1.data.clone(),
304 |             params.block_restart_interval,
305 |         ));
306 |         // Walk the entries of the re-parsed block in order
307 |         let prev_key = vec![];
308 |         let (key, entry) = DataBlock::get_entry_at_offset(data_block2.clone(), 0, &prev_key);
309 | 
310 |         assert_eq!(key, vec![5]);
311 |         assert_eq!(entry.get_value(), Some(&val1[..]));
312 | 
313 |         let (key, entry) =
314 |             DataBlock::get_entry_at_offset(data_block2.clone(), entry.get_next_offset(), &key);
315 | 
316 |         assert_eq!(key, vec![5, 2]);
317 |         assert_eq!(entry.get_value(), Some(&val2[..]));
318 |         assert_eq!(entry.get_next_offset(), data_block2.byte_len());
319 |     }
320 | }
321 | 


--------------------------------------------------------------------------------
/src/database.rs:
--------------------------------------------------------------------------------
  1 | use crate::iterate::DbIterator;
  2 | use crate::logic::{DbLogic, EntryRef};
  3 | use crate::tasks::{TaskManager, TaskType};
  4 | use crate::{Error, Key, Params, StartMode, Value, WriteBatch, WriteOptions};
  5 | 
  6 | use std::sync::Arc;
  7 | 
  8 | /// The main database structure
  9 | /// This struct can be accessed concurrently and you should
 10 | /// never instantiate it more than once for the same on-disk files
 11 | pub struct Database {
 12 |     inner: Arc<DbLogic>, // the database logic that all reads and writes are forwarded to
 13 |     tasks: Arc<TaskManager>, // background tasks; woken up when a compaction is needed
 14 | }
 15 | 
 16 | impl Database {
 17 |     /// Create a new database instance with default parameters
 18 |     pub async fn new(mode: StartMode) -> Result<Self, Error> {
 19 |         // Same as new_with_params(), just with the defaults filled in
 20 |         Self::new_with_params(mode, Params::default()).await
 21 |     }
 22 | 
 23 |     /// Create a new database instance with specific parameters
 24 |     pub async fn new_with_params(mode: StartMode, params: Params) -> Result<Self, Error> {
 25 |         // Read this setting first, because `params` is moved into the logic below
 26 |         let compaction_concurrency = params.compaction_concurrency;
 27 | 
 28 |         let inner = Arc::new(DbLogic::new(mode, params).await?);
 29 |         let tasks = Arc::new(TaskManager::new(inner.clone(), compaction_concurrency).await);
 30 |         Ok(Self { inner, tasks })
 31 |     }
 32 | 
 33 |     /// Fetch the entry for `key` (the result of the lookup is passed through as-is)
 34 |     ///
 35 |     /// Also wakes up the level compaction task if the lookup reports that one is needed
 36 |     #[tracing::instrument(skip(self, key))]
 37 |     pub async fn get(&self, key: &[u8]) -> Result<Option<EntryRef>, Error> {
 38 |         let (needs_compaction, data) = self.inner.get(key).await?;
 39 | 
 40 |         // Compaction runs in the background; the result is returned right away
 41 |         if needs_compaction {
 42 |             self.tasks.wake_up(&TaskType::LevelCompaction);
 43 |         }
 44 | 
 45 |         Ok(data)
 46 |     }
 47 | 
 48 |     /// Delete an existing entry
 49 |     /// For efficiency, the datastore does not check whether the key actually existed
 50 |     /// Instead, it will just mark the most recent version (which could be the first one) as deleted
 51 |     #[tracing::instrument(skip(self, key))]
 52 |     pub async fn delete(&self, key: Key) -> Result<(), Error> {
 53 |         let opts = WriteOptions::default();
 54 |         let mut batch = WriteBatch::new();
 55 |         batch.delete(key);
 56 |         self.write_opts(batch, &opts).await
 57 |     }
 58 | 
 59 |     /// Ensure all data is written to disk
 60 |     /// Only has an effect if there were previous writes with sync=false
 61 |     pub async fn synchronize(&self) -> Result<(), Error> {
 62 |         self.inner.synchronize().await
 63 |     }
 64 | 
 65 |     /// Like delete(), but with caller-supplied write options
 66 |     pub async fn delete_opts(&self, key: Key, opts: &WriteOptions) -> Result<(), Error> {
 67 |         let mut batch = WriteBatch::new();
 68 |         batch.delete(key);
 69 |         self.write_opts(batch, opts).await
 70 |     }
 71 | 
 72 |     /// Insert or update a single entry
 73 |     pub async fn put(&self, key: Key, value: Value) -> Result<(), Error> {
 74 |         const DEFAULT_OPTS: WriteOptions = WriteOptions::new();
 75 |         self.put_opts(key, value, &DEFAULT_OPTS).await
 76 |     }
 77 | 
 78 |     /// Insert or update a single entry (with additional options)
 79 |     #[tracing::instrument(skip(self))]
 80 |     pub async fn put_opts(&self, key: Key, value: Value, opts: &WriteOptions) -> Result<(), Error> {
 81 |         let mut single_put = WriteBatch::new();
 82 |         single_put.put(key, value);
 83 |         self.write_opts(single_put, opts).await
 84 |     }
 85 | 
 86 |     /// Iterate over all entries in the database
 87 |     pub async fn iter(&self) -> DbIterator {
 88 |         let (mem_iters, table_iters, min_key, max_key) = self.inner.prepare_iter(None, None).await;
 89 |         let reverse = false;
 90 |         DbIterator::new(
 91 |             mem_iters,
 92 |             table_iters,
 93 |             min_key,
 94 |             max_key,
 95 |             reverse,
 96 |             #[cfg(feature = "wisckey")]
 97 |             self.inner.get_value_log(),
 98 |         )
 99 |     }
100 | 
101 |     /// Like iter(), but will only include entries with keys in [min_key;max_key)
102 |     pub async fn range_iter(&self, min_key: &[u8], max_key: &[u8]) -> DbIterator {
103 |         let (mem_iters, table_iters, min_key, max_key) =
104 |             self.inner.prepare_iter(Some(min_key), Some(max_key)).await;
105 |         let reverse = false;
106 |         DbIterator::new(
107 |             mem_iters,
108 |             table_iters,
109 |             min_key,
110 |             max_key,
111 |             reverse,
112 |             #[cfg(feature = "wisckey")]
113 |             self.inner.get_value_log(),
114 |         )
115 |     }
116 | 
117 |     /// Like range_iter(), but in reverse.
118 |     /// It will only include entries with keys in (min_key;max_key]
119 |     pub async fn reverse_range_iter(&self, max_key: &[u8], min_key: &[u8]) -> DbIterator {
120 |         let (mem_iters, table_iters, min_key, max_key) = self
121 |             .inner
122 |             .prepare_reverse_iter(Some(max_key), Some(min_key))
123 |             .await;
124 |         let (min_key, max_key) = (min_key.map(|k| k.to_vec()), max_key.map(|k| k.to_vec()));
125 |         DbIterator::new(
126 |             mem_iters,
127 |             table_iters,
128 |             min_key,
129 |             max_key,
130 |             true,
131 |             #[cfg(feature = "wisckey")]
132 |             self.inner.get_value_log(),
133 |         )
134 |     }
135 | 
136 |     /// Write a batch of updates to the database
137 |     ///
138 |     /// If you only want to write to a single key, use `Database::put` instead
139 |     pub async fn write(&self, write_batch: WriteBatch) -> Result<(), Error> {
140 |         const DEFAULT_OPTS: WriteOptions = WriteOptions::new();
141 |         self.write_opts(write_batch, &DEFAULT_OPTS).await
142 |     }
143 | 
144 |     /// Write a batch of updates to the database
145 |     /// This version of write allows you to specify options such as "synchronous"
146 |     #[tracing::instrument(skip(self, write_batch, opts))]
147 |     pub async fn write_opts(
148 |         &self,
149 |         write_batch: WriteBatch,
150 |         opts: &WriteOptions,
151 |     ) -> Result<(), Error> {
152 |         // The write reports whether the memtable should be compacted now;
153 |         // in that case the background task is woken up
154 |         if self.inner.write_opts(write_batch, opts).await? {
155 |             self.tasks.wake_up(&TaskType::MemtableCompaction);
156 |         }
157 | 
158 |         Ok(())
159 |     }
160 | 
161 |     /// Stop all background tasks gracefully
162 |     pub async fn stop(&self) -> Result<(), Error> {
163 |         self.inner.stop().await?;
164 |         self.tasks.stop_all().await
165 |     }
166 | }
167 | 
168 | impl Drop for Database {
169 |     fn drop(&mut self) {
170 |         self.tasks.terminate();
171 |     }
172 | }
173 | 


--------------------------------------------------------------------------------
/src/disk.rs:
--------------------------------------------------------------------------------
  1 | #[cfg(feature = "tokio-uring")]
  2 | use tokio_uring::fs;
  3 | 
  4 | #[cfg(feature = "monoio")]
  5 | use monoio::fs;
  6 | 
  7 | #[cfg(not(feature = "_async-io"))]
  8 | use std::fs;
  9 | 
 10 | #[cfg(not(feature = "_async-io"))]
 11 | use std::io::{Read, Seek, Write};
 12 | 
 13 | use std::path::Path;
 14 | 
 15 | use cfg_if::cfg_if;
 16 | 
 17 | /// Read from the offset to the end of the file
 18 | ///
 19 | /// - This is not supported by tokio-uring yet, so it is added as a helper function here
 20 | #[cfg(feature = "_async-io")]
 21 | #[inline(always)]
 22 | #[tracing::instrument]
 23 | pub async fn read_uncompressed(fpath: &Path, offset: u64) -> Result<Vec<u8>, std::io::Error> {
 24 |     let file = fs::File::open(fpath).await?;
 25 |     let mut buffer = vec![0u8; 4096];
 26 |     let mut result = vec![];
 27 |     let mut pos = offset;
 28 | 
 29 |     loop {
 30 |         let (res, buf) = file.read_at(buffer, pos).await;
 31 | 
 32 |         match res {
 33 |             Ok(0) => return Ok(result),
 34 |             Ok(n) => {
 35 |                 buffer = buf;
 36 |                 result.extend_from_slice(&buffer[..n]);
 37 |                 pos += n as u64;
 38 |             }
 39 |             Err(err) => return Err(err),
 40 |         }
 41 |     }
 42 | }
 43 | 
 44 | #[cfg(not(feature = "_async-io"))]
 45 | #[inline(always)]
 46 | #[tracing::instrument]
 47 | pub async fn read_uncompressed(fpath: &Path, offset: u64) -> Result<Vec<u8>, std::io::Error> {
 48 |     let mut file = fs::File::open(fpath)?;
 49 | 
 50 |     if offset > 0 {
 51 |         file.seek(std::io::SeekFrom::Start(offset))?;
 52 |     }
 53 | 
 54 |     let mut buf = vec![];
 55 |     file.read_to_end(&mut buf)?;
 56 | 
 57 |     Ok(buf)
 58 | }
 59 | 
 60 | /// Read the contents of the file from the given offset to
 61 | /// its end.
 62 | #[inline(always)]
 63 | #[tracing::instrument]
 64 | pub async fn read(fpath: &Path, offset: u64) -> Result<Vec<u8>, std::io::Error> {
 65 |     let compressed = read_uncompressed(fpath, offset).await?;
 66 | 
 67 |     cfg_if! {
 68 |         if #[ cfg(feature="snappy-compression") ] {
 69 |             let mut decoder = snap::raw::Decoder::new();
 70 |             Ok(decoder.decompress_vec(&compressed)?)
 71 |         } else {
 72 |             Ok(compressed)
 73 |         }
 74 |     }
 75 | }
 76 | 
 77 | /// Writes the data to the specified file path
 78 | ///
 79 | /// This will create the file if it does not exist yet.
 80 | /// It will also compress the data, if enabled.
 81 | #[tracing::instrument(skip(data))]
 82 | #[inline(always)]
 83 | pub async fn write(fpath: &Path, data: &[u8]) -> Result<(), std::io::Error> {
 84 |     //TODO it might be worth investigating if encoding/decoding
 85 |     // chunks is more efficient
 86 | 
 87 |     cfg_if! {
 88 |         if #[cfg(feature="snappy-compression") ] {
 89 |             let mut encoder = snap::raw::Encoder::new();
 90 |             let compressed = encoder.compress_vec(data)
 91 |                 .expect("Failed to compress data");
 92 |         } else {
 93 |             let mut compressed = vec![];
 94 |             compressed.extend_from_slice(data);
 95 |         }
 96 |     }
 97 | 
 98 |     write_uncompressed(fpath, compressed).await
 99 | }
100 | 
101 | /// Writes the uncompressed (even if the feature is enabled)
102 | /// to the specified file path
103 | ///
104 | /// This will create the file if it does not exist yet.
105 | #[tracing::instrument(skip(data))]
106 | #[inline(always)]
107 | pub async fn write_uncompressed(fpath: &Path, data: Vec<u8>) -> Result<(), std::io::Error> {
108 |     cfg_if! {
109 |         if #[ cfg(feature="_async-io") ] {
110 |             let file = fs::OpenOptions::new().create(true)
111 |                 .truncate(true).write(true)
112 |                 .open(fpath).await?;
113 | 
114 |             let (res, _buf) = file.write_all_at(data, 0).await;
115 |             res?;
116 |             file.sync_all().await?;
117 |         } else {
118 |             let mut file = fs::OpenOptions::new().create(true)
119 |                 .truncate(true).write(true)
120 |                 .open(fpath)?;
121 | 
122 |             file.write_all(&data)?;
123 |             file.sync_all()?;
124 |         }
125 |     }
126 | 
127 |     Ok(())
128 | }
129 | 
130 | pub async fn remove_file(fpath: &Path) -> Result<(), std::io::Error> {
131 |     cfg_if! {
132 |         if #[ cfg(feature="tokio-uring") ] {
133 |             tokio_uring::fs::remove_file(fpath).await
134 |         } else {
135 |             std::fs::remove_file(fpath)?;
136 |             Ok(())
137 |         }
138 |     }
139 | }
140 | 


--------------------------------------------------------------------------------
/src/index_blocks.rs:
--------------------------------------------------------------------------------
  1 | use std::cmp::Ordering;
  2 | use std::mem::size_of;
  3 | use std::path::Path;
  4 | 
  5 | use crate::data_blocks::DataBlockId;
  6 | use crate::sorted_table::TableId;
  7 | use crate::{Error, disk};
  8 | use crate::{Key, Params};
  9 | 
 10 | use zerocopy::{FromBytes, Immutable, IntoBytes, KnownLayout};
 11 | 
 12 | #[derive(Debug, IntoBytes, KnownLayout, Immutable, FromBytes)]
 13 | #[repr(C, packed)]
 14 | struct IndexBlockHeader {
 15 |     size: u64,
 16 |     min_key_len: u32,
 17 |     max_key_len: u32,
 18 |     num_data_blocks: u32,
 19 |     _padding: u32,
 20 | }
 21 | 
 22 | #[derive(IntoBytes, KnownLayout, Immutable, FromBytes)]
 23 | #[repr(C, packed)]
 24 | struct IndexEntryHeader {
 25 |     block_id: DataBlockId,
 26 |     key_len: u32,
 27 |     _padding: u32,
 28 | }
 29 | 
 30 | /** Index blocks hold metadata about a sorted table
 31 |  * Each table has exactly one index block
 32 |  *
 33 |  * Layout:
 34 |  *   - Header
 35 |  *   - Min key bytes
 36 |  *   - Max key bytes
 37 |  *   - Offset list
 38 |  *   - Index Entries
 39 | **/
 40 | pub struct IndexBlock {
 41 |     data: Vec<u8>,
 42 | }
 43 | 
 44 | impl IndexBlock {
 45 |     pub async fn new(
 46 |         params: &Params,
 47 |         id: TableId,
 48 |         index: Vec<(Key, DataBlockId)>,
 49 |         size: u64,
 50 |         min_key: Key,
 51 |         max_key: Key,
 52 |     ) -> Result<Self, Error> {
 53 |         let header = IndexBlockHeader {
 54 |             size,
 55 |             min_key_len: min_key.len() as u32,
 56 |             max_key_len: max_key.len() as u32,
 57 |             num_data_blocks: index.len() as u32,
 58 |             _padding: 0,
 59 |         };
 60 | 
 61 |         let mut block_data = header.as_bytes().to_vec();
 62 |         block_data.extend_from_slice(&min_key);
 63 |         block_data.extend_from_slice(&max_key);
 64 | 
 65 |         crate::add_padding(&mut block_data);
 66 | 
 67 |         // Reserve space for offsets
 68 |         let offset_start = block_data.len();
 69 |         let offset_len = crate::pad_offset(index.len());
 70 |         block_data.append(&mut vec![0u8; offset_len * size_of::<u32>()]);
 71 | 
 72 |         for (pos, (key, block_id)) in index.into_iter().enumerate() {
 73 |             let header = IndexEntryHeader {
 74 |                 block_id,
 75 |                 key_len: key.len() as u32,
 76 |                 _padding: 0,
 77 |             };
 78 | 
 79 |             let entry_offset = block_data.len() as u32;
 80 | 
 81 |             block_data[offset_start + pos * size_of::<u32>()
 82 |                 ..offset_start + (pos + 1) * size_of::<u32>()]
 83 |                 .copy_from_slice(entry_offset.as_bytes());
 84 | 
 85 |             block_data.extend_from_slice(header.as_bytes());
 86 |             block_data.extend_from_slice(&key);
 87 |         }
 88 | 
 89 |         // Store on disk before grabbing the lock
 90 |         let fpath = Self::get_file_path(params, &id);
 91 |         disk::write(&fpath, &block_data)
 92 |             .await
 93 |             .map_err(|err| Error::from_io_error("Failed to write index block", err))?;
 94 | 
 95 |         Ok(IndexBlock { data: block_data })
 96 |     }
 97 | 
 98 |     pub async fn load(params: &Params, id: TableId) -> Result<Self, Error> {
 99 |         log::trace!("Loading index block from disk");
100 |         let fpath = Self::get_file_path(params, &id);
101 |         let data = disk::read(&fpath, 0)
102 |             .await
103 |             .map_err(|err| Error::from_io_error("Failed to read index block", err))?;
104 | 
105 |         Ok(IndexBlock { data })
106 |     }
107 | 
108 |     /// where is this index block located on disk?
109 |     #[inline]
110 |     fn get_file_path(params: &Params, block_id: &TableId) -> std::path::PathBuf {
111 |         let fname = format!("idx{block_id:08}.data");
112 |         params.db_path.join(Path::new(&fname))
113 |     }
114 | 
115 |     fn get_header(&self) -> &IndexBlockHeader {
116 |         IndexBlockHeader::ref_from_prefix(&self.data[..]).unwrap().0
117 |     }
118 | 
119 |     fn get_entry_offset(&self, pos: usize) -> usize {
120 |         let header = self.get_header();
121 |         assert!((pos as u32) < header.num_data_blocks);
122 | 
123 |         let offset = size_of::<IndexBlockHeader>()
124 |             + header.min_key_len as usize
125 |             + header.max_key_len as usize;
126 | 
127 |         let offset_offset = crate::pad_offset(offset) + pos * size_of::<u32>();
128 |         *u32::ref_from_prefix(&self.data[offset_offset..]).unwrap().0 as usize
129 |     }
130 | 
131 |     /// Get the unique id for the data block at the specified index
132 |     pub fn get_block_id(&self, pos: usize) -> DataBlockId {
133 |         let offset = self.get_entry_offset(pos);
134 | 
135 |         let entry_header = IndexEntryHeader::ref_from_bytes(
136 |             &self.data[offset..offset + size_of::<IndexEntryHeader>()],
137 |         )
138 |         .unwrap();
139 | 
140 |         entry_header.block_id
141 |     }
142 | 
143 |     /// Get the key for the data block at the specified index
144 |     pub fn get_block_key(&self, pos: usize) -> &[u8] {
145 |         let offset = self.get_entry_offset(pos);
146 | 
147 |         let (entry_header, _) = IndexEntryHeader::ref_from_prefix(&self.data[offset..]).unwrap();
148 | 
149 |         let key_start = offset + size_of::<IndexEntryHeader>();
150 |         &self.data[key_start..key_start + (entry_header.key_len as usize)]
151 |     }
152 | 
153 |     /// How many data blocks does this table have?
154 |     pub fn num_data_blocks(&self) -> usize {
155 |         self.get_header().num_data_blocks as usize
156 |     }
157 | 
158 |     /// The size of this table in bytes
159 |     /// (for WiscKey this just counts the references, not the values themselves)
160 |     pub fn get_size(&self) -> usize {
161 |         self.get_header().size as usize
162 |     }
163 | 
164 |     /// Whats the minimum key in this table?
165 |     pub fn get_min(&self) -> &[u8] {
166 |         let header = self.get_header();
167 |         let key_offset = size_of::<IndexBlockHeader>();
168 | 
169 |         &self.data[key_offset..key_offset + (header.min_key_len as usize)]
170 |     }
171 | 
172 |     /// What is the maximum key in this table?
173 |     pub fn get_max(&self) -> &[u8] {
174 |         let header = self.get_header();
175 |         let key_offset = size_of::<IndexBlockHeader>() + (header.min_key_len as usize);
176 | 
177 |         &self.data[key_offset..key_offset + (header.max_key_len as usize)]
178 |     }
179 | 
180 |     /// Search for a specific key
181 |     /// This will a return a data block id that *might* hold this entry or None
182 |     #[tracing::instrument(skip(self, key))]
183 |     pub fn binary_search(&self, key: &[u8]) -> Option<DataBlockId> {
184 |         if key < self.get_min() || key > self.get_max() {
185 |             return None;
186 |         }
187 | 
188 |         let header = self.get_header();
189 | 
190 |         let mut start = 0;
191 |         let mut end = (header.num_data_blocks as usize) - 1;
192 | 
193 |         while end - start > 1 {
194 |             let mid = (end - start) / 2 + start;
195 |             let mid_key = self.get_block_key(mid);
196 | 
197 |             match mid_key.cmp(key) {
198 |                 Ordering::Equal => {
199 |                     return Some(self.get_block_id(mid));
200 |                 }
201 |                 Ordering::Greater => {
202 |                     end = mid;
203 |                 }
204 |                 Ordering::Less => {
205 |                     start = mid;
206 |                 }
207 |             }
208 |         }
209 | 
210 |         assert!(key >= self.get_block_key(start));
211 | 
212 |         if key >= self.get_block_key(end) {
213 |             Some(self.get_block_id(end))
214 |         } else {
215 |             Some(self.get_block_id(start))
216 |         }
217 |     }
218 | }
219 | 


--------------------------------------------------------------------------------
/src/iterate.rs:
--------------------------------------------------------------------------------
  1 | use std::cmp::Ordering;
  2 | use std::future::Future;
  3 | use std::pin::Pin;
  4 | use std::task::{Context, Poll};
  5 | 
  6 | #[cfg(feature = "wisckey")]
  7 | use std::sync::Arc;
  8 | 
  9 | #[cfg(feature = "wisckey")]
 10 | use crate::values::ValueLog;
 11 | 
 12 | use crate::logic::EntryRef;
 13 | use crate::memtable::MemtableIterator;
 14 | use crate::sorted_table::{InternalIterator, TableIterator};
 15 | use crate::{Error, Key};
 16 | 
 17 | use futures::stream::Stream;
 18 | 
 19 | #[cfg(feature = "_async-io")]
 20 | type IterFuture = dyn Future<Output = Result<(DbIteratorInner, Option<(Key, EntryRef)>), Error>>;
 21 | 
 22 | #[cfg(not(feature = "_async-io"))]
 23 | type IterFuture =
 24 |     dyn Future<Output = Result<(DbIteratorInner, Option<(Key, EntryRef)>), Error>> + Send;
 25 | 
 26 | pub struct DbIterator {
 27 |     state: Option<Pin<Box<IterFuture>>>,
 28 | }
 29 | 
 30 | impl DbIterator {
 31 |     pub(crate) fn new(
 32 |         mem_iters: Vec<MemtableIterator>,
 33 |         table_iters: Vec<TableIterator>,
 34 |         min_key: Option<Vec<u8>>,
 35 |         max_key: Option<Vec<u8>>,
 36 |         reverse: bool,
 37 |         #[cfg(feature = "wisckey")] value_log: Arc<ValueLog>,
 38 |     ) -> Self {
 39 |         let inner = DbIteratorInner::new(
 40 |             mem_iters,
 41 |             table_iters,
 42 |             min_key,
 43 |             max_key,
 44 |             reverse,
 45 |             #[cfg(feature = "wisckey")]
 46 |             value_log,
 47 |         );
 48 |         let state = Box::pin(DbIteratorInner::next(inner));
 49 | 
 50 |         Self { state: Some(state) }
 51 |     }
 52 | }
 53 | 
 54 | impl Stream for DbIterator {
 55 |     type Item = (Key, EntryRef);
 56 | 
 57 |     fn poll_next(mut self: Pin<&mut Self>, ctx: &mut Context) -> Poll<Option<Self::Item>> {
 58 |         let (inner, res) = if let Some(mut fut) = self.state.take() {
 59 |             match Future::poll(fut.as_mut(), ctx) {
 60 |                 // return and keep waiting for result
 61 |                 Poll::Pending => {
 62 |                     self.state = Some(fut);
 63 |                     return Poll::Pending;
 64 |                 }
 65 |                 // item computation complete
 66 |                 Poll::Ready(result) => {
 67 |                     let (inner, res) = result.expect("iteration failed");
 68 |                     (inner, res)
 69 |                 }
 70 |             }
 71 |         } else {
 72 |             // no items left
 73 |             return Poll::Ready(None);
 74 |         };
 75 | 
 76 |         // Prepare next state?
 77 |         if res.is_some() {
 78 |             self.state = Some(Box::pin(DbIteratorInner::next(inner)));
 79 |         } else {
 80 |             self.state = None;
 81 |         }
 82 | 
 83 |         // return item
 84 |         Poll::Ready(res)
 85 |     }
 86 | }
 87 | 
 88 | struct DbIteratorInner {
 89 |     last_key: Option<Vec<u8>>,
 90 |     iterators: Vec<Box<dyn InternalIterator>>,
 91 | 
 92 |     reverse: bool,
 93 | 
 94 |     min_key: Option<Vec<u8>>,
 95 |     max_key: Option<Vec<u8>>,
 96 | 
 97 |     #[cfg(feature = "wisckey")]
 98 |     value_log: Arc<ValueLog>,
 99 | }
100 | 
101 | type NextKV = Option<(crate::manifest::SeqNumber, usize)>;
102 | 
103 | impl DbIteratorInner {
104 |     fn new(
105 |         mem_iters: Vec<MemtableIterator>,
106 |         table_iters: Vec<TableIterator>,
107 |         min_key: Option<Vec<u8>>,
108 |         max_key: Option<Vec<u8>>,
109 |         reverse: bool,
110 |         #[cfg(feature = "wisckey")] value_log: Arc<ValueLog>,
111 |     ) -> Self {
112 |         let mut iterators: Vec<Box<dyn InternalIterator>> = vec![];
113 |         for iter in mem_iters.into_iter() {
114 |             iterators.push(Box::new(iter));
115 |         }
116 |         for iter in table_iters.into_iter() {
117 |             iterators.push(Box::new(iter));
118 |         }
119 | 
120 |         Self {
121 |             iterators,
122 |             last_key: None,
123 |             min_key,
124 |             max_key,
125 |             reverse,
126 |             #[cfg(feature = "wisckey")]
127 |             value_log,
128 |         }
129 |     }
130 | 
131 |     /// Tries to pick the next value from the specified iterator
132 |     async fn parse_iter(&mut self, pos: usize, next_kv: NextKV) -> (bool, NextKV) {
133 |         // Split slices to make the borrow checker happy
134 |         let (prev, cur) = self.iterators[..].split_at_mut(pos);
135 |         let iter = &mut *cur[0];
136 | 
137 |         if self.reverse {
138 |             // This iterator might be "behind" other iterators
139 |             if let Some(last_key) = &self.last_key {
140 |                 while !iter.at_end() && iter.get_key() >= last_key.as_slice() {
141 |                     iter.step().await;
142 |                 }
143 |             }
144 | 
145 |             // Don't pick a key that is greater than the maximum
146 |             if let Some(max_key) = &self.max_key {
147 |                 while !iter.at_end() && iter.get_key() > max_key.as_slice() {
148 |                     iter.step().await;
149 |                 }
150 | 
151 |                 // There might be no key in this iterator that is <=max_key
152 |                 if iter.at_end() || iter.get_key() > max_key.as_slice() {
153 |                     return (false, next_kv);
154 |                 }
155 |             }
156 | 
157 |             if iter.at_end() {
158 |                 return (false, next_kv);
159 |             }
160 | 
161 |             let key = iter.get_key();
162 | 
163 |             // Don't pick a key that is less or equal to the minimum
164 |             if let Some(min_key) = &self.min_key
165 |                 && iter.get_key() <= min_key.as_slice()
166 |             {
167 |                 return (false, next_kv);
168 |             }
169 | 
170 |             let seq_number = iter.get_seq_number();
171 | 
172 |             if let Some((max_seq_number, max_pos)) = next_kv {
173 |                 let max_iter = &*prev[max_pos];
174 |                 let max_key = max_iter.get_key();
175 | 
176 |                 match key.cmp(max_key) {
177 |                     Ordering::Greater => (true, Some((seq_number, pos))),
178 |                     Ordering::Equal => {
179 |                         if seq_number > max_seq_number {
180 |                             (true, Some((seq_number, pos)))
181 |                         } else {
182 |                             (false, next_kv)
183 |                         }
184 |                     }
185 |                     Ordering::Less => (false, next_kv),
186 |                 }
187 |             } else {
188 |                 (true, Some((seq_number, pos)))
189 |             }
190 |         } else {
191 |             // This iterator might be "behind" other iterators
192 |             if let Some(last_key) = &self.last_key {
193 |                 while !iter.at_end() && iter.get_key() <= last_key.as_slice() {
194 |                     iter.step().await;
195 |                 }
196 |             }
197 | 
198 |             // Don't pick a key that is smaller than the minimum
199 |             if let Some(min_key) = &self.min_key {
200 |                 while !iter.at_end() && iter.get_key() < min_key.as_slice() {
201 |                     iter.step().await;
202 |                 }
203 | 
204 |                 // There might be no key in this iterator that is >=min_key
205 |                 if iter.at_end() || iter.get_key() < min_key.as_slice() {
206 |                     return (false, next_kv);
207 |                 }
208 |             }
209 | 
210 |             if iter.at_end() {
211 |                 return (false, next_kv);
212 |             }
213 | 
214 |             let key = iter.get_key();
215 | 
216 |             // Don't pick a key that is greater or equal to the maximum
217 |             if let Some(max_key) = &self.max_key
218 |                 && iter.get_key() >= max_key.as_slice()
219 |             {
220 |                 return (false, next_kv);
221 |             }
222 | 
223 |             let seq_number = iter.get_seq_number();
224 | 
225 |             if let Some((min_seq_number, min_pos)) = next_kv {
226 |                 let min_iter = &*prev[min_pos];
227 |                 let min_key = min_iter.get_key();
228 | 
229 |                 match key.cmp(min_key) {
230 |                     Ordering::Less => (true, Some((seq_number, pos))),
231 |                     Ordering::Equal => {
232 |                         if seq_number > min_seq_number {
233 |                             (true, Some((seq_number, pos)))
234 |                         } else {
235 |                             (false, next_kv)
236 |                         }
237 |                     }
238 |                     Ordering::Greater => (false, next_kv),
239 |                 }
240 |             } else {
241 |                 (true, Some((seq_number, pos)))
242 |             }
243 |         }
244 |     }
245 | 
246 |     async fn next(mut self) -> Result<(Self, Option<(Key, EntryRef)>), Error> {
247 |         let mut result = None;
248 | 
249 |         while result.is_none() {
250 |             let mut next_kv = None;
251 |             let num_iterators = self.iterators.len();
252 | 
253 |             for pos in 0..num_iterators {
254 |                 let (change, kv) = self.parse_iter(pos, next_kv).await;
255 | 
256 |                 if change {
257 |                     next_kv = kv;
258 |                 }
259 |             }
260 | 
261 |             if let Some((_, pos)) = next_kv.take() {
262 |                 let iter = &*self.iterators[pos];
263 | 
264 |                 let res_key = iter.get_key();
265 |                 self.last_key = Some(iter.get_key().to_vec());
266 | 
267 |                 #[cfg(feature = "wisckey")]
268 |                 let entry = iter.get_entry(&self.value_log).await;
269 |                 #[cfg(not(feature = "wisckey"))]
270 |                 let entry = iter.get_entry();
271 | 
272 |                 if let Some(entry) = entry {
273 |                     result = Some(Some((res_key.to_vec(), entry)));
274 |                 } else {
275 |                     // this is a deletion... skip
276 |                 }
277 |             } else {
278 |                 // at end
279 |                 result = Some(None);
280 |             };
281 |         }
282 | 
283 |         let (key, result) = match result.unwrap() {
284 |             Some(inner) => inner,
285 |             None => {
286 |                 return Ok((self, None));
287 |             }
288 |         };
289 | 
290 |         Ok((self, Some((key, result))))
291 |     }
292 | }
293 | 


--------------------------------------------------------------------------------
/src/level_logger.rs:
--------------------------------------------------------------------------------
 1 | use std::fs::File;
 2 | use std::time::Instant;
 3 | 
 4 | use crate::manifest::LevelId;
 5 | 
 6 | use parking_lot::Mutex;
 7 | 
 8 | struct Inner {
 9 |     start: Instant,
10 |     outfile: csv::Writer<File>,
11 |     num_tables: Vec<usize>,
12 | }
13 | 
14 | /// Locks changes to the number of tables in a level
15 | pub(crate) struct LevelLogger {
16 |     inner: Mutex<Inner>,
17 | }
18 | 
19 | impl LevelLogger {
20 |     pub fn new(path: &str, num_levels: usize) -> Self {
21 |         let outfile = csv::Writer::from_path(path).expect("Failed to create log file");
22 | 
23 |         let inner = Inner::new(outfile, num_levels);
24 | 
25 |         Self {
26 |             inner: Mutex::new(inner),
27 |         }
28 |     }
29 | 
30 |     pub fn l0_table_added(&self) {
31 |         let mut inner = self.inner.lock();
32 |         inner.num_tables[0] += 1;
33 | 
34 |         inner.write();
35 |     }
36 | 
37 |     pub fn compaction(&self, level: LevelId, added: usize, removed: usize) {
38 |         let mut inner = self.inner.lock();
39 |         inner.num_tables[level as usize] -= removed;
40 |         inner.num_tables[level as usize + 1] += added;
41 | 
42 |         inner.write();
43 |     }
44 | }
45 | 
46 | impl Inner {
47 |     fn new(mut outfile: csv::Writer<File>, num_levels: usize) -> Self {
48 |         let num_tables = vec![0; num_levels];
49 | 
50 |         let mut header = vec![format!("time")];
51 |         for idx in 0..num_levels {
52 |             header.push(format!("level{idx}"));
53 |         }
54 | 
55 |         outfile.write_record(&header).unwrap();
56 | 
57 |         Self {
58 |             outfile,
59 |             num_tables,
60 |             start: Instant::now(),
61 |         }
62 |     }
63 | 
64 |     fn write(&mut self) {
65 |         let mut record = vec![];
66 |         record.push(format!("{}", self.start.elapsed().as_millis()));
67 | 
68 |         for count in self.num_tables.iter() {
69 |             record.push(format!("{count}"));
70 |         }
71 | 
72 |         self.outfile.write_record(&record).unwrap();
73 |     }
74 | }
75 | 


--------------------------------------------------------------------------------
/src/lib.rs:
--------------------------------------------------------------------------------
  1 | #![feature(get_mut_unchecked)]
  2 | // Temporary workaround for the io_uring code
  3 | #![allow(clippy::arc_with_non_send_sync)]
  4 | 
  5 | pub mod iterate;
  6 | 
  7 | #[cfg(feature = "wisckey")]
  8 | pub mod values;
  9 | 
 10 | mod params;
 11 | pub use params::Params;
 12 | 
 13 | mod write_batch;
 14 | pub use write_batch::{WriteBatch, WriteOp, WriteOptions};
 15 | 
 16 | pub mod sorted_table;
 17 | 
 18 | mod level_logger;
 19 | 
 20 | pub mod memtable;
 21 | pub mod tasks;
 22 | 
 23 | pub mod logic;
 24 | pub use logic::EntryRef;
 25 | 
 26 | pub mod manifest;
 27 | 
 28 | mod data_blocks;
 29 | mod database;
 30 | mod disk;
 31 | mod index_blocks;
 32 | mod level;
 33 | mod wal;
 34 | 
 35 | pub type Key = Vec<u8>;
 36 | pub type Value = Vec<u8>;
 37 | 
 38 | /// Shorthand for a list of key-value pairs
 39 | #[cfg(feature = "wisckey")]
 40 | type EntryList = Vec<(Key, Value)>;
 41 | 
 42 | pub use database::Database;
 43 | 
 44 | /// How many bytes do we align by?
 45 | const WORD_SIZE: usize = 8;
 46 | 
 47 | fn pad_offset(offset: usize) -> usize {
 48 |     offset + compute_padding(offset)
 49 | }
 50 | 
 51 | fn compute_padding(offset: usize) -> usize {
 52 |     let remainder = offset % WORD_SIZE;
 53 |     if remainder == 0 {
 54 |         0
 55 |     } else {
 56 |         WORD_SIZE - remainder
 57 |     }
 58 | }
 59 | 
 60 | fn add_padding(data: &mut Vec<u8>) {
 61 |     let padding = compute_padding(data.len());
 62 |     if padding > 0 {
 63 |         data.resize(data.len() + padding, 0u8);
 64 |     }
 65 | }
 66 | 
 67 | #[derive(Clone, Debug)]
 68 | pub enum Error {
 69 |     Io { context: String, message: String },
 70 |     InvalidParams(String),
 71 |     Serialization(String),
 72 | }
 73 | 
 74 | impl std::fmt::Display for Error {
 75 |     fn fmt(&self, fmt: &mut std::fmt::Formatter) -> Result<(), std::fmt::Error> {
 76 |         match self {
 77 |             Self::Io { context, message } => {
 78 |                 fmt.write_fmt(format_args!("{context}: {message}"))?;
 79 |             }
 80 |             Self::InvalidParams(msg) => {
 81 |                 fmt.write_fmt(format_args!("Invalid Parameter: {msg}"))?;
 82 |             }
 83 |             Self::Serialization(msg) => {
 84 |                 fmt.write_fmt(format_args!("Serialization Error: {msg}"))?;
 85 |             }
 86 |         }
 87 | 
 88 |         Ok(())
 89 |     }
 90 | }
 91 | 
 92 | impl Error {
 93 |     fn from_io_error<S: ToString>(context: S, inner: std::io::Error) -> Self {
 94 |         Self::Io {
 95 |             context: context.to_string(),
 96 |             message: format!("{inner}"),
 97 |         }
 98 |     }
 99 | }
100 | 
/// Selects how the datastore treats existing data during startup
#[derive(Clone, Debug)]
pub enum StartMode {
    /// Open the database if it exists already and create it otherwise
    CreateOrOpen,
    /// Open an existing database; fails if there is none
    Open,
    /// Always start with a new database, overriding an existing one
    CreateOrOverride,
}
111 | 


--------------------------------------------------------------------------------
/src/memtable.rs:
--------------------------------------------------------------------------------
  1 | use std::cmp::Ordering;
  2 | use std::sync::Arc;
  3 | 
  4 | use async_trait::async_trait;
  5 | 
  6 | use crate::data_blocks::DataEntryType;
  7 | use crate::manifest::SeqNumber;
  8 | use crate::sorted_table::InternalIterator;
  9 | use crate::{EntryRef, Key, Params};
 10 | 
 11 | #[cfg(feature = "wisckey")]
 12 | use crate::values::ValueLog;
 13 | 
 14 | #[derive(Debug, Clone)]
 15 | pub struct MemtableRef {
 16 |     inner: Arc<Memtable>,
 17 | }
 18 | 
 19 | /// A reference to a memtable that cannot modify it
 20 | #[derive(Debug, Clone)]
 21 | pub struct ImmMemtableRef {
 22 |     inner: Arc<Memtable>,
 23 | }
 24 | 
 25 | #[derive(Clone, Debug, PartialEq, Eq)]
 26 | pub enum MemtableEntry {
 27 |     Value { seq_number: u64, value: Vec<u8> },
 28 |     Deletion { seq_number: u64 },
 29 | }
 30 | 
 31 | /// Reference to an entry in the memtable
 32 | /// TODO: make this zerocopy somehow
 33 | pub struct MemtableEntryRef {
 34 |     entry: MemtableEntry,
 35 | }
 36 | 
 37 | impl MemtableEntryRef {
 38 |     pub fn get_type(&self) -> DataEntryType {
 39 |         match &self.entry {
 40 |             MemtableEntry::Value { .. } => DataEntryType::Put,
 41 |             MemtableEntry::Deletion { .. } => DataEntryType::Delete,
 42 |         }
 43 |     }
 44 | 
 45 |     pub fn get_value(&self) -> Option<&[u8]> {
 46 |         match &self.entry {
 47 |             MemtableEntry::Value { value, .. } => Some(value),
 48 |             MemtableEntry::Deletion { .. } => None,
 49 |         }
 50 |     }
 51 | }
 52 | 
 53 | impl MemtableEntry {
 54 |     pub fn get_value(&self) -> Option<&[u8]> {
 55 |         match self {
 56 |             MemtableEntry::Value { value, .. } => Some(value),
 57 |             MemtableEntry::Deletion { .. } => None,
 58 |         }
 59 |     }
 60 | }
 61 | 
/// Iterates over a memtable and returns its contents in order
/// (or in reverse order, if `reverse` is set).
#[derive(Debug)]
pub struct MemtableIterator {
    /// The memtable being iterated
    inner: Arc<Memtable>,
    /// Index of the entry the next `step()` will load.
    /// It is moved one further past the valid range (below -1 when reversed,
    /// above the entry count otherwise) to mark the iterator as finished.
    next_index: i64,
    /// Key of the most recently loaded entry, if any
    key: Option<Key>,
    /// The most recently loaded entry, if any
    entry: Option<MemtableEntry>,
    /// Iterate from the last entry to the first?
    reverse: bool,
}
 71 | 
 72 | impl MemtableIterator {
 73 |     pub async fn new(inner: Arc<Memtable>, reverse: bool) -> Self {
 74 |         let next_index = if reverse {
 75 |             (inner.entries.len() as i64) - 1
 76 |         } else {
 77 |             0
 78 |         };
 79 | 
 80 |         let mut obj = Self {
 81 |             inner,
 82 |             reverse,
 83 |             key: None,
 84 |             entry: None,
 85 |             next_index,
 86 |         };
 87 | 
 88 |         obj.step().await;
 89 | 
 90 |         obj
 91 |     }
 92 | }
 93 | 
 94 | #[cfg_attr(feature="_async-io", async_trait(?Send))]
 95 | #[cfg_attr(not(feature = "_async-io"), async_trait)]
 96 | impl InternalIterator for MemtableIterator {
 97 |     #[tracing::instrument]
 98 |     async fn step(&mut self) {
 99 |         let entries = &self.inner.entries;
100 |         let num_entries = entries.len() as i64;
101 | 
102 |         if self.reverse {
103 |             match self.next_index.cmp(&(-1)) {
104 |                 Ordering::Less => {
105 |                     panic!("Cannot step(); already at end");
106 |                 }
107 |                 Ordering::Equal => {
108 |                     self.next_index -= 1;
109 |                 }
110 |                 Ordering::Greater => {
111 |                     let (key, entry) = entries[self.next_index as usize].clone();
112 |                     self.key = Some(key);
113 |                     self.entry = Some(entry);
114 |                     self.next_index -= 1;
115 |                 }
116 |             }
117 |         } else {
118 |             match self.next_index.cmp(&num_entries) {
119 |                 Ordering::Greater => {
120 |                     panic!("Cannot step(); already at end");
121 |                 }
122 |                 Ordering::Equal => {
123 |                     self.next_index += 1;
124 |                 }
125 |                 Ordering::Less => {
126 |                     let (key, entry) = entries[self.next_index as usize].clone();
127 |                     self.key = Some(key);
128 |                     self.entry = Some(entry);
129 |                     self.next_index += 1;
130 |                 }
131 |             }
132 |         }
133 |     }
134 | 
135 |     fn at_end(&self) -> bool {
136 |         if self.reverse {
137 |             self.next_index < -1
138 |         } else {
139 |             let len = self.inner.entries.len() as i64;
140 |             self.next_index > len
141 |         }
142 |     }
143 | 
144 |     fn get_key(&self) -> &[u8] {
145 |         self.key.as_ref().expect("Not a valid iterator")
146 |     }
147 | 
148 |     #[cfg(feature = "wisckey")]
149 |     async fn get_entry(&self, _value_log: &ValueLog) -> Option<EntryRef> {
150 |         self.entry.clone().map(|entry| EntryRef::Memtable {
151 |             entry: MemtableEntryRef { entry },
152 |         })
153 |     }
154 | 
155 |     #[cfg(not(feature = "wisckey"))]
156 |     fn get_entry(&self) -> Option<EntryRef> {
157 |         self.entry.clone().map(|entry| EntryRef::Memtable {
158 |             entry: MemtableEntryRef { entry },
159 |         })
160 |     }
161 | 
162 |     fn get_seq_number(&self) -> SeqNumber {
163 |         match self.entry.as_ref().unwrap() {
164 |             MemtableEntry::Value { seq_number, .. } | MemtableEntry::Deletion { seq_number } => {
165 |                 *seq_number
166 |             }
167 |         }
168 |     }
169 | 
170 |     fn get_entry_type(&self) -> DataEntryType {
171 |         match self.entry.as_ref().unwrap() {
172 |             MemtableEntry::Value { .. } => DataEntryType::Put,
173 |             MemtableEntry::Deletion { .. } => DataEntryType::Delete,
174 |         }
175 |     }
176 | }
177 | 
178 | impl ImmMemtableRef {
179 |     pub fn get(&self) -> &Memtable {
180 |         &self.inner
181 |     }
182 | 
183 |     pub async fn into_iter(self, reverse: bool) -> MemtableIterator {
184 |         MemtableIterator::new(self.inner, reverse).await
185 |     }
186 | }
187 | 
188 | impl MemtableRef {
189 |     pub fn wrap(inner: Memtable) -> Self {
190 |         Self {
191 |             inner: Arc::new(inner),
192 |         }
193 |     }
194 | 
195 |     /// Get an immutable reference to the same memtable
196 |     pub fn clone_immutable(&self) -> ImmMemtableRef {
197 |         ImmMemtableRef {
198 |             inner: self.inner.clone(),
199 |         }
200 |     }
201 | 
202 |     /// Make the current contents into an immutable memtable
203 |     /// and create a new mutable one
204 |     pub fn take(&mut self, next_seq_number: u64) -> ImmMemtableRef {
205 |         let mut inner = Arc::new(Memtable::new(next_seq_number));
206 |         std::mem::swap(&mut inner, &mut self.inner);
207 | 
208 |         ImmMemtableRef { inner }
209 |     }
210 | 
211 |     pub fn get(&self) -> &Memtable {
212 |         &self.inner
213 |     }
214 | 
215 |     /// This is only safe to call from the DbLogic while holding the memtable lock
216 |     pub(crate) unsafe fn get_mut(&mut self) -> &mut Memtable {
217 |         unsafe { Arc::get_mut_unchecked(&mut self.inner) }
218 |     }
219 | }
220 | 
/// In-memory representation of state that has not been written to level 0 yet.
/// This data structure does not exist on disk, but can be recreated from the write-ahead log
#[derive(Debug)]
pub struct Memtable {
    /// Updates sorted by key; holds at most one entry per key
    entries: Vec<(Vec<u8>, MemtableEntry)>,
    /// Accumulated length (in bytes) of all stored keys and values
    size: usize,
    /// Sequence number that will be assigned to the next update
    next_seq_number: SeqNumber,
}
230 | 
231 | impl Memtable {
232 |     pub fn new(next_seq_number: SeqNumber) -> Self {
233 |         let entries = Vec::new();
234 |         let size = 0;
235 | 
236 |         Self {
237 |             entries,
238 |             size,
239 |             next_seq_number,
240 |         }
241 |     }
242 | 
243 |     #[inline]
244 |     pub fn get_next_seq_number(&self) -> u64 {
245 |         self.next_seq_number
246 |     }
247 | 
248 |     pub fn get_min_max_key(&self) -> (&[u8], &[u8]) {
249 |         let len = self.entries.len();
250 | 
251 |         if len == 0 {
252 |             panic!("Memtable is empty");
253 |         }
254 | 
255 |         (&self.entries[0].0, &self.entries[len - 1].0)
256 |     }
257 | 
258 |     #[tracing::instrument(skip(self, key))]
259 |     pub fn get(&self, key: &[u8]) -> Option<MemtableEntryRef> {
260 |         match self.entries.binary_search_by_key(&key, |t| t.0.as_slice()) {
261 |             Ok(pos) => Some(MemtableEntryRef {
262 |                 entry: self.entries[pos].1.clone(),
263 |             }),
264 |             Err(_) => None,
265 |         }
266 |     }
267 | 
268 |     /// Get position were to insert the key
269 |     /// Will remove existing entries with the same key
270 |     fn get_key_pos(&mut self, key: &[u8]) -> usize {
271 |         match self.entries.binary_search_by_key(&key, |t| t.0.as_slice()) {
272 |             Ok(pos) => {
273 |                 // remove old entry
274 |                 let entry_len = {
275 |                     let (_, entry) = self.entries.remove(pos);
276 |                     match entry {
277 |                         MemtableEntry::Value { value, .. } => key.len() + value.len(),
278 |                         MemtableEntry::Deletion { .. } => key.len(),
279 |                     }
280 |                 };
281 | 
282 |                 self.size -= entry_len;
283 |                 pos
284 |             }
285 |             Err(pos) => pos,
286 |         }
287 |     }
288 | 
289 |     #[tracing::instrument(skip(self, key, value))]
290 |     pub fn put(&mut self, key: Vec<u8>, value: Vec<u8>) {
291 |         let pos = self.get_key_pos(key.as_slice());
292 |         let entry_len = key.len() + value.len();
293 | 
294 |         self.entries.insert(
295 |             pos,
296 |             (
297 |                 key,
298 |                 MemtableEntry::Value {
299 |                     value,
300 |                     seq_number: self.next_seq_number,
301 |                 },
302 |             ),
303 |         );
304 | 
305 |         self.size += entry_len;
306 |         self.next_seq_number += 1;
307 |     }
308 | 
309 |     #[tracing::instrument(skip(self, key))]
310 |     pub fn delete(&mut self, key: Vec<u8>) {
311 |         let pos = self.get_key_pos(key.as_slice());
312 |         let entry_len = key.len();
313 | 
314 |         self.entries.insert(
315 |             pos,
316 |             (
317 |                 key,
318 |                 MemtableEntry::Deletion {
319 |                     seq_number: self.next_seq_number,
320 |                 },
321 |             ),
322 |         );
323 | 
324 |         self.size += entry_len;
325 |         self.next_seq_number += 1;
326 |     }
327 | 
328 |     #[inline]
329 |     pub fn is_full(&self, params: &Params) -> bool {
330 |         self.size >= params.max_memtable_size
331 |     }
332 | 
333 |     //FIXME avoid this copy somehow without breaking seek consistency
334 |     pub fn get_entries(&self) -> Vec<(Key, MemtableEntry)> {
335 |         self.entries.clone()
336 |     }
337 | }
338 | 
#[cfg(test)]
mod tests {
    use super::*;

    /// Two distinct keys can be written and read back independently
    #[test]
    fn get_put() {
        let mut table = Memtable::new(1);

        let (first_key, first_val) = (vec![5, 2, 4], vec![5, 1]);
        let (second_key, second_val) = (vec![3, 8, 1], vec![1, 8]);

        table.put(first_key.clone(), first_val.clone());
        table.put(second_key.clone(), second_val.clone());

        let first = table.get(&first_key).unwrap();
        assert_eq!(first.get_value().unwrap(), first_val.as_slice());

        let second = table.get(&second_key).unwrap();
        assert_eq!(second.get_value().unwrap(), second_val.as_slice());
    }

    /// A deletion replaces the value but keeps an entry for the key
    #[test]
    fn delete() {
        let mut table = Memtable::new(1);
        assert_eq!(table.entries.len(), 0);

        let key = vec![5, 2, 4];

        table.put(key.clone(), vec![5, 1]);
        table.delete(key.clone());

        assert_eq!(table.entries.len(), 1);

        let entry = table.get(&key).unwrap();
        assert_eq!(entry.get_value(), None);
    }

    /// Writing the same key twice keeps only the latest value
    #[test]
    fn override_entry() {
        let mut table = Memtable::new(1);

        let key = vec![5, 2, 4];
        let old_val = vec![5, 1];
        let new_val = vec![1, 8];

        table.put(key.clone(), old_val);
        table.put(key.clone(), new_val.clone());

        let entry = table.get(&key).unwrap();
        assert_eq!(entry.get_value().unwrap(), new_val.as_slice());
    }
}
391 | 


--------------------------------------------------------------------------------
/src/params.rs:
--------------------------------------------------------------------------------
 1 | use std::path::{Path, PathBuf};
 2 | 
 3 | /// Parameters to customize the creation of the database
 4 | #[derive(Debug, Clone)]
 5 | pub struct Params {
 6 |     /// Where in the filesystem should the database be stored?
 7 |     pub db_path: PathBuf,
 8 |     /// Maximum size of a memtable (keys+values),
 9 |     /// This indirectly also defines how large a value block can be
10 |     pub max_memtable_size: usize,
11 |     /// How many levels does this store have (default: 5)
12 |     pub num_levels: usize,
13 |     /// How many open files should be held in memory?
14 |     pub max_open_files: usize,
15 |     /// Maximum number of entries in a key block
16 |     pub max_key_block_size: usize,
17 |     /// How often should the full key be stored in a data block?
18 |     /// Larger numbers result in smaller on-disk files, but seeks will be slower
19 |     pub block_restart_interval: u32,
20 |     /// Write the size of each level to a csv file
21 |     pub log_level_stats: Option<String>,
22 |     /// How many concurrent compaction tasks should there be
23 |     pub compaction_concurrency: usize,
24 |     /// How many seeks (per kb) before compaction is triggered?
25 |     pub seek_based_compaction: Option<u32>,
26 | }
27 | 
28 | impl Default for Params {
29 |     fn default() -> Self {
30 |         Self {
31 |             db_path: Path::new("./storage.lsm").to_path_buf(),
32 |             max_memtable_size: 5 * 1024 * 1024,
33 |             num_levels: 5,
34 |             max_open_files: 1_000_000,
35 |             max_key_block_size: 512,
36 |             block_restart_interval: 16,
37 |             log_level_stats: None,
38 |             compaction_concurrency: 4,
39 |             seek_based_compaction: Some(10),
40 |         }
41 |     }
42 | }
43 | 


--------------------------------------------------------------------------------
/src/sorted_table/builder.rs:
--------------------------------------------------------------------------------
  1 | use std::sync::Arc;
  2 | use std::sync::atomic::{AtomicBool, AtomicI32};
  3 | 
  4 | use crate::data_blocks::{DataBlockBuilder, DataBlockId, DataBlocks, PrefixedKey};
  5 | use crate::index_blocks::IndexBlock;
  6 | use crate::manifest::SeqNumber;
  7 | use crate::{Error, Key, Params, WriteOp};
  8 | 
  9 | #[cfg(feature = "wisckey")]
 10 | use crate::values::ValueId;
 11 | 
 12 | use super::{SortedTable, TableId};
 13 | 
/// Helper class to construct a table
/// only used during compaction
pub struct TableBuilder<'a> {
    /// Identifier of the table being built
    identifier: TableId,
    /// Database parameters (block size limit, restart interval, ...)
    params: &'a Params,
    /// Storage that the finished data blocks are handed to
    data_blocks: Arc<DataBlocks>,
    /// Minimum key of the table; passed on to the index block
    min_key: Key,
    /// Maximum key of the table; passed on to the index block
    max_key: Key,

    /// The data block that is currently being filled
    data_block: DataBlockBuilder,
    /// For every finished block: the first key added to it and the block's id
    block_index: Vec<(Key, DataBlockId)>,
    /// The previously added key; used to compute the shared key prefix
    last_key: Key,
    /// Number of entries in the current data block
    block_entry_count: usize,
    /// Sum of the sizes of all finished data blocks
    size: u64,
    /// Number of entries added since the key prefix was last reset
    restart_count: u32,
    /// First key added to the current data block, if any
    index_key: Option<Key>,
}
 31 | 
 32 | impl<'a> TableBuilder<'a> {
 33 |     #[tracing::instrument(skip(params, data_blocks, min_key, max_key))]
 34 |     pub fn new(
 35 |         identifier: TableId,
 36 |         params: &'a Params,
 37 |         data_blocks: Arc<DataBlocks>,
 38 |         min_key: Key,
 39 |         max_key: Key,
 40 |     ) -> TableBuilder<'a> {
 41 |         let block_index = vec![];
 42 |         let last_key = vec![];
 43 |         let block_entry_count = 0;
 44 |         let size = 0;
 45 |         let restart_count = 0;
 46 |         let index_key = None;
 47 |         let data_block = DataBlocks::build_block(data_blocks.clone());
 48 | 
 49 |         Self {
 50 |             identifier,
 51 |             params,
 52 |             data_blocks,
 53 |             block_index,
 54 |             data_block,
 55 |             last_key,
 56 |             block_entry_count,
 57 |             size,
 58 |             restart_count,
 59 |             index_key,
 60 |             min_key,
 61 |             max_key,
 62 |         }
 63 |     }
 64 | 
 65 |     #[cfg(feature = "wisckey")]
 66 |     #[tracing::instrument(skip(self, key, seq_number, value_ref))]
 67 |     pub async fn add_value(
 68 |         &mut self,
 69 |         key: &[u8],
 70 |         seq_number: SeqNumber,
 71 |         value_ref: ValueId,
 72 |     ) -> Result<(), Error> {
 73 |         self.add_entry(key, seq_number, WriteOp::PUT_OP, value_ref)
 74 |             .await
 75 |     }
 76 | 
 77 |     #[cfg(feature = "wisckey")]
 78 |     #[tracing::instrument(skip(self, key, seq_number))]
 79 |     pub async fn add_deletion(&mut self, key: &[u8], seq_number: SeqNumber) -> Result<(), Error> {
 80 |         self.add_entry(key, seq_number, WriteOp::DELETE_OP, ValueId::default())
 81 |             .await
 82 |     }
 83 | 
 84 |     #[cfg(not(feature = "wisckey"))]
 85 |     #[tracing::instrument(skip(self, key, seq_number, value))]
 86 |     pub async fn add_value(
 87 |         &mut self,
 88 |         key: &[u8],
 89 |         seq_number: SeqNumber,
 90 |         value: &[u8],
 91 |     ) -> Result<(), Error> {
 92 |         self.add_entry(key, seq_number, WriteOp::PUT_OP, value)
 93 |             .await
 94 |     }
 95 | 
 96 |     #[cfg(not(feature = "wisckey"))]
 97 |     #[tracing::instrument(skip(self, key, seq_number))]
 98 |     pub async fn add_deletion(&mut self, key: &[u8], seq_number: SeqNumber) -> Result<(), Error> {
 99 |         self.add_entry(key, seq_number, WriteOp::DELETE_OP, &[])
100 |             .await
101 |     }
102 | 
103 |     async fn add_entry(
104 |         &mut self,
105 |         key: &[u8],
106 |         seq_number: SeqNumber,
107 |         op_type: u8,
108 |         #[cfg(feature = "wisckey")] value: ValueId,
109 |         #[cfg(not(feature = "wisckey"))] value: &[u8],
110 |     ) -> Result<(), Error> {
111 |         if self.index_key.is_none() {
112 |             self.index_key = Some(key.to_vec());
113 |         }
114 |         let mut prefix_len = 0;
115 | 
116 |         // After a certain interval we reset the prefixed keys
117 |         // So that it is possible to binary search blocks
118 |         if self.restart_count == self.params.block_restart_interval {
119 |             self.restart_count = 0;
120 |         } else {
121 |             // Calculate key prefix length
122 |             while prefix_len < key.len()
123 |                 && prefix_len < self.last_key.len()
124 |                 && key[prefix_len] == self.last_key[prefix_len]
125 |             {
126 |                 prefix_len += 1;
127 |             }
128 |         }
129 | 
130 |         let suffix = key[prefix_len..].to_vec();
131 | 
132 |         self.block_entry_count += 1;
133 |         self.restart_count += 1;
134 | 
135 |         let pkey = PrefixedKey::new(prefix_len, suffix);
136 | 
137 |         self.last_key = key.to_vec();
138 | 
139 |         self.data_block
140 |             .add_entry(pkey, key, seq_number, op_type, value);
141 | 
142 |         if self.block_entry_count >= self.params.max_key_block_size {
143 |             self.size += self.data_block.current_size() as u64;
144 | 
145 |             let mut next_block = DataBlocks::build_block(self.data_blocks.clone());
146 |             std::mem::swap(&mut next_block, &mut self.data_block);
147 | 
148 |             let id = next_block.finish().await?.unwrap();
149 |             self.block_index.push((self.index_key.take().unwrap(), id));
150 |             self.block_entry_count = 0;
151 |             self.restart_count = 0;
152 |             self.last_key.clear();
153 |         }
154 | 
155 |         Ok(())
156 |     }
157 | 
158 |     #[tracing::instrument(skip(self))]
159 |     pub async fn finish(mut self) -> Result<SortedTable, Error> {
160 |         let block_size = self.data_block.current_size();
161 | 
162 |         // Block will only be created if it contained entries
163 |         if let Some(id) = self.data_block.finish().await? {
164 |             self.size += block_size as u64;
165 |             self.block_index.push((self.index_key.take().unwrap(), id));
166 |         }
167 | 
168 |         log::debug!("Created new table with {} blocks", self.block_index.len());
169 | 
170 |         let index = IndexBlock::new(
171 |             self.params,
172 |             self.identifier,
173 |             self.block_index,
174 |             self.size,
175 |             self.min_key,
176 |             self.max_key,
177 |         )
178 |         .await?;
179 | 
180 |         let allowed_seeks = if let Some(count) = self.params.seek_based_compaction {
181 |             ((index.get_size() / 1024).max(1) as i32) * (count as i32)
182 |         } else {
183 |             0
184 |         };
185 | 
186 |         Ok(SortedTable {
187 |             index,
188 |             allowed_seeks: AtomicI32::new(allowed_seeks),
189 |             identifier: self.identifier,
190 |             compaction_flag: AtomicBool::new(false),
191 |             data_blocks: self.data_blocks,
192 |         })
193 |     }
194 | }
195 | 


--------------------------------------------------------------------------------
/src/sorted_table/iterator.rs:
--------------------------------------------------------------------------------
  1 | use std::cmp::Ordering;
  2 | use std::sync::Arc;
  3 | 
  4 | use async_trait::async_trait;
  5 | 
  6 | use crate::data_blocks::{DataBlock, DataEntry, DataEntryType};
  7 | use crate::manifest::SeqNumber;
  8 | use crate::{EntryRef, Key};
  9 | 
 10 | use super::SortedTable;
 11 | 
 12 | #[cfg(feature = "wisckey")]
 13 | use crate::values::{ValueId, ValueLog};
 14 | 
/// Common interface of the iterators over the database's internal data
/// structures (e.g., memtables and sorted tables).
///
/// An iterator points at one entry at a time; `step()` moves it to the next.
#[cfg_attr(feature="_async-io", async_trait(?Send))]
#[cfg_attr(not(feature = "_async-io"), async_trait)]
pub trait InternalIterator: Send {
    /// Has the iterator moved past its last entry?
    fn at_end(&self) -> bool;
    /// Moves the iterator to its next entry.
    /// NOTE(review): the implementations in this crate panic when this is
    /// called on an iterator that is already at its end.
    async fn step(&mut self);

    /// Returns None if this refers to a deletion
    #[cfg(feature = "wisckey")]
    async fn get_entry(&self, value_log: &ValueLog) -> Option<EntryRef>;
    /// Returns None if this refers to a deletion
    #[cfg(not(feature = "wisckey"))]
    fn get_entry(&self) -> Option<EntryRef>;

    /// The key of the current entry
    fn get_key(&self) -> &[u8];
    /// The sequence number of the current entry
    fn get_seq_number(&self) -> SeqNumber;
    /// Is the current entry a write or a deletion?
    fn get_entry_type(&self) -> DataEntryType;
}
 31 | 
/// Returns the entries within a table in order
/// (or in reverse order, if `reverse` is set)
pub struct TableIterator {
    /// Position (within the table's index) of the data block that the next
    /// `step()` reads from. Moved below -1 (reverse) or above the number of
    /// blocks (forward) once the iterator is at its end.
    block_pos: i64,
    /// Where in that block the next entry is located: a byte offset when
    /// iterating forward, an entry index when iterating in reverse.
    block_offset: u32,
    /// Key of the current entry
    key: Key,
    /// The current entry
    entry: DataEntry,
    /// The table being iterated
    table: Arc<SortedTable>,
    /// Iterate from the last entry to the first?
    reverse: bool,
}
 41 | 
 42 | impl TableIterator {
 43 |     pub async fn new(table: Arc<SortedTable>, reverse: bool) -> Self {
 44 |         let last_key = vec![];
 45 | 
 46 |         if reverse {
 47 |             let num_blocks = table.index.num_data_blocks() as i64;
 48 |             assert!(num_blocks > 0); // tables must have at least one data block
 49 | 
 50 |             let block_id = table.index.get_block_id((num_blocks - 1) as usize);
 51 |             let first_block = table.data_blocks.get_block(&block_id).await;
 52 | 
 53 |             let len = first_block.get_num_entries();
 54 |             assert!(len > 0);
 55 |             let (key, entry) = DataBlock::get_entry_at_index(&first_block, len - 1);
 56 | 
 57 |             // Are we already at the end of the first block?
 58 |             let (block_pos, block_offset) = if len == 1 {
 59 |                 let next_pos = num_blocks - 2;
 60 |                 if next_pos >= 0 {
 61 |                     let block_id = table.index.get_block_id(next_pos as usize);
 62 |                     let next_block = table.data_blocks.get_block(&block_id).await;
 63 |                     let len = next_block.get_num_entries();
 64 |                     assert!(len > 0);
 65 |                     (next_pos, len - 1)
 66 |                 } else {
 67 |                     (next_pos, 0)
 68 |                 }
 69 |             } else {
 70 |                 (num_blocks - 1, len - 2)
 71 |             };
 72 | 
 73 |             Self {
 74 |                 block_pos,
 75 |                 block_offset,
 76 |                 key,
 77 |                 entry,
 78 |                 table,
 79 |                 reverse,
 80 |             }
 81 |         } else {
 82 |             let block_id = table.index.get_block_id(0);
 83 |             let first_block = table.data_blocks.get_block(&block_id).await;
 84 |             let byte_len = first_block.byte_len();
 85 |             let (key, entry) = DataBlock::get_entry_at_offset(first_block, 0, &last_key);
 86 | 
 87 |             let next_offset = entry.get_next_offset();
 88 | 
 89 |             // Are we already at the end of the first block?
 90 |             let (block_pos, block_offset) = if byte_len == next_offset {
 91 |                 (1, 0)
 92 |             } else {
 93 |                 (0, next_offset)
 94 |             };
 95 | 
 96 |             Self {
 97 |                 block_pos,
 98 |                 block_offset,
 99 |                 key,
100 |                 entry,
101 |                 table,
102 |                 reverse,
103 |             }
104 |         }
105 |     }
106 | 
107 |     #[cfg(feature = "wisckey")]
108 |     pub fn get_value_id(&self) -> Option<ValueId> {
109 |         self.entry.get_value_id()
110 |     }
111 | }
112 | 
113 | #[cfg_attr(feature="_async-io", async_trait(?Send))]
114 | #[cfg_attr(not(feature = "_async-io"), async_trait)]
115 | impl InternalIterator for TableIterator {
116 |     fn at_end(&self) -> bool {
117 |         if self.reverse {
118 |             self.block_pos < -1
119 |         } else {
120 |             self.block_pos > self.table.index.num_data_blocks() as i64
121 |         }
122 |     }
123 | 
124 |     fn get_key(&self) -> &[u8] {
125 |         &self.key
126 |     }
127 | 
128 |     fn get_seq_number(&self) -> SeqNumber {
129 |         self.entry.get_sequence_number()
130 |     }
131 | 
132 |     #[cfg(feature = "wisckey")]
133 |     async fn get_entry(&self, value_log: &ValueLog) -> Option<EntryRef> {
134 |         match self.entry.get_type() {
135 |             DataEntryType::Put => Some(EntryRef::SortedTable {
136 |                 value_ref: value_log
137 |                     .get_ref(self.entry.get_value_id().unwrap())
138 |                     .await
139 |                     .expect("No such value?"),
140 |                 entry: self.entry.clone(),
141 |             }),
142 |             DataEntryType::Delete => None,
143 |         }
144 |     }
145 | 
146 |     #[cfg(not(feature = "wisckey"))]
147 |     fn get_entry(&self) -> Option<EntryRef> {
148 |         match self.entry.get_type() {
149 |             DataEntryType::Put => Some(EntryRef::SortedTable {
150 |                 entry: self.entry.clone(),
151 |             }),
152 |             DataEntryType::Delete => None,
153 |         }
154 |     }
155 | 
156 |     fn get_entry_type(&self) -> DataEntryType {
157 |         self.entry.get_type()
158 |     }
159 | 
160 |     #[tracing::instrument(skip(self))]
161 |     async fn step(&mut self) {
162 |         if self.reverse {
163 |             match self.block_pos.cmp(&(-1)) {
164 |                 Ordering::Less => {
165 |                     panic!("Cannot step(); already at end");
166 |                 }
167 |                 Ordering::Equal => self.block_pos -= 1,
168 |                 Ordering::Greater => {
169 |                     let block_id = self.table.index.get_block_id(self.block_pos as usize);
170 |                     let block = self.table.data_blocks.get_block(&block_id).await;
171 | 
172 |                     let (key, entry) = DataBlock::get_entry_at_index(&block, self.block_offset);
173 | 
174 |                     self.key = key;
175 |                     self.entry = entry;
176 | 
177 |                     // At the end of the block?
178 |                     if self.block_offset == 0 {
179 |                         self.block_pos -= 1;
180 | 
181 |                         if self.block_pos >= 0 {
182 |                             let block_id = self.table.index.get_block_id(self.block_pos as usize);
183 |                             let block = self.table.data_blocks.get_block(&block_id).await;
184 |                             self.block_offset = block.get_num_entries() - 1;
185 |                         } else {
186 |                             self.block_offset = 0;
187 |                         }
188 |                     } else {
189 |                         self.block_offset -= 1;
190 |                     }
191 |                 }
192 |             }
193 |         } else {
194 |             let num_blocks = self.table.index.num_data_blocks() as i64;
195 |             match self.block_pos.cmp(&num_blocks) {
196 |                 Ordering::Equal => {
197 |                     self.block_pos += 1;
198 |                     return;
199 |                 }
200 |                 Ordering::Greater => {
201 |                     panic!("Cannot step(); already at end");
202 |                 }
203 |                 Ordering::Less => {
204 |                     let block_id = self.table.index.get_block_id(self.block_pos as usize);
205 |                     let block = self.table.data_blocks.get_block(&block_id).await;
206 |                     let byte_len = block.byte_len();
207 | 
208 |                     let (key, entry) =
209 |                         DataBlock::get_entry_at_offset(block, self.block_offset, &self.key);
210 | 
211 |                     let next_offset = entry.get_next_offset();
212 | 
213 |                     self.key = key;
214 |                     self.entry = entry;
215 | 
216 |                     // At the end of the block?
217 |                     if next_offset >= byte_len {
218 |                         self.block_pos += 1;
219 |                         self.block_offset = 0;
220 |                     } else {
221 |                         self.block_offset = next_offset;
222 |                     }
223 |                 }
224 |             }
225 |         }
226 |     }
227 | }
228 | 


--------------------------------------------------------------------------------
/src/sorted_table/mod.rs:
--------------------------------------------------------------------------------
  1 | use std::sync::Arc;
  2 | use std::sync::atomic::{AtomicBool, AtomicI32, Ordering as AtomicOrdering};
  3 | 
  4 | use crate::data_blocks::{DataBlock, DataBlocks, DataEntry};
  5 | use crate::index_blocks::IndexBlock;
  6 | use crate::{Error, Params};
  7 | 
  8 | mod iterator;
  9 | pub use iterator::{InternalIterator, TableIterator};
 10 | 
 11 | mod builder;
 12 | pub use builder::TableBuilder;
 13 | 
 14 | #[cfg(test)]
 15 | mod tests;
 16 | 
/// Identifier of a sorted table
pub type TableId = u64;
 18 | 
/// The entries of each level are grouped into sorted tables.
/// These tables contain an ordered set of key/value-pairs
///
/// Except for level 0, sorted tables do not overlap others on the same level
pub struct SortedTable {
    /// Identifier of this table
    identifier: TableId,
    /// The index of the table; it holds all relevant metadata
    index: IndexBlock,
    /// Storage from which the table's data blocks are fetched
    data_blocks: Arc<DataBlocks>,
    /// Is this table currently being compacted
    compaction_flag: AtomicBool,
    /// The number of seek operations on this table before compaction is triggered
    /// This improves read performance for heavily queried keys
    allowed_seeks: AtomicI32,
}
 34 | 
 35 | impl SortedTable {
 36 |     pub async fn load(
 37 |         identifier: TableId,
 38 |         data_blocks: Arc<DataBlocks>,
 39 |         params: &Params,
 40 |     ) -> Result<Self, Error> {
 41 |         let index = IndexBlock::load(params, identifier).await?;
 42 | 
 43 |         let allowed_seeks = if let Some(count) = params.seek_based_compaction {
 44 |             ((index.get_size() / 1024).max(1) as i32) * (count as i32)
 45 |         } else {
 46 |             0
 47 |         };
 48 | 
 49 |         Ok(Self {
 50 |             identifier,
 51 |             index,
 52 |             data_blocks,
 53 |             allowed_seeks: AtomicI32::new(allowed_seeks),
 54 |             compaction_flag: AtomicBool::new(false),
 55 |         })
 56 |     }
 57 | 
 58 |     /// Checks if seek-based compaction should be triggered for this table
 59 |     pub fn has_maximum_seeks(&self) -> bool {
 60 |         self.allowed_seeks.load(AtomicOrdering::SeqCst) <= 0
 61 |     }
 62 | 
 63 |     /// Returns false if another task is already compacting this table
 64 |     pub fn maybe_start_compaction(&self) -> bool {
 65 |         let order = AtomicOrdering::SeqCst;
 66 |         let result = self
 67 |             .compaction_flag
 68 |             .compare_exchange(false, true, order, order);
 69 | 
 70 |         result.is_ok()
 71 |     }
 72 | 
 73 |     /// Compaction has failed, e.g., due to lock contention
 74 |     /// Remove the compaction flag
 75 |     pub fn abort_compaction(&self) {
 76 |         let prev = self.compaction_flag.swap(false, AtomicOrdering::SeqCst);
 77 |         assert!(prev, "Compaction flag was not set!");
 78 |     }
 79 | 
 80 |     /// The table has moved to a new level during compaction and will be
 81 |     /// reused. Remove the compaction marker.
 82 |     pub fn finish_fast_compaction(&self) {
 83 |         let prev = self.compaction_flag.swap(false, AtomicOrdering::SeqCst);
 84 |         assert!(prev, "Compaction flag was not set!");
 85 |     }
 86 | 
 87 |     pub fn get_id(&self) -> TableId {
 88 |         self.identifier
 89 |     }
 90 | 
 91 |     /// Get the size of this table (in bytes)
 92 |     pub fn get_size(&self) -> usize {
 93 |         self.index.get_size()
 94 |     }
 95 | 
 96 |     /// Get the minimum key of this table
 97 |     pub fn get_min(&self) -> &[u8] {
 98 |         self.index.get_min()
 99 |     }
100 | 
101 |     /// Get the maximum key of this table
102 |     pub fn get_max(&self) -> &[u8] {
103 |         self.index.get_max()
104 |     }
105 | 
106 |     /// Gets an entry for particular key in this table
107 |     /// Returns None if no entry for the key exists
108 |     #[tracing::instrument(skip(self, key))]
109 |     pub async fn get(&self, key: &[u8]) -> Option<DataEntry> {
110 |         self.allowed_seeks.fetch_sub(1, AtomicOrdering::Relaxed);
111 | 
112 |         let block_id = self.index.binary_search(key)?;
113 |         let block = self.data_blocks.get_block(&block_id).await;
114 | 
115 |         DataBlock::get_by_key(&block, key)
116 |     }
117 | 
118 |     /// Check if this table overlaps with the specified range
119 |     ///
120 |     /// min and max are both inclusive
121 |     #[inline(always)]
122 |     pub fn overlaps(&self, min: &[u8], max: &[u8]) -> bool {
123 |         self.get_max() >= min && self.get_min() <= max
124 |     }
125 | }
126 | 


--------------------------------------------------------------------------------
/src/sorted_table/tests.rs:
--------------------------------------------------------------------------------
  1 | use super::*;
  2 | 
  3 | use crate::manifest::Manifest;
  4 | 
  5 | use tempfile::tempdir;
  6 | 
  7 | #[cfg(feature = "tokio-uring")]
  8 | use kioto_uring_executor::test as async_test;
  9 | 
 10 | #[cfg(feature = "monoio")]
 11 | use monoio::test as async_test;
 12 | 
 13 | #[cfg(not(feature = "_async-io"))]
 14 | use tokio::test as async_test;
 15 | 
 16 | #[cfg(feature = "wisckey")]
 17 | #[async_test]
 18 | async fn iterate() {
 19 |     let dir = tempdir().unwrap();
 20 |     let params = Params {
 21 |         db_path: dir.path().to_path_buf(),
 22 |         ..Default::default()
 23 |     };
 24 | 
 25 |     let params = Arc::new(params);
 26 |     let manifest = Arc::new(Manifest::new(params.clone()).await);
 27 | 
 28 |     let data_blocks = Arc::new(DataBlocks::new(params.clone(), manifest));
 29 | 
 30 |     let key1 = vec![5];
 31 |     let key2 = vec![15];
 32 | 
 33 |     let value_id1 = (4, 2);
 34 |     let value_id2 = (4, 50);
 35 | 
 36 |     let id = 124234;
 37 |     let mut builder = TableBuilder::new(id, &params, data_blocks, key1.clone(), key2.clone());
 38 | 
 39 |     builder.add_value(&key1, 1, value_id1).await.unwrap();
 40 | 
 41 |     builder.add_value(&key2, 4, value_id2).await.unwrap();
 42 | 
 43 |     let table = builder.finish().await.unwrap();
 44 | 
 45 |     let mut iter = TableIterator::new(Arc::new(table), false).await;
 46 | 
 47 |     assert!(!iter.at_end());
 48 |     assert_eq!(iter.get_key(), &key1);
 49 |     assert_eq!(iter.get_value_id(), Some(value_id1));
 50 | 
 51 |     iter.step().await;
 52 | 
 53 |     assert!(!iter.at_end());
 54 |     assert_eq!(iter.get_key(), &key2);
 55 |     assert_eq!(iter.get_value_id(), Some(value_id2));
 56 | 
 57 |     iter.step().await;
 58 | 
 59 |     assert!(iter.at_end());
 60 | }
 61 | 
 62 | #[cfg(not(feature = "wisckey"))]
 63 | #[async_test]
 64 | async fn iterate() {
 65 |     let dir = tempdir().unwrap();
 66 |     let params = Params {
 67 |         db_path: dir.path().to_path_buf(),
 68 |         ..Default::default()
 69 |     };
 70 | 
 71 |     let params = Arc::new(params);
 72 |     let manifest = Arc::new(Manifest::new(params.clone()).await);
 73 | 
 74 |     let data_blocks = Arc::new(DataBlocks::new(params.clone(), manifest));
 75 | 
 76 |     let key1 = vec![5, 10, 3];
 77 |     let key2 = vec![15, 10, 3];
 78 | 
 79 |     let value1 = vec![4, 2];
 80 |     let value2 = vec![4, 50];
 81 | 
 82 |     let id = 124234;
 83 |     let mut builder = TableBuilder::new(id, &params, data_blocks, key1.clone(), key2.clone());
 84 | 
 85 |     builder.add_value(&key1, 1, &value1).await.unwrap();
 86 | 
 87 |     builder.add_value(&key2, 4, &value2).await.unwrap();
 88 | 
 89 |     let table = Arc::new(builder.finish().await.unwrap());
 90 | 
 91 |     let mut iter = TableIterator::new(table, false).await;
 92 | 
 93 |     assert!(!iter.at_end());
 94 |     assert_eq!(iter.get_key(), &key1);
 95 |     assert_eq!(iter.get_entry().unwrap().get_value(), &value1);
 96 | 
 97 |     iter.step().await;
 98 | 
 99 |     assert!(!iter.at_end());
100 |     assert_eq!(iter.get_key(), &key2);
101 |     assert_eq!(iter.get_entry().unwrap().get_value(), &value2);
102 | 
103 |     iter.step().await;
104 | 
105 |     assert!(iter.at_end());
106 | }
107 | 
108 | #[cfg(not(feature = "wisckey"))]
109 | #[async_test]
110 | async fn reverse_iterate() {
111 |     let dir = tempdir().unwrap();
112 |     let params = Params {
113 |         db_path: dir.path().to_path_buf(),
114 |         ..Default::default()
115 |     };
116 | 
117 |     let params = Arc::new(params);
118 |     let manifest = Arc::new(Manifest::new(params.clone()).await);
119 | 
120 |     let data_blocks = Arc::new(DataBlocks::new(params.clone(), manifest));
121 | 
122 |     let key1 = vec![5, 10, 3];
123 |     let key2 = vec![15, 10, 3];
124 | 
125 |     let value1 = vec![4, 2];
126 |     let value2 = vec![4, 50];
127 | 
128 |     let id = 124234;
129 |     let mut builder = TableBuilder::new(id, &params, data_blocks, key1.clone(), key2.clone());
130 | 
131 |     builder.add_value(&key1, 1, &value1).await.unwrap();
132 | 
133 |     builder.add_value(&key2, 4, &value2).await.unwrap();
134 | 
135 |     let table = Arc::new(builder.finish().await.unwrap());
136 | 
137 |     let mut iter = TableIterator::new(table, true).await;
138 | 
139 |     assert!(!iter.at_end());
140 |     assert_eq!(iter.get_key(), &key2);
141 |     assert_eq!(iter.get_entry().unwrap().get_value(), &value2);
142 | 
143 |     iter.step().await;
144 | 
145 |     assert!(!iter.at_end());
146 |     assert_eq!(iter.get_key(), &key1);
147 |     assert_eq!(iter.get_entry().unwrap().get_value(), &value1);
148 | 
149 |     iter.step().await;
150 | 
151 |     assert!(iter.at_end());
152 | }
153 | 
/// Fills a table with `COUNT` value references and checks that a forward
/// iterator returns every key, value id, and sequence number in order
#[cfg(feature = "wisckey")]
#[async_test]
async fn iterate_many() {
    const COUNT: u32 = 5_000;

    let dir = tempdir().unwrap();
    let params = Arc::new(Params {
        db_path: dir.path().to_path_buf(),
        ..Default::default()
    });

    let manifest = Arc::new(Manifest::new(params.clone()).await);
    let data_blocks = Arc::new(DataBlocks::new(params.clone(), manifest));

    let min_key = 0u32.to_le_bytes().to_vec();
    let max_key = COUNT.to_le_bytes().to_vec();

    let mut builder = TableBuilder::new(1, &params, data_blocks, min_key, max_key);

    for pos in 0..COUNT {
        let key = pos.to_le_bytes().to_vec();
        let seq_num = (500 + pos) as u64;
        let value_id = (100, pos);

        builder.add_value(&key, seq_num, value_id).await.unwrap();
    }

    let table = Arc::new(builder.finish().await.unwrap());
    let mut iter = TableIterator::new(table, false).await;

    for pos in 0..COUNT {
        assert!(!iter.at_end());

        let expected_key = pos.to_le_bytes().to_vec();
        assert_eq!(iter.get_key(), &expected_key);
        assert_eq!(iter.get_value_id(), Some((100, pos)));
        assert_eq!(iter.get_seq_number(), 500 + pos as u64);

        iter.step().await;
    }

    assert!(iter.at_end());
}
199 | 


--------------------------------------------------------------------------------
/src/tasks.rs:
--------------------------------------------------------------------------------
  1 | use std::collections::HashMap;
  2 | use std::sync::Arc;
  3 | use std::sync::atomic::{AtomicBool, Ordering};
  4 | use std::time::Instant;
  5 | 
  6 | use parking_lot::RwLock;
  7 | 
  8 | use tokio::sync::Notify;
  9 | 
 10 | use crate::Error;
 11 | use crate::logic::DbLogic;
 12 | 
 13 | use async_trait::async_trait;
 14 | 
/// A unit of background work that is run repeatedly by a task's work loop.
///
/// `run` returns `Ok(true)` if it did some work and `Ok(false)` if there
/// was nothing to do.
///
/// This variant is used with the `_async-io` feature, where the returned
/// future is not required to be `Send`.
#[cfg(feature = "_async-io")]
#[async_trait(?Send)]
pub trait Task {
    async fn run(&self) -> Result<bool, Error>;
}

/// A unit of background work that is run repeatedly by a task's work loop.
///
/// `run` returns `Ok(true)` if it did some work and `Ok(false)` if there
/// was nothing to do.
///
/// This variant is used without the `_async-io` feature; implementors must
/// be `Sync + Send`.
#[cfg(not(feature = "_async-io"))]
#[async_trait]
pub trait Task: Sync + Send {
    async fn run(&self) -> Result<bool, Error>;
}
 26 | 
/// Identifies a group of background tasks managed by the `TaskManager`
#[derive(Debug, PartialEq, Eq, Hash)]
pub enum TaskType {
    /// The task running memtable compaction
    MemtableCompaction,
    /// The tasks running level compaction
    LevelCompaction,
}
 32 | 
/// State owned by a single spawned background task
struct TaskHandle {
    /// Set to true once the task should stop; shared with the `TaskManager`
    stop_flag: Arc<AtomicBool>,
    /// The work to run in every iteration of the work loop
    task: Box<dyn Task>,
    /// Condition this task waits on while it is idle
    update_cond: Arc<UpdateCond>,
}
 38 | 
/// This structure manages background tasks
/// Currently there is only compaction, but there might be more in the future
pub struct TaskManager {
    /// Shared with every spawned task; true means the tasks should stop
    stop_flag: Arc<AtomicBool>,
    /// One task group per task type
    tasks: HashMap<TaskType, TaskGroup>,
}
 45 | 
/// Holds a group of tasks that do the same thing
/// e.g., all compaction tasks
struct TaskGroup {
    /// Condition shared by all tasks of this group; used to wake them up
    condition: Arc<UpdateCond>,
}
 51 | 
/// Keeps track of a condition variable shared within a task group
struct UpdateCond {
    /// The time of the most recent call to `wake_up`
    last_change: RwLock<Instant>,
    /// Used to notify waiting tasks
    condition: Notify,
}
 57 | 
/// Background task that runs memtable compaction
struct MemtableCompactionTask {
    datastore: Arc<DbLogic>,
    /// Condition of the level compaction group; woken up after this task did work
    level_update_cond: Arc<UpdateCond>,
}
 62 | 
/// Background task that runs level compaction
struct LevelCompactionTask {
    datastore: Arc<DbLogic>,
}
 66 | 
 67 | impl MemtableCompactionTask {
 68 |     fn new_boxed(datastore: Arc<DbLogic>, level_update_cond: Arc<UpdateCond>) -> Box<dyn Task> {
 69 |         Box::new(Self {
 70 |             datastore,
 71 |             level_update_cond,
 72 |         })
 73 |     }
 74 | }
 75 | 
 76 | impl LevelCompactionTask {
 77 |     fn new_boxed(datastore: Arc<DbLogic>) -> Box<dyn Task> {
 78 |         Box::new(Self { datastore })
 79 |     }
 80 | }
 81 | 
 82 | #[cfg_attr(feature="_async-io", async_trait(?Send))]
 83 | #[cfg_attr(not(feature = "_async-io"), async_trait)]
 84 | impl Task for MemtableCompactionTask {
 85 |     async fn run(&self) -> Result<bool, Error> {
 86 |         let did_work = self.datastore.do_memtable_compaction().await?;
 87 |         if did_work {
 88 |             self.level_update_cond.wake_up();
 89 |         }
 90 |         Ok(did_work)
 91 |     }
 92 | }
 93 | 
 94 | #[cfg_attr(feature="_async-io", async_trait(?Send))]
 95 | #[cfg_attr(not(feature = "_async-io"), async_trait)]
 96 | impl Task for LevelCompactionTask {
 97 |     async fn run(&self) -> Result<bool, Error> {
 98 |         Ok(self.datastore.do_level_compaction().await?)
 99 |     }
100 | }
101 | 
102 | impl UpdateCond {
103 |     fn new() -> Self {
104 |         Self {
105 |             last_change: RwLock::new(Instant::now()),
106 |             condition: Default::default(),
107 |         }
108 |     }
109 | 
110 |     /// Notify the task that there is new work to do
111 |     fn wake_up(&self) {
112 |         let mut last_change = self.last_change.write();
113 |         *last_change = Instant::now();
114 |         self.condition.notify_one();
115 |     }
116 | }
117 | 
impl TaskHandle {
    /// Bundles the shared stop flag, the group's condition, and the task to run
    fn new(stop_flag: Arc<AtomicBool>, update_cond: Arc<UpdateCond>, task: Box<dyn Task>) -> Self {
        Self {
            stop_flag,
            update_cond,
            task,
        }
    }

    /// True as long as the stop flag has not been set
    #[inline(always)]
    fn is_running(&self) -> bool {
        !self.stop_flag.load(Ordering::SeqCst)
    }

    /// Runs the task repeatedly until the stop flag is set.
    ///
    /// After an iteration that did no work, the loop waits until the
    /// condition reports a change newer than the last run (or the stop flag
    /// is set). Panics with "Task failed" if the task returns an error.
    async fn work_loop(&self) {
        log::trace!("Task work loop started");
        let mut last_update = Instant::now();

        // True if the previous iteration did not do any work
        let mut idle = false;

        loop {
            // Taken before waiting; becomes `last_update` after the task ran
            let now = Instant::now();

            loop {
                // Create the future before checking the state below
                let fut = self.update_cond.condition.notified();
                tokio::pin!(fut);

                {
                    let lchange = self.update_cond.last_change.read();

                    // Proceed if we should stop, if the last run did work,
                    // or if there was a wake-up since the last run
                    if !self.is_running() || !idle || *lchange > last_update {
                        break;
                    }

                    // wait for change to queue and retry
                    fut.as_mut().enable();
                }

                fut.await;
            }

            if !self.is_running() {
                break;
            }

            let did_work = self.task.run().await.expect("Task failed");
            last_update = now;

            if did_work {
                idle = false;
            } else {
                log::trace!("Task did not do any work");
                idle = true;
            }
        }

        log::trace!("Task work loop ended");
    }
}
178 | 
179 | impl TaskManager {
    /// Spawns the background tasks and returns a manager for them.
    ///
    /// One memtable compaction task and `num_compaction_tasks` level
    /// compaction tasks are spawned. All of them share a single stop flag;
    /// each task type has its own condition for wake-ups. The results of
    /// the spawn calls are not kept.
    ///
    /// Which executor is used depends on the enabled features
    /// (`tokio-uring`, `monoio`, or plain tokio).
    pub async fn new(datastore: Arc<DbLogic>, num_compaction_tasks: usize) -> Self {
        let mut tasks = HashMap::default();
        let stop_flag = Arc::new(AtomicBool::new(false));

        let memtable_update_cond = Arc::new(UpdateCond::new());
        let level_update_cond = Arc::new(UpdateCond::new());

        // Only used to spawn the level compaction tasks below
        #[cfg(feature = "tokio-uring")]
        let mut sring = kioto_uring_executor::new_spawn_ring();

        // Spawn the memtable compaction task
        {
            let stop_flag = stop_flag.clone();
            let memtable_update_cond = memtable_update_cond.clone();
            let datastore = datastore.clone();
            // The memtable task wakes up the level compaction group after it did work
            let level_update_cond = level_update_cond.clone();

            #[cfg(feature = "tokio-uring")]
            {
                kioto_uring_executor::spawn_with(move || {
                    let hdl = TaskHandle::new(
                        stop_flag,
                        memtable_update_cond,
                        MemtableCompactionTask::new_boxed(datastore, level_update_cond),
                    );
                    Box::pin(async move { hdl.work_loop().await })
                });
            }

            #[cfg(not(feature = "tokio-uring"))]
            {
                let hdl = TaskHandle::new(
                    stop_flag,
                    memtable_update_cond,
                    MemtableCompactionTask::new_boxed(datastore, level_update_cond),
                );

                cfg_if::cfg_if! {
                    if #[cfg(feature="monoio")] {
                        monoio::spawn(async move { hdl.work_loop().await });
                    } else {
                        tokio::spawn(async move { hdl.work_loop().await });
                    }
                }
            }
        }

        let task_group = TaskGroup {
            condition: memtable_update_cond,
        };

        tasks.insert(TaskType::MemtableCompaction, task_group);

        // Spawn the level compaction tasks; they all share one condition
        {
            for _ in 0..num_compaction_tasks {
                let stop_flag = stop_flag.clone();
                let level_update_cond = level_update_cond.clone();
                let datastore = datastore.clone();

                #[cfg(feature = "tokio-uring")]
                {
                    sring.spawn_with(move || {
                        let hdl = TaskHandle::new(
                            stop_flag,
                            level_update_cond,
                            LevelCompactionTask::new_boxed(datastore),
                        );
                        Box::pin(async move { hdl.work_loop().await })
                    });
                }

                #[cfg(not(feature = "tokio-uring"))]
                {
                    let hdl = TaskHandle::new(
                        stop_flag,
                        level_update_cond,
                        LevelCompactionTask::new_boxed(datastore),
                    );

                    cfg_if::cfg_if! {
                        if #[cfg(feature="monoio")] {
                            monoio::spawn(async move { hdl.work_loop().await });
                        } else {
                            tokio::spawn(async move { hdl.work_loop().await });
                        }
                    }
                }
            }

            let task_group = TaskGroup {
                condition: level_update_cond,
            };

            tasks.insert(TaskType::LevelCompaction, task_group);
        }

        Self { stop_flag, tasks }
    }
277 | 
278 |     #[tracing::instrument(skip(self))]
279 |     pub fn wake_up(&self, task_type: &TaskType) {
280 |         let task_group = self.tasks.get(task_type).expect("No such task");
281 |         task_group.condition.wake_up();
282 |     }
283 | 
284 |     pub fn terminate(&self) {
285 |         self.stop_flag.store(false, Ordering::SeqCst);
286 | 
287 |         for (_, task_group) in self.tasks.iter() {
288 |             task_group.condition.condition.notify_one();
289 |         }
290 |     }
291 | 
292 |     pub async fn stop_all(&self) -> Result<(), Error> {
293 |         log::trace!("Stopping all background tasks");
294 | 
295 |         self.stop_flag.store(true, Ordering::SeqCst);
296 | 
297 |         for (_, task_group) in self.tasks.iter() {
298 |             task_group.condition.condition.notify_waiters();
299 |         }
300 | 
301 |         Ok(())
302 |     }
303 | }
304 | 


--------------------------------------------------------------------------------
/src/values/batch.rs:
--------------------------------------------------------------------------------
  1 | use std::sync::Arc;
  2 | 
  3 | use crate::Error;
  4 | use crate::disk;
  5 | use crate::values::{ValueBatchId, ValueId, ValueLog, ValueOffset, ValueRef};
  6 | 
  7 | use zerocopy::{FromBytes, Immutable, IntoBytes, KnownLayout};
  8 | 
/**
 * The layout is as follows:
 *   - value batch header
 *   - offsets (padded to WORD_SIZE)
 *   - value entries
 *
 * NOTE(review): `ValueBatchBuilder::finish` currently writes only the header
 * followed by the value entries, and `get_value_data` skips only the header.
 * The offsets section does not appear to be serialized -- confirm whether it
 * is still intended.
 */
#[derive(Debug)]
pub(super) struct ValueBatch {
    /// The raw bytes of the batch, starting with the batch header
    data: Vec<u8>,
}
 19 | 
/// Collects values for a new batch and writes the batch to disk on `finish`
pub struct ValueBatchBuilder<'a> {
    /// The value log this batch will belong to
    vlog: &'a ValueLog,
    /// Identifier of the batch being built
    identifier: ValueBatchId,
    /// The locations of the values within this block
    /// (one `u32` per value, stored as raw bytes)
    offsets: Vec<u8>,
    /// The value data
    value_data: Vec<u8>,
}
 28 | 
/// Header stored at the very beginning of a value batch
#[derive(Debug, KnownLayout, Immutable, IntoBytes, FromBytes)]
#[repr(C, packed)]
pub(super) struct ValueBatchHeader {
    /// The number of values that were added to the batch
    pub num_values: u32,
}

/// Size of the batch header in bytes
pub const BATCH_HEADER_LEN: usize = std::mem::size_of::<ValueBatchHeader>();
 36 | 
/// Header stored in front of every entry in a value batch;
/// it is followed by the key bytes and then the value bytes
#[derive(Debug, KnownLayout, Immutable, IntoBytes, FromBytes)]
#[repr(C, packed)]
pub(super) struct ValueEntryHeader {
    /// Length of the key in bytes
    pub key_length: u32,
    /// Length of the value in bytes
    pub value_length: u32,
}
 43 | 
 44 | impl<'a> ValueBatchBuilder<'a> {
 45 |     pub fn new(identifier: ValueBatchId, vlog: &'a ValueLog) -> Self {
 46 |         Self {
 47 |             identifier,
 48 |             vlog,
 49 |             value_data: vec![],
 50 |             offsets: vec![],
 51 |         }
 52 |     }
 53 | 
 54 |     /// Add another value to this batch
 55 |     pub async fn add_entry(&mut self, key: &[u8], val: &[u8]) -> ValueId {
 56 |         // Add padding (if needed)
 57 |         let offset = crate::pad_offset(self.value_data.len());
 58 |         assert!(offset >= self.value_data.len());
 59 |         self.value_data.resize(offset, 0u8);
 60 | 
 61 |         self.offsets.extend_from_slice((offset as u32).as_bytes());
 62 | 
 63 |         let entry_header = ValueEntryHeader {
 64 |             key_length: key.len().try_into().expect("Key is too long"),
 65 |             value_length: val.len().try_into().expect("Value is too long"),
 66 |         };
 67 | 
 68 |         self.value_data.extend_from_slice(entry_header.as_bytes());
 69 |         self.value_data.extend_from_slice(key);
 70 |         self.value_data.extend_from_slice(val);
 71 | 
 72 |         (self.identifier, offset as u32)
 73 |     }
 74 | 
 75 |     /// Create the batch and write it to disk
 76 |     pub async fn finish(mut self) -> Result<ValueBatchId, Error> {
 77 |         let num_values = (self.offsets.len() / size_of::<u32>()) as u32;
 78 | 
 79 |         let header = ValueBatchHeader { num_values };
 80 | 
 81 |         crate::add_padding(&mut self.offsets);
 82 | 
 83 |         let mut data = header.as_bytes().to_vec();
 84 |         let file_path = self.vlog.get_batch_file_path(&self.identifier);
 85 | 
 86 |         data.extend_from_slice(&self.value_data);
 87 |         disk::write(&file_path, &data)
 88 |             .await
 89 |             .map_err(|err| Error::from_io_error("Failed to write value log batch", err))?;
 90 | 
 91 |         let batch = Arc::new(ValueBatch { data });
 92 | 
 93 |         // Store in the cache so we don't have to load immediately
 94 |         {
 95 |             let shard_id = ValueLog::batch_to_shard_id(self.identifier);
 96 |             let mut shard = self.vlog.batch_caches[shard_id].lock().await;
 97 |             shard.put(self.identifier, batch);
 98 |         }
 99 | 
100 |         self.vlog
101 |             .index
102 |             .add_batch(self.identifier, num_values as usize)
103 |             .await?;
104 | 
105 |         log::trace!("Created value batch #{}", self.identifier);
106 |         Ok(self.identifier)
107 |     }
108 | }
109 | 
110 | impl ValueBatch {
111 |     pub fn from_existing(data: Vec<u8>) -> Self {
112 |         Self { data }
113 |     }
114 | 
115 |     pub fn get_ref(self_ptr: Arc<ValueBatch>, pos: ValueOffset) -> ValueRef {
116 |         let mut offset = pos as usize;
117 |         let data = &self_ptr.get_value_data()[offset..];
118 | 
119 |         let (vheader, _) = ValueEntryHeader::ref_from_prefix(data).unwrap();
120 | 
121 |         offset += size_of::<ValueEntryHeader>();
122 |         offset += vheader.key_length as usize;
123 | 
124 |         ValueRef {
125 |             length: vheader.value_length as usize,
126 |             batch: self_ptr,
127 |             offset,
128 |         }
129 |     }
130 | 
131 |     pub fn get_entries(&self, offsets: &[ValueOffset]) -> Vec<(Vec<u8>, Vec<u8>)> {
132 |         offsets
133 |             .iter()
134 |             .map(|offset| {
135 |                 let mut offset = *offset as usize;
136 |                 let data = &self.get_value_data()[offset..];
137 | 
138 |                 let (vheader, _) = ValueEntryHeader::ref_from_prefix(data).unwrap();
139 | 
140 |                 offset += size_of::<ValueEntryHeader>();
141 | 
142 |                 let key = data[offset..(vheader.key_length as usize)].to_vec();
143 |                 offset += vheader.key_length as usize;
144 | 
145 |                 let value = data[offset..(vheader.value_length as usize)].to_vec();
146 | 
147 |                 (key, value)
148 |             })
149 |             .collect()
150 |     }
151 | 
152 |     /// Access the raw data of this batch
153 |     #[inline]
154 |     pub(super) fn get_value_data(&self) -> &[u8] {
155 |         &self.data[BATCH_HEADER_LEN..]
156 |     }
157 | 
158 |     #[inline]
159 |     fn get_header(&self) -> &ValueBatchHeader {
160 |         ValueBatchHeader::ref_from_prefix(&self.data[..]).unwrap().0
161 |     }
162 | 
163 |     /// The number of all values in this batch, even deleted ones
164 |     #[allow(dead_code)]
165 |     pub fn total_num_values(&self) -> u32 {
166 |         self.get_header().num_values
167 |     }
168 | }
169 | 


--------------------------------------------------------------------------------
/src/values/mod.rs:
--------------------------------------------------------------------------------
  1 | use std::num::NonZeroUsize;
  2 | use std::sync::Arc;
  3 | 
  4 | use tokio::sync::Mutex;
  5 | 
  6 | use crate::Error;
  7 | 
  8 | use lru::LruCache;
  9 | 
 10 | use crate::Params;
 11 | use crate::disk;
 12 | use crate::manifest::Manifest;
 13 | 
/// Position of an entry within the value data of a batch
pub type ValueOffset = u32;
/// Identifier of a value batch
pub type ValueBatchId = u64;

/// Lower bound applied to the minimum value batch identifier read from the manifest
pub const MIN_VALUE_BATCH_ID: ValueBatchId = 1;

/// Identifies a single value: the batch it is stored in and its offset within that batch
pub type ValueId = (ValueBatchId, ValueOffset);

/// Number of shards the batch cache is split into
const NUM_SHARDS: NonZeroUsize = NonZeroUsize::new(16).unwrap();

// NOTE(review): not referenced in the visible part of this module;
// presumably a ratio of active entries -- confirm against its users
pub const GARBAGE_COLLECT_THRESHOLD: f64 = 0.2;

/// A single shard of the batch cache
type BatchShard = LruCache<ValueBatchId, Arc<ValueBatch>>;
 26 | 
 27 | #[cfg(test)]
 28 | mod tests;
 29 | 
 30 | mod index;
 31 | pub use index::{MIN_VALUE_INDEX_PAGE_ID, ValueIndex, ValueIndexPageId};
 32 | 
 33 | mod batch;
 34 | use batch::ValueBatch;
 35 | pub use batch::ValueBatchBuilder;
 36 | 
 37 | use crate::EntryList;
 38 | use crate::wal::{LogEntry, WriteAheadLog};
 39 | 
pub struct ValueLog {
    /// The value log uses the write-ahead log
    /// to batch updates to its index
    wal: Arc<WriteAheadLog>,

    /// The value_index keeps track of all used entries within
    /// the value log and helps to garbage collect and
    /// defragment
    index: ValueIndex,

    /// Sharded storage of log batches
    batch_caches: Vec<Mutex<BatchShard>>,

    params: Arc<Params>,
    manifest: Arc<Manifest>,
}
 56 | 
/// A reference to a single value stored inside a value batch
pub struct ValueRef {
    /// The batch holding the value
    batch: Arc<ValueBatch>,
    /// Start of the value, relative to the batch's value data
    offset: usize,
    /// Length of the value in bytes
    length: usize,
}
 62 | 
 63 | impl ValueRef {
 64 |     pub fn get_value(&self) -> &[u8] {
 65 |         &self.batch.get_value_data()[self.offset..self.offset + self.length]
 66 |     }
 67 | }
 68 | 
 69 | impl ValueLog {
 70 |     fn init_caches(params: &Params) -> Vec<Mutex<BatchShard>> {
 71 |         let max_value_files = NonZeroUsize::new(params.max_open_files / 2)
 72 |             .expect("Max open files needs to be greater than 2");
 73 | 
 74 |         let shard_size = NonZeroUsize::new(max_value_files.get() / NUM_SHARDS)
 75 |             .expect("Not enough open files to support the number of shards");
 76 | 
 77 |         (0..NUM_SHARDS.get())
 78 |             .map(|_| Mutex::new(BatchShard::new(shard_size)))
 79 |             .collect()
 80 |     }
 81 | 
 82 |     pub async fn new(
 83 |         wal: Arc<WriteAheadLog>,
 84 |         params: Arc<Params>,
 85 |         manifest: Arc<Manifest>,
 86 |     ) -> Result<Self, Error> {
 87 |         let batch_caches = Self::init_caches(&params);
 88 |         let index = ValueIndex::new(params.clone(), manifest.clone()).await?;
 89 | 
 90 |         Ok(Self {
 91 |             wal,
 92 |             index,
 93 |             params,
 94 |             manifest,
 95 |             batch_caches,
 96 |         })
 97 |     }
 98 | 
 99 |     pub async fn open(
100 |         wal: Arc<WriteAheadLog>,
101 |         params: Arc<Params>,
102 |         manifest: Arc<Manifest>,
103 |         index: ValueIndex,
104 |         to_delete: Vec<ValueBatchId>,
105 |     ) -> Result<Self, Error> {
106 |         let batch_caches = Self::init_caches(&params);
107 |         let obj = Self {
108 |             wal,
109 |             index,
110 |             params,
111 |             manifest,
112 |             batch_caches,
113 |         };
114 | 
115 |         for batch_id in to_delete.into_iter() {
116 |             obj.remove_batch_from_disk(batch_id).await?;
117 |         }
118 | 
119 |         Ok(obj)
120 |     }
121 | 
122 |     /// Marks a value as unused and, potentially, removes old value batches
123 |     /// On success, this might return a list of entries to reinsert in order to defragment the log
124 |     #[tracing::instrument(skip(self))]
125 |     pub async fn mark_value_deleted(&self, vid: ValueId) -> Result<EntryList, Error> {
126 |         let (page_id, page_offset) = self.index.mark_value_as_deleted(vid).await?;
127 |         self.wal
128 |             .store(&[LogEntry::DeleteValue(page_id, page_offset)])
129 |             .await?;
130 | 
131 |         let res = if self.try_to_remove(page_id).await? {
132 |             vec![]
133 |         } else {
134 |             self.try_to_compact(page_id).await?.unwrap_or_else(Vec::new)
135 |         };
136 | 
137 |         Ok(res)
138 |     }
139 | 
140 |     /// Attempts to delete empty batches
141 |     #[tracing::instrument(skip(self))]
142 |     async fn try_to_remove(&self, batch_id: ValueBatchId) -> Result<bool, Error> {
143 |         log::trace!("Checking if value batch #{batch_id} can be removed");
144 | 
145 |         let num_active = self.index.count_active_entries(batch_id).await;
146 | 
147 |         // Can only remove if no values in this batch are active
148 |         if num_active > 0 {
149 |             return Ok(false);
150 |         }
151 | 
152 |         log::trace!("Deleting empty batch #{batch_id}");
153 |         self.index.mark_batch_as_deleted(batch_id).await?;
154 | 
155 |         // Hold lock so nobody else messes with the file while we do this
156 |         self.remove_batch_from_disk(batch_id).await?;
157 |         Ok(true)
158 |     }
159 | 
160 |     async fn remove_batch_from_disk(&self, batch_id: ValueBatchId) -> Result<(), Error> {
161 |         let shard_id = Self::batch_to_shard_id(batch_id);
162 |         let mut cache = self.batch_caches[shard_id].lock().await;
163 |         let fpath = self.get_batch_file_path(&batch_id);
164 |         disk::remove_file(&fpath)
165 |             .await
166 |             .map_err(|err| Error::from_io_error("Failed to remove value log batch", err))?;
167 |         cache.pop(&batch_id);
168 | 
169 |         // Can we remove entries entirely?
170 |         let min_batch = self
171 |             .manifest
172 |             .get_minimum_value_batch()
173 |             .await
174 |             .max(MIN_VALUE_BATCH_ID);
175 | 
176 |         // We can only completely remove batches starting from the oldest one
177 |         // Instead, we "empty" the batch, reducing its size to a single on-disk page
178 |         if batch_id > min_batch {
179 |             return Ok(());
180 |         }
181 | 
182 |         let most_recent = self.manifest.get_most_recent_value_batch_id().await;
183 |         let mut new_minimum = batch_id;
184 | 
185 |         while new_minimum < most_recent {
186 |             if self.index.count_active_entries(batch_id).await > 0 {
187 |                 break;
188 |             }
189 |             new_minimum += 1;
190 |         }
191 | 
192 |         log::debug!("Full removed {} value batches", new_minimum - batch_id + 1);
193 |         self.manifest.set_minimum_value_batch_id(new_minimum).await;
194 | 
195 |         Ok(())
196 |     }
197 | 
198 |     /// Check if we should reinsert entries from this batch
199 |     #[tracing::instrument(skip(self))]
200 |     async fn try_to_compact(&self, batch_id: ValueBatchId) -> Result<Option<EntryList>, Error> {
201 |         log::trace!("Checking if value batch #{batch_id} should be compacted (reinserted)");
202 | 
203 |         let batch = self.get_batch(batch_id).await?;
204 |         let num_entries = batch.total_num_values() as usize;
205 |         let num_active = self.index.count_active_entries(batch_id).await;
206 |         let active_ratio = (num_active * 100) / (num_entries * 100);
207 | 
208 |         if active_ratio < 25 {
209 |             log::trace!("Re-inserting sparse value batch #{batch_id}");
210 |             let offsets = self.index.get_active_entries(batch_id).await;
211 |             self.index.mark_batch_as_compacted(batch_id).await?;
212 | 
213 |             Ok(Some(batch.get_entries(&offsets)))
214 |         } else {
215 |             Ok(None)
216 |         }
217 |     }
218 | 
219 |     #[inline]
220 |     fn batch_to_shard_id(batch_id: ValueBatchId) -> usize {
221 |         (batch_id as usize) % NUM_SHARDS
222 |     }
223 | 
224 |     #[inline]
225 |     fn get_batch_file_path(&self, batch_id: &ValueBatchId) -> std::path::PathBuf {
226 |         self.params.db_path.join(format!("val{batch_id:08}.data"))
227 |     }
228 | 
229 |     pub async fn make_batch(&self) -> ValueBatchBuilder<'_> {
230 |         let identifier = self.manifest.generate_next_value_batch_id().await;
231 |         ValueBatchBuilder::new(identifier, self)
232 |     }
233 | 
234 |     #[tracing::instrument(skip(self))]
235 |     async fn get_batch(&self, identifier: ValueBatchId) -> Result<Arc<ValueBatch>, Error> {
236 |         let shard_id = Self::batch_to_shard_id(identifier);
237 |         let mut cache = self.batch_caches[shard_id].lock().await;
238 | 
239 |         if let Some(batch) = cache.get(&identifier) {
240 |             Ok(batch.clone())
241 |         } else {
242 |             log::trace!("Loading value batch #{identifier} from disk");
243 | 
244 |             let data = disk::read(&self.get_batch_file_path(&identifier), 0)
245 |                 .await
246 |                 .map_err(|err| Error::from_io_error("Failed to read value log batch", err))?;
247 | 
248 |             let obj = Arc::new(ValueBatch::from_existing(data));
249 |             cache.put(identifier, obj.clone());
250 | 
251 |             Ok(obj)
252 |         }
253 |     }
254 | 
255 |     /// Return the reference to a value
256 |     pub async fn get_ref(&self, value_ref: ValueId) -> Result<ValueRef, Error> {
257 |         log::trace!("Getting value at {value_ref:?}");
258 | 
259 |         let (id, offset) = value_ref;
260 |         let batch = self.get_batch(id).await?;
261 | 
262 |         Ok(ValueBatch::get_ref(batch, offset))
263 |     }
264 | 
265 |     pub async fn sync(&self) -> Result<(), Error> {
266 |         self.index.sync().await
267 |     }
268 | }
269 | 


--------------------------------------------------------------------------------
/src/values/tests.rs:
--------------------------------------------------------------------------------
 1 | #[cfg(feature = "tokio-uring")]
 2 | use kioto_uring_executor::test as async_test;
 3 | 
 4 | #[cfg(feature = "monoio")]
 5 | use monoio::test as async_test;
 6 | 
 7 | #[cfg(not(feature = "_async-io"))]
 8 | use tokio::test as async_test;
 9 | 
10 | use super::*;
11 | 
12 | use tempfile::{Builder, TempDir};
13 | 
14 | async fn test_init() -> (TempDir, ValueLog) {
15 |     let tmp_dir = Builder::new()
16 |         .prefix("lsm-value-log-test-")
17 |         .tempdir()
18 |         .unwrap();
19 |     let _ = env_logger::builder().is_test(true).try_init();
20 | 
21 |     let params = Params {
22 |         db_path: tmp_dir.path().to_path_buf(),
23 |         ..Default::default()
24 |     };
25 | 
26 |     let params = Arc::new(params);
27 |     let wal = Arc::new(WriteAheadLog::new(params.clone()).await.unwrap());
28 |     let manifest = Arc::new(Manifest::new(params.clone()).await);
29 | 
30 |     (tmp_dir, ValueLog::new(wal, params, manifest).await.unwrap())
31 | }
32 | 
33 | #[async_test]
34 | async fn delete_batch() {
35 |     const SIZE: usize = 1_000;
36 | 
37 |     let (_tmpdir, values) = test_init().await;
38 |     let mut builder = values.make_batch().await;
39 | 
40 |     let key = "hello".as_bytes().to_vec();
41 |     let value = vec![b'a'; SIZE];
42 | 
43 |     let vid = builder.add_entry(&key, &value).await;
44 | 
45 |     let batch_id = builder.finish().await.unwrap();
46 |     let batch = values.get_batch(batch_id).await.unwrap();
47 | 
48 |     assert_eq!(batch.total_num_values(), 1);
49 | 
50 |     values.mark_value_deleted(vid).await.unwrap();
51 | 
52 |     let result = values.get_batch(batch_id).await;
53 |     assert!(result.is_err());
54 | }
55 | 
56 | #[async_test]
57 | async fn get_put_many() {
58 |     let (_tmpdir, values) = test_init().await;
59 | 
60 |     let mut builder = values.make_batch().await;
61 |     let mut vids = vec![];
62 | 
63 |     for pos in 0..1000u32 {
64 |         let key = format!("key_{pos}").as_bytes().to_vec();
65 |         let value = format!("Number {pos}").into_bytes();
66 |         let vid = builder.add_entry(&key, &value).await;
67 |         vids.push(vid);
68 |     }
69 | 
70 |     builder.finish().await.unwrap();
71 | 
72 |     for (pos, vid) in vids.iter().enumerate() {
73 |         let value = format!("Number {pos}").into_bytes();
74 | 
75 |         let result = values.get_ref(*vid).await.unwrap();
76 |         assert_eq!(result.get_value(), value);
77 |     }
78 | }
79 | 
80 | #[async_test]
81 | async fn get_put_large_value() {
82 |     let (_tmpdir, values) = test_init().await;
83 | 
84 |     const SIZE: usize = 1_000_000;
85 |     let mut builder = values.make_batch().await;
86 | 
87 |     let key = "hello".as_bytes().to_vec();
88 |     let data = vec![b'a'; SIZE];
89 | 
90 |     let vid = builder.add_entry(&key, &data).await;
91 | 
92 |     builder.finish().await.unwrap();
93 | 
94 |     assert!(values.get_ref(vid).await.unwrap().get_value() == data);
95 | }
96 | 


--------------------------------------------------------------------------------
/src/wal/reader.rs:
--------------------------------------------------------------------------------
  1 | use std::sync::Arc;
  2 | 
  3 | use zerocopy::FromBytes;
  4 | 
  5 | #[cfg(feature = "wisckey")]
  6 | use crate::values::{ValueBatchId, ValueIndex};
  7 | 
  8 | use crate::memtable::Memtable;
  9 | use crate::{Error, Params, disk};
 10 | 
 11 | use super::{LogEntryType, PAGE_SIZE, WalWriter, WriteOp};
 12 | 
 13 | /// WAL reader used during recovery
 14 | pub struct WalReader {
 15 |     params: Arc<Params>,
 16 |     position: usize,
 17 |     current_page: Vec<u8>,
 18 | }
 19 | 
 20 | #[derive(Default)]
 21 | pub struct RecoveryResult {
 22 |     pub new_position: usize,
 23 |     pub entries_recovered: usize,
 24 |     #[cfg(feature = "wisckey")]
 25 |     pub value_batches_to_delete: Vec<ValueBatchId>,
 26 | }
 27 | 
 28 | impl WalReader {
 29 |     pub async fn new(params: Arc<Params>, start_position: usize) -> Result<Self, Error> {
 30 |         let position = start_position;
 31 |         let fpos = position / PAGE_SIZE;
 32 | 
 33 |         let fpath = WalWriter::get_file_path(&params, fpos);
 34 |         log::trace!("Opening next log file at {fpath:?}");
 35 | 
 36 |         let current_page = disk::read_uncompressed(&fpath, 0)
 37 |             .await
 38 |             .map_err(|err| Error::from_io_error("Failed to open WAL file", err))?;
 39 | 
 40 |         Ok(Self {
 41 |             params,
 42 |             current_page,
 43 |             position,
 44 |         })
 45 |     }
 46 | 
 47 |     #[cfg(feature = "wisckey")]
 48 |     pub async fn run(
 49 |         &mut self,
 50 |         memtable: &mut Memtable,
 51 |         value_index: &mut ValueIndex,
 52 |     ) -> Result<RecoveryResult, Error> {
 53 |         let mut result = RecoveryResult::default();
 54 | 
 55 |         // Re-insert ops into memtable
 56 |         loop {
 57 |             let mut log_type = [0u8; 1];
 58 |             let success = self.read_from_log(&mut log_type[..], true).await?;
 59 | 
 60 |             if !success {
 61 |                 break;
 62 |             }
 63 | 
 64 |             if log_type[0] == LogEntryType::Write as u8 {
 65 |                 self.parse_write_entry(memtable).await?
 66 |             } else if log_type[0] == LogEntryType::DeleteValue as u8 {
 67 |                 self.parse_value_deletion_entry(value_index).await?
 68 |             } else if log_type[0] == LogEntryType::DeleteBatch as u8 {
 69 |                 self.parse_batch_deletion_entry(value_index).await?
 70 |             } else {
 71 |                 panic!("Unexpected log entry type! {}", log_type[0]);
 72 |             }
 73 | 
 74 |             result.entries_recovered += 1;
 75 |         }
 76 | 
 77 |         log::debug!(
 78 |             "Found {} entries in write-ahead log",
 79 |             result.entries_recovered
 80 |         );
 81 |         result.new_position = self.position;
 82 |         Ok(result)
 83 |     }
 84 | 
 85 |     #[cfg(not(feature = "wisckey"))]
 86 |     pub async fn run(&mut self, memtable: &mut Memtable) -> Result<RecoveryResult, Error> {
 87 |         let mut result = RecoveryResult::default();
 88 | 
 89 |         // Re-insert ops into memtable
 90 |         loop {
 91 |             let mut log_type = [0u8; 1];
 92 |             let success = self.read_from_log(&mut log_type[..], true).await?;
 93 | 
 94 |             if !success {
 95 |                 break;
 96 |             }
 97 | 
 98 |             if log_type[0] == LogEntryType::Write as u8 {
 99 |                 self.parse_write_entry(memtable).await?
100 |             } else {
101 |                 panic!("Unexpected log entry type!");
102 |             }
103 | 
104 |             result.entries_recovered += 1;
105 |         }
106 | 
107 |         log::debug!(
108 |             "Found {} entries in write-ahead log",
109 |             result.entries_recovered
110 |         );
111 |         result.new_position = self.position;
112 |         Ok(result)
113 |     }
114 | 
115 |     async fn parse_write_entry(&mut self, memtable: &mut Memtable) -> Result<(), Error> {
116 |         let op_type: u8 = self.read_value().await?;
117 |         let key_len: u64 = self.read_value().await?;
118 | 
119 |         let mut key = vec![0; key_len as usize];
120 |         self.read_from_log(&mut key, false).await?;
121 | 
122 |         if op_type == WriteOp::PUT_OP {
123 |             let val_len: u64 = self.read_value().await?;
124 |             let mut value = vec![0; val_len as usize];
125 |             self.read_from_log(&mut value, false).await?;
126 |             memtable.put(key, value);
127 |         } else if op_type == WriteOp::DELETE_OP {
128 |             memtable.delete(key);
129 |         } else {
130 |             panic!("Unexpected op type!");
131 |         }
132 | 
133 |         Ok(())
134 |     }
135 | 
136 |     /// Fetches a value from the current position at the log
137 |     /// and advances the position
138 |     ///
139 |     /// This might open the next page of the log, if needed
140 |     async fn read_value<T: Sized + FromBytes>(&mut self) -> Result<T, Error> {
141 |         let mut data = vec![0u8; std::mem::size_of::<T>()];
142 |         self.read_from_log(&mut data, false).await?;
143 |         Ok(T::read_from_bytes(&data).unwrap())
144 |     }
145 | 
146 |     #[cfg(feature = "wisckey")]
147 |     async fn parse_value_deletion_entry(
148 |         &mut self,
149 |         value_index: &mut ValueIndex,
150 |     ) -> Result<(), Error> {
151 |         let page_id = self.read_value().await?;
152 |         let offset = self.read_value().await?;
153 |         value_index.mark_value_as_deleted_at(page_id, offset).await;
154 | 
155 |         Ok(())
156 |     }
157 | 
158 |     #[cfg(feature = "wisckey")]
159 |     async fn parse_batch_deletion_entry(
160 |         &mut self,
161 |         value_index: &mut ValueIndex,
162 |     ) -> Result<(), Error> {
163 |         let page_id = self.read_value().await?;
164 |         let offset = self.read_value().await?;
165 |         value_index
166 |             .mark_batch_as_deleted_at(page_id, offset)
167 |             .await?;
168 |         Ok(())
169 |     }
170 | 
171 |     /// Read the next entry from the log
172 |     /// (only used during recovery)
173 |     ///
174 |     /// TODO: Change this to just fetch an entire page at a time
175 |     async fn read_from_log(&mut self, out: &mut [u8], maybe: bool) -> Result<bool, Error> {
176 |         let buffer_len = out.len();
177 |         let mut buffer_pos = 0;
178 |         assert!(buffer_len > 0);
179 | 
180 |         while buffer_pos < buffer_len {
181 |             let offset = self.position % PAGE_SIZE;
182 |             let file_remaining = self
183 |                 .current_page
184 |                 .len()
185 |                 .checked_sub(offset)
186 |                 .expect("Invalid offset. Page too small?");
187 |             let buffer_remaining = buffer_len - buffer_pos;
188 | 
189 |             let len = buffer_remaining.min(file_remaining);
190 | 
191 |             if len > 0 {
192 |                 out[buffer_pos..buffer_pos + len]
193 |                     .copy_from_slice(&self.current_page[offset..offset + len]);
194 |                 buffer_pos += len;
195 |                 self.position += len;
196 |             } else if self.position % PAGE_SIZE != 0 {
197 |                 log::trace!(
198 |                     "WAL reader is done. Current file was not full; assuming it is the most recent."
199 |                 );
200 |                 assert!(self.current_page.len() < PAGE_SIZE);
201 |                 return Ok(false);
202 |             }
203 | 
204 |             // Move to next file?
205 |             if self.position % PAGE_SIZE == 0 {
206 |                 let fpos = self.position / PAGE_SIZE;
207 |                 let fpath = WalWriter::get_file_path(&self.params, fpos);
208 |                 log::trace!("Opening next log file at {fpath:?}");
209 | 
210 |                 self.current_page = match disk::read_uncompressed(&fpath, 0).await {
211 |                     Ok(data) => data,
212 |                     Err(err) => {
213 |                         if maybe && err.kind() == std::io::ErrorKind::NotFound {
214 |                             // At last file but it is still exactly
215 |                             // one page
216 |                             log::trace!("WAL reader is done. No next log file found");
217 |                             return Ok(false);
218 |                         } else {
219 |                             return Err(Error::from_io_error("Failed to open WAL file", err));
220 |                         }
221 |                     }
222 |                 }
223 |             }
224 |         }
225 | 
226 |         Ok(true)
227 |     }
228 | }
229 | 


--------------------------------------------------------------------------------
/src/wal/tests.rs:
--------------------------------------------------------------------------------
  1 | /// Tests for the write-ahead log, especially its behavior during recovery
  2 | use tempfile::TempDir;
  3 | 
  4 | use super::*;
  5 | 
  6 | #[cfg(feature = "wisckey")]
  7 | use crate::{manifest::Manifest, values::ValueIndex};
  8 | 
  9 | #[cfg(feature = "tokio-uring")]
 10 | use kioto_uring_executor::test as async_test;
 11 | 
 12 | #[cfg(feature = "monoio")]
 13 | use monoio::test as async_test;
 14 | 
 15 | #[cfg(not(feature = "_async-io"))]
 16 | use tokio::test as async_test;
 17 | 
 18 | async fn test_init() -> (TempDir, Arc<Params>, WriteAheadLog) {
 19 |     let _ = env_logger::builder().is_test(true).try_init();
 20 | 
 21 |     let tempdir = tempfile::Builder::new()
 22 |         .prefix("lsm-wal-test-")
 23 |         .tempdir()
 24 |         .expect("Failed to create temporary directory");
 25 | 
 26 |     log::debug!("Created tempdir at {:?}", tempdir.path());
 27 | 
 28 |     let params = Arc::new(Params {
 29 |         db_path: tempdir.path().to_path_buf(),
 30 |         ..Default::default()
 31 |     });
 32 | 
 33 |     let wal = WriteAheadLog::new(params.clone()).await.unwrap();
 34 |     (tempdir, params, wal)
 35 | }
 36 | 
 37 | async fn test_cleanup(tempdir: TempDir, wal: WriteAheadLog) {
 38 |     // Finish all writes before we stop the tests
 39 |     wal.stop().await.expect("WAL sync failed");
 40 | 
 41 |     // Ensure that the tempdir is dropped last
 42 |     drop(wal);
 43 | 
 44 |     log::trace!("Removing tempdir at {:?}", tempdir.path());
 45 |     drop(tempdir);
 46 | }
 47 | 
 48 | #[cfg(feature = "wisckey")]
 49 | async fn reopen_wal(params: Arc<Params>, offset: u64) -> (Memtable, WriteAheadLog) {
 50 |     let mut memtable = Memtable::new(0);
 51 | 
 52 |     let manifest = Arc::new(Manifest::new(params.clone()).await);
 53 |     let mut freelist = ValueIndex::new(params.clone(), manifest).await.unwrap();
 54 |     let (wal, _) = WriteAheadLog::open(params, offset, &mut memtable, &mut freelist)
 55 |         .await
 56 |         .unwrap();
 57 | 
 58 |     (memtable, wal)
 59 | }
 60 | 
 61 | #[cfg(not(feature = "wisckey"))]
 62 | async fn reopen_wal(params: Arc<Params>, offset: u64) -> (Memtable, WriteAheadLog) {
 63 |     let mut memtable = Memtable::new(0);
 64 | 
 65 |     let (wal, _) = WriteAheadLog::open(params, offset, &mut memtable)
 66 |         .await
 67 |         .unwrap();
 68 | 
 69 |     (memtable, wal)
 70 | }
 71 | 
 72 | #[async_test]
 73 | async fn empty_sync() {
 74 |     let (tempdir, _, wal) = test_init().await;
 75 | 
 76 |     assert_eq!(wal.inner.status.read().sync_pos, 0);
 77 |     assert_eq!(wal.inner.status.read().write_pos, 0);
 78 | 
 79 |     test_cleanup(tempdir, wal).await;
 80 | }
 81 | 
 82 | #[async_test]
 83 | async fn write_and_sync() {
 84 |     let (tempdir, _, wal) = test_init().await;
 85 | 
 86 |     let key = vec![1, 2];
 87 |     let value = vec![2, 3];
 88 |     let op = WriteOp::Put(key.clone(), value.clone());
 89 | 
 90 |     wal.store(&[LogEntry::Write(&op)]).await.unwrap();
 91 |     wal.sync().await.unwrap();
 92 | 
 93 |     assert_eq!(wal.inner.status.read().sync_pos, 22);
 94 |     assert_eq!(wal.inner.status.read().write_pos, 22);
 95 | 
 96 |     test_cleanup(tempdir, wal).await;
 97 | }
 98 | 
 99 | #[async_test]
100 | async fn write_large_value() {
101 |     let (tempdir, _, wal) = test_init().await;
102 | 
103 |     let key = vec![1, 2];
104 |     let value = vec![1; 2 * PAGE_SIZE];
105 |     let op = WriteOp::Put(key.clone(), value.clone());
106 | 
107 |     wal.store(&[LogEntry::Write(&op)]).await.unwrap();
108 |     wal.sync().await.unwrap();
109 | 
110 |     assert_eq!(wal.inner.status.read().sync_pos, 8212);
111 |     assert_eq!(wal.inner.status.read().write_pos, 8212);
112 | 
113 |     test_cleanup(tempdir, wal).await;
114 | }
115 | 
116 | #[async_test]
117 | async fn reopen() {
118 |     let (tempdir, params, wal) = test_init().await;
119 | 
120 |     let key = vec![1, 2];
121 |     let value = vec![2, 3];
122 |     let op = WriteOp::Put(key.clone(), value.clone());
123 | 
124 |     wal.store(&[LogEntry::Write(&op)]).await.unwrap();
125 |     wal.sync().await.unwrap();
126 |     drop(wal);
127 | 
128 |     let (memtable, wal) = reopen_wal(params, 0).await;
129 |     assert_eq!(wal.inner.status.read().sync_pos, 22);
130 |     assert_eq!(wal.inner.status.read().write_pos, 22);
131 | 
132 |     let entry = memtable.get(&key).unwrap();
133 |     assert_eq!(entry.get_value(), Some(value).as_deref());
134 | 
135 |     test_cleanup(tempdir, wal).await;
136 | }
137 | 
138 | #[async_test]
139 | async fn reopen_with_offset1() {
140 |     let (tempdir, params, wal) = test_init().await;
141 | 
142 |     let key1 = vec![1, 2];
143 |     let key2 = vec![1, 2, 3];
144 |     let value = vec![2, 3];
145 | 
146 |     let op1 = WriteOp::Put(key1.clone(), value.clone());
147 |     let op2 = WriteOp::Put(key2.clone(), value.clone());
148 | 
149 |     wal.store(&[LogEntry::Write(&op1)]).await.unwrap();
150 |     wal.store(&[LogEntry::Write(&op2)]).await.unwrap();
151 |     wal.sync().await.unwrap();
152 | 
153 |     drop(wal);
154 | 
155 |     let (memtable, wal) = reopen_wal(params, 22).await;
156 | 
157 |     assert_eq!(wal.inner.status.read().sync_pos, 45);
158 |     assert_eq!(wal.inner.status.read().write_pos, 45);
159 | 
160 |     assert!(memtable.get(&key1).is_none());
161 |     let entry = memtable.get(&key2).unwrap();
162 |     assert_eq!(entry.get_value(), Some(value).as_deref());
163 | 
164 |     test_cleanup(tempdir, wal).await;
165 | }
166 | 
167 | #[async_test]
168 | async fn reopen_with_offset_and_cleanup1() {
169 |     let (tempdir, params, wal) = test_init().await;
170 | 
171 |     let key1 = vec![1, 2];
172 |     let key2 = vec![1, 2, 3];
173 |     let value = vec![2, 3];
174 | 
175 |     let op1 = WriteOp::Put(key1.clone(), value.clone());
176 |     let op2 = WriteOp::Put(key2.clone(), value.clone());
177 | 
178 |     wal.store(&[LogEntry::Write(&op1)]).await.unwrap();
179 |     wal.store(&[LogEntry::Write(&op2)]).await.unwrap();
180 |     wal.sync().await.unwrap();
181 | 
182 |     let offset = 22;
183 |     wal.set_offset(offset).await;
184 |     drop(wal);
185 | 
186 |     let (memtable, wal) = reopen_wal(params, offset).await;
187 | 
188 |     assert_eq!(wal.inner.status.read().sync_pos, 45);
189 |     assert_eq!(wal.inner.status.read().write_pos, 45);
190 | 
191 |     assert!(memtable.get(&key1).is_none());
192 |     let entry = memtable.get(&key2).unwrap();
193 |     assert_eq!(entry.get_value(), Some(value).as_deref());
194 | 
195 |     test_cleanup(tempdir, wal).await;
196 | }
197 | 
198 | #[async_test]
199 | async fn reopen_with_offset_and_cleanup2() {
200 |     let (tempdir, params, wal) = test_init().await;
201 | 
202 |     let key1 = vec![1, 2];
203 |     let key2 = vec![1, 2, 3];
204 |     let value1 = vec![2; 2 * PAGE_SIZE];
205 |     let value2 = vec![2, 3];
206 | 
207 |     let op1 = WriteOp::Put(key1.clone(), value1.clone());
208 |     let op2 = WriteOp::Put(key2.clone(), value2.clone());
209 | 
210 |     wal.store(&[LogEntry::Write(&op1)]).await.unwrap();
211 |     wal.store(&[LogEntry::Write(&op2)]).await.unwrap();
212 |     wal.sync().await.unwrap();
213 | 
214 |     let offset = 8212;
215 |     wal.set_offset(offset).await;
216 | 
217 |     drop(wal);
218 | 
219 |     let (memtable, wal) = reopen_wal(params, offset).await;
220 | 
221 |     assert_eq!(wal.inner.status.read().sync_pos, 8235);
222 |     assert_eq!(wal.inner.status.read().write_pos, 8235);
223 | 
224 |     assert!(memtable.get(&key1).is_none());
225 |     let entry = memtable.get(&key2).unwrap();
226 |     assert_eq!(entry.get_value(), Some(value2).as_deref());
227 | 
228 |     test_cleanup(tempdir, wal).await;
229 | }
230 | 
231 | #[async_test]
232 | async fn reopen_with_offset2() {
233 |     let (tempdir, params, wal) = test_init().await;
234 | 
235 |     let key1 = vec![1, 2];
236 |     let key2 = vec![1, 2, 3];
237 |     let value1 = vec![2; 2 * PAGE_SIZE];
238 |     let value2 = vec![2, 3];
239 | 
240 |     let op1 = WriteOp::Put(key1.clone(), value1.clone());
241 |     let op2 = WriteOp::Put(key2.clone(), value2.clone());
242 | 
243 |     wal.store(&[LogEntry::Write(&op1)]).await.unwrap();
244 |     wal.store(&[LogEntry::Write(&op2)]).await.unwrap();
245 |     wal.sync().await.unwrap();
246 | 
247 |     drop(wal);
248 | 
249 |     let (memtable, wal) = reopen_wal(params, 8212).await;
250 | 
251 |     assert_eq!(wal.inner.status.read().sync_pos, 8235);
252 |     assert_eq!(wal.inner.status.read().write_pos, 8235);
253 | 
254 |     assert!(memtable.get(&key1).is_none());
255 |     let entry = memtable.get(&key2).unwrap();
256 |     assert_eq!(entry.get_value(), Some(value2).as_deref());
257 | 
258 |     test_cleanup(tempdir, wal).await;
259 | }
260 | 
261 | #[async_test]
262 | async fn reopen_large_file() {
263 |     let (tempdir, params, wal) = test_init().await;
264 | 
265 |     let key = vec![1, 2];
266 |     let value = vec![2; 2 * PAGE_SIZE];
267 |     let op = WriteOp::Put(key.clone(), value.clone());
268 | 
269 |     wal.store(&[LogEntry::Write(&op)]).await.unwrap();
270 |     wal.sync().await.unwrap();
271 | 
272 |     drop(wal);
273 | 
274 |     let (memtable, wal) = reopen_wal(params, 0).await;
275 | 
276 |     assert_eq!(wal.inner.status.read().sync_pos, 8212);
277 |     assert_eq!(wal.inner.status.read().write_pos, 8212);
278 | 
279 |     let entry = memtable.get(&key).unwrap();
280 |     assert_eq!(entry.get_value(), Some(value).as_deref());
281 | 
282 |     test_cleanup(tempdir, wal).await;
283 | }
284 | 


--------------------------------------------------------------------------------
/src/wal/writer.rs:
--------------------------------------------------------------------------------
  1 | use std::path::{Path, PathBuf};
  2 | use std::sync::Arc;
  3 | 
  4 | #[cfg(not(feature = "_async-io"))]
  5 | use std::io::Write;
  6 | 
  7 | #[cfg(feature = "tokio-uring")]
  8 | use tokio_uring::fs::{File, OpenOptions};
  9 | 
 10 | #[cfg(feature = "tokio-uring")]
 11 | use tokio_uring::buf::BoundedBuf;
 12 | 
 13 | #[cfg(feature = "monoio")]
 14 | use monoio::fs::{File, OpenOptions};
 15 | 
 16 | #[cfg(not(feature = "_async-io"))]
 17 | use std::fs::{File, OpenOptions};
 18 | 
 19 | #[cfg(feature = "monoio")]
 20 | use monoio::buf::IoBuf;
 21 | 
 22 | use cfg_if::cfg_if;
 23 | 
 24 | use crate::wal::{LogInner, PAGE_SIZE};
 25 | use crate::{Error, Params, disk};
 26 | 
 27 | /// The task that actually writes the log to disk
 28 | pub struct WalWriter {
 29 |     log_file: File,
 30 |     position: usize,
 31 |     params: Arc<Params>,
 32 | }
 33 | 
 34 | impl WalWriter {
 35 |     pub async fn new(params: Arc<Params>) -> Self {
 36 |         let log_file = Self::create_file(&params, 0).await.unwrap_or_else(|err| {
 37 |             panic!(
 38 |                 "Failed to create WAL file in directory {:?}: {err}",
 39 |                 params.db_path
 40 |             )
 41 |         });
 42 | 
 43 |         Self {
 44 |             log_file,
 45 |             params,
 46 |             position: 0,
 47 |         }
 48 |     }
 49 | 
 50 |     /// Start the writer at a specific position after opening a log
 51 |     pub async fn continue_from(position: usize, params: Arc<Params>) -> Self {
 52 |         let fpos = position / PAGE_SIZE;
 53 | 
 54 |         let log_file = if position % PAGE_SIZE == 0 {
 55 |             // At the beginning of a new file
 56 |             Self::create_file(&params, fpos)
 57 |                 .await
 58 |                 .unwrap_or_else(|err| {
 59 |                     panic!(
 60 |                         "Failed to create WAL file in directory {:?}: {err}",
 61 |                         params.db_path
 62 |                     )
 63 |                 })
 64 |         } else {
 65 |             Self::open_file(&params, fpos).await.unwrap_or_else(|err| {
 66 |                 panic!(
 67 |                     "Failed to open WAL file in directory {:?}: {err}",
 68 |                     params.db_path
 69 |                 )
 70 |             })
 71 |         };
 72 | 
 73 |         Self {
 74 |             log_file,
 75 |             params,
 76 |             position,
 77 |         }
 78 |     }
 79 | 
 80 |     pub fn get_file_path(params: &Params, fpos: usize) -> PathBuf {
 81 |         params
 82 |             .db_path
 83 |             .join(Path::new(&format!("log{:08}.data", fpos + 1)))
 84 |     }
 85 | 
 86 |     /// Open an existing log file (used during recovery/restart)
 87 |     pub async fn open_file(params: &Params, fpos: usize) -> Result<File, std::io::Error> {
 88 |         let fpath = Self::get_file_path(params, fpos);
 89 |         log::trace!("Opening file at {fpath:?}");
 90 | 
 91 |         cfg_if! {
 92 |             if #[cfg(feature="_async-io")] {
 93 |                 let log_file = OpenOptions::new()
 94 |                     .read(true).write(true).create(false).truncate(false)
 95 |                     .open(fpath).await?;
 96 |             } else {
 97 |                  let log_file = OpenOptions::new()
 98 |                     .read(true).write(true).create(false).truncate(false)
 99 |                     .open(fpath)?;
100 |             }
101 |         }
102 | 
103 |         Ok(log_file)
104 |     }
105 | 
106 |     /// Returns true if the writer is done and the associated task should terminate
107 |     pub async fn update_log(&mut self, inner: &LogInner) -> Result<bool, Error> {
108 |         let (to_write, sync_flag, sync_pos, new_offset, stop_flag) = loop {
109 |             // This works around the following bug:
110 |             // https://github.com/rust-lang/rust/issues/63768
111 |             let fut = inner.queue_cond.notified();
112 |             tokio::pin!(fut);
113 | 
114 |             {
115 |                 let mut lock = inner.status.write();
116 |                 let to_write = std::mem::take(&mut lock.queue);
117 |                 let sync_flag = lock.sync_flag;
118 |                 let sync_pos = lock.sync_pos;
119 |                 let stop_flag = lock.stop_flag;
120 | 
121 |                 let new_offset = if lock.offset_pos > lock.flush_pos {
122 |                     Some((lock.offset_pos, lock.flush_pos))
123 |                 } else {
124 |                     assert_eq!(lock.offset_pos, lock.flush_pos);
125 |                     None
126 |                 };
127 | 
128 |                 // Check whether there is something to do
129 |                 if !to_write.is_empty() || new_offset.is_some() || sync_flag || stop_flag {
130 |                     assert_eq!(self.position, lock.write_pos);
131 | 
132 |                     lock.sync_flag = false;
133 |                     break (to_write, sync_flag, sync_pos, new_offset, stop_flag);
134 |                 }
135 | 
136 |                 // wait for change to queue and retry
137 |                 assert_eq!(lock.write_pos, lock.queue_pos);
138 |                 fut.as_mut().enable();
139 |             }
140 | 
141 |             fut.await;
142 |         };
143 | 
144 |         // Don't hold lock while write
145 |         for buf in to_write.into_iter() {
146 |             self.write_all(buf)
147 |                 .await
148 |                 .map_err(|err| Error::from_io_error("Failed to writ write-ahead log", err))?;
149 |         }
150 | 
151 |         // Only sync if necessary
152 |         // We do not need to hold the lock while syncing
153 |         // because there is only one write-ahead writer
154 |         if sync_flag && sync_pos < self.position {
155 |             self.sync().await;
156 |             inner.status.write().sync_pos = self.position;
157 |         }
158 | 
159 |         if let Some((new_offset, old_offset)) = new_offset {
160 |             self.set_offset(new_offset, old_offset).await?;
161 |         }
162 | 
163 |         // Notify about finished write(s)
164 |         {
165 |             let mut lock = inner.status.write();
166 |             assert!(lock.write_pos <= self.position);
167 |             lock.write_pos = self.position;
168 | 
169 |             if let Some((new_offset, _)) = new_offset {
170 |                 lock.flush_pos = new_offset;
171 |             }
172 | 
173 |             inner.write_cond.notify_waiters();
174 |         }
175 | 
176 |         if stop_flag {
177 |             log::debug!("WAL writer finished");
178 |         }
179 | 
180 |         Ok(stop_flag)
181 |     }
182 | 
183 |     /// Remove one log file for every page index in
184 |     /// `old_offset / PAGE_SIZE .. new_offset / PAGE_SIZE`; index `i` maps to `log{i+1:08}.data`.
185 |     /// The first removal that fails is returned as an error.
186 |     async fn set_offset(&mut self, new_offset: usize, old_offset: usize) -> Result<(), Error> {
187 |         let first_page = old_offset / PAGE_SIZE;
188 |         let end_page = new_offset / PAGE_SIZE;
189 |         for page in first_page..end_page {
190 |             let file_name = format!("log{:08}.data", page + 1);
191 |             let fpath = self.params.db_path.join(Path::new(&file_name));
192 |             log::trace!("Removing file {fpath:?}");
193 |
194 |             if let Err(err) = disk::remove_file(&fpath).await {
195 |                 let msg = format!("Failed to remove log file {fpath:?}");
196 |                 return Err(Error::from_io_error(msg, err));
197 |             }
198 |         }
199 |         Ok(())
200 |     }
201 | 
202 |     /// Sync the data of the current log file to disk.
203 |     /// Panics with "Data sync failed" if the sync call returns an error.
204 |     async fn sync(&mut self) {
205 |         cfg_if! {
206 |             if #[cfg(feature = "_async-io")] {
207 |                 self.log_file.sync_data().await.expect("Data sync failed");
208 |             } else {
209 |                 self.log_file.sync_data().expect("Data sync failed");
210 |             }
211 |         }
212 |     }
213 | 
214 |     /// Append `data` to the log, starting a new log file at every page boundary.
215 |     ///
216 |     /// The buffer is written in chunks that never cross a `PAGE_SIZE` boundary, and
217 |     /// `self.position` advances by the length of every chunk that was written.
218 |     ///
219 |     /// # Errors
220 |     /// Returns the I/O error if a write fails or if the next log file cannot be created.
221 |     /// (Write failures used to panic even though this function returns a `Result`.)
222 |     #[allow(unused_mut)] // `data` is only reassigned by the tokio-uring/monoio branch
223 |     async fn write_all(&mut self, mut data: Vec<u8>) -> Result<(), std::io::Error> {
224 |         let mut buf_pos = 0;
225 |         while buf_pos < data.len() {
226 |             // Offset inside the current log file; the modulo keeps it below `PAGE_SIZE`
227 |             let file_offset = self.position % PAGE_SIZE;
228 |
229 |             // Figure out how much we can fit into the current file
230 |             let page_remaining = PAGE_SIZE - file_offset;
231 |             let buffer_remaining = data.len() - buf_pos;
232 |             let write_len = buffer_remaining.min(page_remaining);
233 |             assert!(write_len > 0);
234 |
235 |             // tokio-uring and monoio use identical code, so they share one branch
236 |             cfg_if! {
237 |                 if #[cfg(any(feature="tokio-uring", feature="monoio"))] {
238 |                     let to_write = data.slice(buf_pos..buf_pos + write_len);
239 |                     let (res, buf) = self.log_file.write_all_at(to_write, file_offset as u64).await;
240 |                     res?;
241 |
242 |                     data = buf.into_inner();
243 |                 } else {
244 |                     let to_write = &data[buf_pos..buf_pos + write_len];
245 |                     self.log_file.write_all(to_write)?;
246 |                 }
247 |             }
248 |
249 |             buf_pos += write_len;
250 |             self.position += write_len;
251 |
252 |             let end_offset = file_offset + write_len;
253 |             assert!(end_offset <= PAGE_SIZE);
254 |
255 |             // Create a new file?
256 |             if end_offset == PAGE_SIZE {
257 |                 let file_pos = self.position / PAGE_SIZE;
258 |                 self.log_file = Self::create_file(&self.params, file_pos).await?;
259 |             }
260 |         }
261 |
262 |         Ok(())
263 |     }
264 | 
265 |     /// Create the log file for position `file_pos`, at the path given by `Self::get_file_path`.
266 |     /// The result of `File::create` is returned unchanged.
267 |     pub async fn create_file(params: &Params, file_pos: usize) -> Result<File, std::io::Error> {
268 |         let log_path = Self::get_file_path(params, file_pos);
269 |         log::trace!("Creating new log file at {log_path:?}");
270 |         cfg_if! {
271 |             if #[cfg(not(feature = "_async-io"))] {
272 |                 File::create(log_path)
273 |             } else {
274 |                 File::create(log_path).await
275 |             }
276 |         }
277 |     }
278 | }
279 | 


--------------------------------------------------------------------------------
/src/write_batch.rs:
--------------------------------------------------------------------------------
 1 | use crate::{Key, Value};
 2 | /// A single update recorded in a `WriteBatch`
 3 | #[derive(Debug)]
 4 | pub enum WriteOp {
 5 |     Put(Key, Value), // Store the value under the key
 6 |     Delete(Key), // Delete the entry stored under the key
 7 | }
 8 | 
 9 | /// A `WriteBatch` bundles multiple updates together for higher throughput
10 | ///
11 | /// Note: The batch will not be applied to the database until it is passed to `Database::write`
12 | #[derive(Debug)]
13 | pub struct WriteBatch {
14 |     pub(crate) writes: Vec<WriteOp>, // Recorded operations, in the order they were added
15 | }
16 | 
17 | impl WriteOp {
18 |     pub(crate) const PUT_OP: u8 = 1;
19 |     pub(crate) const DELETE_OP: u8 = 2;
20 |
21 |     /// The key this operation applies to
22 |     pub fn get_key(&self) -> &[u8] {
23 |         let (Self::Put(key, _) | Self::Delete(key)) = self;
24 |         key
25 |     }
26 |
27 |     /// Numeric tag of this operation: `PUT_OP` or `DELETE_OP`
28 |     pub fn get_type(&self) -> u8 {
29 |         match self {
30 |             Self::Delete(..) => Self::DELETE_OP,
31 |             Self::Put(..) => Self::PUT_OP,
32 |         }
33 |     }
34 |
35 |     /// Length of the key in bytes
36 |     pub(crate) fn get_key_length(&self) -> u64 {
37 |         self.get_key().len() as u64
38 |     }
39 |
40 |     /// Length of the value in bytes; zero for a delete
41 |     #[allow(dead_code)]
42 |     pub(crate) fn get_value_length(&self) -> u64 {
43 |         match self {
44 |             Self::Delete(_) => 0u64,
45 |             Self::Put(_, value) => value.len() as u64,
46 |         }
47 |     }
48 | }
49 | 
50 | impl WriteBatch {
51 |     /// Create a batch without any recorded operations
52 |     pub fn new() -> Self {
53 |         Self { writes: vec![] }
54 |     }
55 |
56 |     /// Record a put operation; it is not applied until the batch is written
57 |     pub fn put(&mut self, key: Key, value: Value) {
58 |         self.writes.push(WriteOp::Put(key, value));
59 |     }
60 |     /// Record a delete operation; it is not applied until the batch is written
61 |     pub fn delete(&mut self, key: Key) {
62 |         self.writes.push(WriteOp::Delete(key));
63 |     }
64 | }
65 | 
66 | impl Default for WriteBatch {
67 |     fn default() -> Self {
68 |         Self::new() // The default batch is an empty one
69 |     }
70 | }
71 | 
72 | /// Allows specifying details of a write
73 | #[derive(Debug, Clone)]
74 | pub struct WriteOptions {
75 |     /// Should the call block until the write is guaranteed to be on disk? (`new` sets `true`)
76 |     pub sync: bool,
77 | }
78 | 
79 | impl WriteOptions {
80 |     pub const fn new() -> Self {
81 |         Self { sync: true } // Writes are synchronous unless the caller opts out
82 |     }
83 | }
84 | 
85 | impl Default for WriteOptions {
86 |     fn default() -> Self {
87 |         Self::new() // Same options as `WriteOptions::new()`
88 |     }
89 | }
90 | 


--------------------------------------------------------------------------------
/sync/Cargo.toml:
--------------------------------------------------------------------------------
 1 | [package]
 2 | name = "lsm-sync"
 3 | version = "0.5.0-dev"
 4 | authors = ["Kai Mast <kai@kaimast.com>"]
 5 | edition = "2024"
 6 | repository = "https://github.com/kaimast/lsm-rs"
 7 | description = "Synchronous API for the lsm crate"
 8 | license = "MIT"
 9 | readme = "../README.md"
10 | keywords = ["storage", "database", "async"]
11 | 
12 | [dependencies]
13 | lsm = { path="..", version="0.5.0-dev" }
14 | cfg-if = "1"
15 | 
16 | [dependencies.tokio]
17 | version="1"
18 | default-features=false
19 | features=["rt-multi-thread", "io-util", "sync", "macros", "tracing"]
20 | 
21 | [dev-dependencies]
22 | env_logger = "0.11"
23 | tempfile = "3"
24 | 
25 | [features]
26 | default = ["snappy-compression", "bloom-filters"]
27 | snappy-compression = ["lsm/snappy-compression"]
28 | wisckey = ["lsm/wisckey"]
29 | bloom-filters = ["lsm/bloom-filters"]
30 | 
31 | [[test]]
32 | name = "basic"
33 | path = "tests/basic.rs"
34 | 
35 | [[test]]
36 | name = "reopen"
37 | path = "tests/reopen.rs"
38 | 


--------------------------------------------------------------------------------
/sync/justfile:
--------------------------------------------------------------------------------
 1 | LOG_LEVEL := "debug"
 2 | 
 3 | tests: default-tests wisckey-tests
 4 | 
 5 | default-tests:
 6 |     env RUST_BACKTRACE=1 RUST_LOG={{LOG_LEVEL}} cargo test
 7 | 
 8 | lint:
 9 |     cargo clippy --no-default-features -- -D warnings
10 | 
11 | check-formatting:
12 |     cargo fmt --check
13 | 
14 | fix-formatting:
15 |     cargo fmt
16 | 
17 | udeps:
18 |     cargo udeps --all-targets --release
19 | 
20 | wisckey-tests:
21 |     env RUST_BACKTRACE=1 RUST_LOG={{LOG_LEVEL}} cargo test --no-default-features --features=snappy-compression,wisckey
22 | 


--------------------------------------------------------------------------------
/sync/src/database.rs:
--------------------------------------------------------------------------------
  1 | use std::sync::Arc;
  2 | 
  3 | use tokio::runtime::Runtime as TokioRuntime;
  4 | 
  5 | use lsm::logic::DbLogic;
  6 | use lsm::tasks::{TaskManager, TaskType};
  7 | use lsm::{EntryRef, Error, Key, Params, StartMode, Value, WriteBatch, WriteOptions};
  8 | 
  9 | use crate::iterate::DbIterator;
 10 | /// Blocking database handle: every call runs the async `lsm` logic via `block_on`
 11 | pub struct Database {
 12 |     inner: Arc<DbLogic>, // The database logic that all operations are forwarded to
 13 |     tasks: Arc<TaskManager>, // Background tasks; woken up for compaction, stopped by `stop`
 14 |     tokio_rt: Arc<TokioRuntime>, // Runtime that drives the async calls
 15 | }
 16 | 
 17 | impl Database {
 18 |     /// Open a database in the given `mode` using `Params::default()`
 19 |     pub fn new(mode: StartMode) -> Result<Self, Error> {
 20 |         Self::new_with_params(mode, Params::default())
 21 |     }
 22 | 
 23 |     /// Open a database in the given `mode` with explicit `params`.
 24 |     ///
 25 |     /// Starts a tokio runtime, creates the database logic on it and then the task manager.
 26 |     /// Panics with "Failed to start tokio" if the runtime cannot be created.
 27 |     ///
 28 |     /// # Errors
 29 |     /// Returns the error reported by `DbLogic::new`.
 30 |     pub fn new_with_params(mode: StartMode, params: Params) -> Result<Self, Error> {
 31 |         let tokio_rt = Arc::new(TokioRuntime::new().expect("Failed to start tokio"));
 32 |         let (inner, tasks) = tokio_rt.block_on(async {
 33 |             // Read this first: `params` is moved into `DbLogic::new` below
 34 |             let concurrency = params.compaction_concurrency;
 35 |             let inner = Arc::new(DbLogic::new(mode, params).await?);
 36 |             let tasks = Arc::new(TaskManager::new(inner.clone(), concurrency).await);
 37 |             Ok::<_, Error>((inner, tasks))
 38 |         })?;
 39 |
 40 |         Ok(Self {
 41 |             inner,
 42 |             tasks,
 43 |             tokio_rt,
 44 |         })
 45 |     }
 46 | 
 47 |     /// Look up the entry stored for `key`.
 48 |     ///
 49 |     /// Returns `Ok(None)` if the lookup finds no entry. When the lookup reports that
 50 |     /// compaction is needed, the level compaction task is woken up before returning.
 51 |     ///
 52 |     /// # Errors
 53 |     /// Returns the error of the underlying lookup.
 54 |     #[inline]
 55 |     pub fn get(&self, key: &[u8]) -> Result<Option<EntryRef>, Error> {
 56 |         self.tokio_rt.block_on(async {
 57 |             // The lookup itself reports whether a compaction should be scheduled
 58 |             let (needs_compaction, data) = self.inner.get(key).await?;
 59 |
 60 |             if needs_compaction {
 61 |                 self.tasks.wake_up(&TaskType::LevelCompaction);
 62 |             }
 63 |
 64 |             Ok(data)
 65 |         })
 66 |     }
 67 | 
 68 |     /// Make sure all data has been written to disk.
 69 |     /// Only has an effect if there were previous writes with `sync = false`.
 70 |     pub fn synchronize(&self) -> Result<(), Error> {
 71 |         let db = &*self.inner;
 72 |         let syncing = async move { db.synchronize().await };
 73 |
 74 |         self.tokio_rt.block_on(syncing)
 75 |     }
 76 | 
 77 |     /// Store `value` under `key`, using the options from `WriteOptions::new()`
 78 |     #[inline]
 79 |     pub fn put(&self, key: Key, value: Value) -> Result<(), Error> {
 80 |         let default_opts = WriteOptions::new();
 81 |         self.put_opts(key, value, &default_opts)
 82 |     }
 83 | 
 84 |     /// Store `value` under `key`, honoring `opts`; the put is sent as a one-element batch
 85 |     #[inline]
 86 |     pub fn put_opts(&self, key: Key, value: Value, opts: &WriteOptions) -> Result<(), Error> {
 87 |         let mut single_put = WriteBatch::new();
 88 |         single_put.put(key, value);
 89 |         self.write_opts(single_put, opts)
 90 |     }
 91 | 
 92 |     /// Delete an existing entry, using the options from `WriteOptions::new()`.
 93 |     ///
 94 |     /// For efficiency, the datastore does not check whether the key actually existed.
 95 |     /// Instead, it will just mark the most recent version (which could be the first one)
 96 |     /// as deleted.
 97 |     pub fn delete(&self, key: Key) -> Result<(), Error> {
 98 |         let default_opts = WriteOptions::new();
 99 |
100 |         // Same path as a delete with explicit options
101 |         self.delete_opts(key, &default_opts)
102 |     }
103 | 
104 |     /// Delete an entry, honoring `opts`.
105 |     /// The deletion is sent as a one-element batch through `write_opts`.
106 |     pub fn delete_opts(&self, key: Key, opts: &WriteOptions) -> Result<(), Error> {
107 |         let mut single_delete = WriteBatch::new();
108 |         single_delete.delete(key);
109 |         self.write_opts(single_delete, opts)
110 |     }
111 | 
112 |     /// Iterate over all entries in the database
113 |     pub fn iter(&self) -> DbIterator {
114 |         self.tokio_rt.block_on(async {
115 |             let (mem_iters, table_iters, lower, upper) =
116 |                 self.inner.prepare_iter(None, None).await;
117 |
118 |             self.build_iterator(mem_iters, table_iters, lower, upper, false)
119 |         })
120 |     }
121 |
122 |     /// Like iter(), but reverse
123 |     pub fn reverse_iter(&self) -> DbIterator {
124 |         self.tokio_rt.block_on(async {
125 |             let (mem_iters, table_iters, lower, upper) =
126 |                 self.inner.prepare_reverse_iter(None, None).await;
127 |
128 |             self.build_iterator(mem_iters, table_iters, lower, upper, true)
129 |         })
130 |     }
131 |
132 |     /// Like iter(), but will only include entries with keys in [min;max)
133 |     pub fn range_iter(&self, min: &[u8], max: &[u8]) -> DbIterator {
134 |         self.tokio_rt.block_on(async {
135 |             let (mem_iters, table_iters, lower, upper) =
136 |                 self.inner.prepare_iter(Some(min), Some(max)).await;
137 |
138 |             self.build_iterator(mem_iters, table_iters, lower, upper, false)
139 |         })
140 |     }
141 |
142 |     /// Like range_iter(), but in reverse.
143 |     /// It will only include entries with keys in (min_key;max_key]
144 |     pub fn reverse_range_iter(&self, max_key: &[u8], min_key: &[u8]) -> DbIterator {
145 |         self.tokio_rt.block_on(async {
146 |             // Distinct names so the returned bounds do not shadow the parameters
147 |             let (mem_iters, table_iters, lower, upper) = self
148 |                 .inner
149 |                 .prepare_reverse_iter(Some(max_key), Some(min_key))
150 |                 .await;
151 |
152 |             self.build_iterator(mem_iters, table_iters, lower, upper, true)
153 |         })
154 |     }
155 |
156 |     /// Build a `DbIterator` from the parts returned by `prepare_iter`/`prepare_reverse_iter`.
157 |     ///
158 |     /// Shared by all four iterator constructors; `reverse` selects the direction.
159 |     /// The iterator gets its own handle to the runtime.
160 |     fn build_iterator(
161 |         &self,
162 |         mem_iters: Vec<lsm::memtable::MemtableIterator>,
163 |         table_iters: Vec<lsm::sorted_table::TableIterator>,
164 |         min_key: Option<Vec<u8>>,
165 |         max_key: Option<Vec<u8>>,
166 |         reverse: bool,
167 |     ) -> DbIterator {
168 |         DbIterator::new(
169 |             mem_iters,
170 |             table_iters,
171 |             min_key,
172 |             max_key,
173 |             reverse,
174 |             #[cfg(feature = "wisckey")]
175 |             self.inner.get_value_log(),
176 |             self.tokio_rt.clone(),
177 |         )
178 |     }
198 | 
199 |     /// Apply all updates recorded in `write_batch`, using `WriteOptions::default()`.
200 |     /// If you only want to write to a single key, use `Database::put` instead.
201 |     pub fn write(&self, write_batch: WriteBatch) -> Result<(), Error> {
202 |         let default_opts = WriteOptions::default();
203 |         self.write_opts(write_batch, &default_opts)
204 |     }
205 | 
206 |     /// Apply all updates recorded in `write_batch`, honoring `opts`.
207 |     ///
208 |     /// Wakes up the memtable compaction task if the write reports that it is needed.
209 |     pub fn write_opts(&self, write_batch: WriteBatch, opts: &WriteOptions) -> Result<(), Error> {
210 |         let db = &*self.inner;
211 |         self.tokio_rt.block_on(async move {
212 |             if db.write_opts(write_batch, opts).await? {
213 |                 self.tasks.wake_up(&TaskType::MemtableCompaction);
214 |             }
215 |             Ok(())
216 |         })
217 |     }
218 | 
219 |     /// Stop all background tasks gracefully
220 |     pub fn stop(&self) -> Result<(), Error> {
221 |         let tasks = Arc::clone(&self.tasks);
222 |         let stopping = async move { tasks.stop_all().await };
223 |
224 |         self.tokio_rt.block_on(stopping)
225 |     }
226 | }
227 | 
228 | impl Drop for Database {
229 |     /// This might abort some tasks if stop() has not been called;
230 |     /// crash consistency should prevent this from being a problem
231 |     fn drop(&mut self) {
232 |         self.tasks.terminate();
233 |     }
234 | }
235 | 


--------------------------------------------------------------------------------
/sync/src/iterate.rs:
--------------------------------------------------------------------------------
  1 | #[cfg(feature = "wisckey")]
  2 | use lsm::values::ValueLog;
  3 | 
  4 | use lsm::EntryRef;
  5 | use lsm::memtable::MemtableIterator;
  6 | use lsm::sorted_table::{InternalIterator, TableIterator};
  7 | 
  8 | use std::cmp::Ordering;
  9 | use std::sync::Arc;
 10 | 
 11 | use cfg_if::cfg_if;
 12 | 
 13 | /// Allows iterating over a consistent snapshot of the database
 14 | pub struct DbIterator {
 15 |     last_key: Option<Vec<u8>>, // Key returned by the previous call to `next`, if any
 16 |     iterators: Vec<Box<dyn InternalIterator>>, // Memtable iterators first, then table iterators
 17 |     // Optional bounds on the keys that are yielded
 18 |     min_key: Option<Vec<u8>>,
 19 |     max_key: Option<Vec<u8>>,
 20 |     // Runtime used by `next` to drive the async iterator steps
 21 |     tokio_rt: Arc<tokio::runtime::Runtime>,
 22 |     // Iterate from larger to smaller keys?
 23 |     reverse: bool,
 24 |     // Passed to `get_entry` when the `wisckey` feature is enabled
 25 |     #[cfg(feature = "wisckey")]
 26 |     value_log: Arc<ValueLog>,
 27 | }
 28 | 
 29 | type NextKV = Option<(lsm::manifest::SeqNumber, usize)>; // (sequence number, index into `iterators`) of the best candidate so far
 30 | 
 31 | impl DbIterator {
 32 |     /// Create an iterator over the given memtable and table iterators.
 33 |     ///
 34 |     /// Memtable iterators are stored before table iterators.
 35 |     pub(crate) fn new(
 36 |         mem_iters: Vec<MemtableIterator>,
 37 |         table_iters: Vec<TableIterator>,
 38 |         min_key: Option<Vec<u8>>,
 39 |         max_key: Option<Vec<u8>>,
 40 |         reverse: bool,
 41 |         #[cfg(feature = "wisckey")] value_log: Arc<ValueLog>,
 42 |         tokio_rt: Arc<tokio::runtime::Runtime>,
 43 |     ) -> Self {
 44 |         let boxed_mem = mem_iters
 45 |             .into_iter()
 46 |             .map(|iter| Box::new(iter) as Box<dyn InternalIterator>);
 47 |         let boxed_tables = table_iters
 48 |             .into_iter()
 49 |             .map(|iter| Box::new(iter) as Box<dyn InternalIterator>);
 50 |
 51 |         Self {
 52 |             last_key: None,
 53 |             iterators: boxed_mem.chain(boxed_tables).collect(),
 54 |             tokio_rt,
 55 |             min_key,
 56 |             max_key,
 57 |             reverse,
 58 |             #[cfg(feature = "wisckey")]
 59 |             value_log,
 60 |         }
 61 |     }
 62 |     /// Move `iter` to its next eligible key and report whether it replaces the candidate `next_kv`
 63 |     async fn parse_iter(
 64 |         &self,
 65 |         pos: usize, // index of `iter`; stored in the returned candidate
 66 |         last_key: &Option<Vec<u8>>,
 67 |         next_iter: Option<&dyn InternalIterator>, // iterator of the current candidate; unwrapped whenever `next_kv` is `Some`
 68 |         iter: &mut dyn InternalIterator,
 69 |         next_kv: NextKV,
 70 |     ) -> (bool, NextKV) { // (did the candidate change?, candidate to keep)
 71 |         if self.reverse {
 72 |             // This iterator might be "behind" other iterators
 73 |             if let Some(last_key) = last_key {
 74 |                 while !iter.at_end() && iter.get_key() >= last_key.as_slice() {
 75 |                     iter.step().await;
 76 |                 }
 77 |             }
 78 |
 79 |             // Don't pick a key that is greater than the maximum
 80 |             if let Some(max_key) = &self.max_key {
 81 |                 while !iter.at_end() && iter.get_key() > max_key.as_slice() {
 82 |                     iter.step().await;
 83 |                 }
 84 |
 85 |                 // There might be no key in this iterator that is <=max_key
 86 |                 if iter.at_end() || iter.get_key() > max_key.as_slice() {
 87 |                     return (false, next_kv);
 88 |                 }
 89 |             }
 90 |
 91 |             if iter.at_end() {
 92 |                 return (false, next_kv);
 93 |             }
 94 |
 95 |             let key = iter.get_key();
 96 |
 97 |             // Don't pick a key that is less or equal to the minimum
 98 |             if let Some(min_key) = &self.min_key
 99 |                 && iter.get_key() <= min_key.as_slice()
100 |             {
101 |                 return (false, next_kv);
102 |             }
103 |
104 |             let seq_number = iter.get_seq_number();
105 |             // A greater key wins; on equal keys the higher sequence number wins
106 |             if let Some((max_seq_number, _)) = next_kv {
107 |                 let max_key = next_iter.unwrap().get_key();
108 |
109 |                 match key.cmp(max_key) {
110 |                     Ordering::Greater => (true, Some((seq_number, pos))),
111 |                     Ordering::Equal => {
112 |                         if seq_number > max_seq_number {
113 |                             (true, Some((seq_number, pos)))
114 |                         } else {
115 |                             (false, next_kv)
116 |                         }
117 |                     }
118 |                     Ordering::Less => (false, next_kv),
119 |                 }
120 |             } else {
121 |                 (true, Some((seq_number, pos))) // no candidate yet, so take this one
122 |             }
123 |         } else {
124 |             // This iterator might be "behind" other iterators
125 |             if let Some(last_key) = last_key {
126 |                 while !iter.at_end() && iter.get_key() <= last_key.as_slice() {
127 |                     iter.step().await;
128 |                 }
129 |             }
130 |
131 |             // Don't pick a key that is smaller than the minimum
132 |             if let Some(min_key) = &self.min_key {
133 |                 while !iter.at_end() && iter.get_key() < min_key.as_slice() {
134 |                     iter.step().await;
135 |                 }
136 |
137 |                 // There might be no key in this iterator that is >=min_key
138 |                 if iter.at_end() || iter.get_key() < min_key.as_slice() {
139 |                     return (false, next_kv);
140 |                 }
141 |             }
142 |
143 |             if iter.at_end() {
144 |                 return (false, next_kv);
145 |             }
146 |
147 |             let key = iter.get_key();
148 |
149 |             // Don't pick a key that is greater or equal to the maximum
150 |             if let Some(max_key) = &self.max_key
151 |                 && iter.get_key() >= max_key.as_slice()
152 |             {
153 |                 return (false, next_kv);
154 |             }
155 |
156 |             let seq_number = iter.get_seq_number();
157 |             // A smaller key wins; on equal keys the higher sequence number wins
158 |             if let Some((min_seq_number, _)) = next_kv {
159 |                 let min_key = next_iter.unwrap().get_key();
160 |
161 |                 match key.cmp(min_key) {
162 |                     Ordering::Less => (true, Some((seq_number, pos))),
163 |                     Ordering::Equal => {
164 |                         if seq_number > min_seq_number {
165 |                             (true, Some((seq_number, pos)))
166 |                         } else {
167 |                             (false, next_kv)
168 |                         }
169 |                     }
170 |                     Ordering::Greater => (false, next_kv),
171 |                 }
172 |             } else {
173 |                 (true, Some((seq_number, pos))) // no candidate yet, so take this one
174 |             }
175 |         }
176 |     }
177 | }
178 | 
179 | impl Iterator for DbIterator {
180 |     type Item = (Vec<u8>, EntryRef);
181 |     /// Return the next key and entry, or `None` once no iterator has an eligible key left
182 |     fn next(&mut self) -> Option<Self::Item> {
183 |         let mut iterators = std::mem::take(&mut self.iterators);
184 |         let mut last_key = self.last_key.clone();
185 |         let mut result = None;
186 |         // `result` stays `None` while the selected key yields no entry; then the next key is tried
187 |         while result.is_none() {
188 |             let (out_result, out_last_key, out_iterators) = self.tokio_rt.block_on(async {
189 |                 let mut next_kv = None;
190 |                 let num_iterators = iterators.len();
191 |                 // Let every iterator compete; `next_kv` tracks the best candidate so far
192 |                 for pos in 0..num_iterators {
193 |                     // Split slices to make the borrow checker happy
194 |                     let (prev, cur) = iterators[..].split_at_mut(pos);
195 |
196 |                     let next_iter = if let Some((_, pos)) = next_kv {
197 |                         // see https://github.com/rust-lang/rust-clippy/issues/9309
198 |                         #[allow(clippy::borrowed_box)]
199 |                         let iter: &Box<dyn InternalIterator> = &prev[pos];
200 |                         Some(&**iter)
201 |                     } else {
202 |                         None
203 |                     };
204 |
205 |                     let current_iter = &mut *cur[0];
206 |                     let (change, kv) = self
207 |                         .parse_iter(pos, &last_key, next_iter, current_iter, next_kv)
208 |                         .await;
209 |
210 |                     if change {
211 |                         next_kv = kv;
212 |                     }
213 |                 }
214 |                 // Fetch key and entry from the winning iterator, if there is one
215 |                 let result = if let Some((_, pos)) = next_kv.take() {
216 |                     #[allow(clippy::explicit_auto_deref)]
217 |                     let iter: &dyn InternalIterator = &*iterators[pos];
218 |
219 |                     let res_key = iter.get_key().to_vec();
220 |                     last_key = Some(res_key.clone());
221 |
222 |                     cfg_if! {
223 |                         if #[ cfg(feature="wisckey") ] {
224 |                             iter.get_entry(&self.value_log).await
225 |                                 .map(|entry| Some((res_key, entry)))
226 |                         } else {
227 |                             iter.get_entry().map(|entry|Some((res_key, entry)))
228 |                         }
229 |                     }
230 |                 } else {
231 |                     Some(None) // no candidate left: ends the loop and makes `next` return `None`
232 |                 };
233 |
234 |                 (result, last_key, iterators)
235 |             });
236 |
237 |             result = out_result;
238 |             last_key = out_last_key;
239 |             iterators = out_iterators;
240 |         }
241 |         // Put the state that was moved into the async block back into `self`
242 |         self.last_key = last_key;
243 |         self.iterators = iterators;
244 |
245 |         result.unwrap()
246 |     }
247 | }
248 | 


--------------------------------------------------------------------------------
/sync/src/lib.rs:
--------------------------------------------------------------------------------
1 | pub use lsm::{Params, StartMode, WriteBatch, WriteOptions};
2 | 
3 | pub mod iterate;
4 | 
5 | mod database;
6 | pub use database::Database;
7 | 


--------------------------------------------------------------------------------
/sync/tests/basic.rs:
--------------------------------------------------------------------------------
  1 | use lsm_sync::{Database, Params, StartMode, WriteBatch, WriteOptions};
  2 | use tempfile::{Builder, TempDir};
  3 | 
  4 | const SM: StartMode = StartMode::CreateOrOverride;
  5 | 
  6 | /// Create a database inside a fresh temporary directory.
  7 | /// The `TempDir` is returned together with the database.
  8 | fn test_init() -> (TempDir, Database) {
  9 |     let tmp_dir = Builder::new().prefix("lsm-sync-test-").tempdir().unwrap();
 10 |     let _ = env_logger::builder().is_test(true).try_init();
 11 |
 12 |     let db_path = tmp_dir.path().join("storage.lsm");
 13 |     let params = Params {
 14 |         db_path,
 15 |         ..Default::default()
 16 |     };
 17 |     let database =
 18 |         Database::new_with_params(SM, params).expect("Failed to create database instance");
 19 |
 20 |     (tmp_dir, database)
 21 | }
 22 | 
 23 | /// A value can be read back after a put, and a second put replaces it.
 24 | #[test]
 25 | fn get_put() {
 26 |     let (_tmpdir, database) = test_init();
 27 |
 28 |     let (key1, key2) = (b"Foo".to_vec(), b"Foz".to_vec());
 29 |     let (value1, value2) = (b"Bar".to_vec(), b"Baz".to_vec());
 30 |
 31 |     // Nothing has been written yet
 32 |     assert!(database.get(&key1).unwrap().is_none());
 33 |     assert!(database.get(&key2).unwrap().is_none());
 34 |
 35 |     database.put(key1.clone(), value1.clone()).unwrap();
 36 |     assert_eq!(database.get(&key1).unwrap().unwrap().get_value(), value1);
 37 |     assert!(database.get(&key2).unwrap().is_none());
 38 |
 39 |     // Writing the same key again replaces the value
 40 |     database.put(key1.clone(), value2.clone()).unwrap();
 41 |     assert_eq!(database.get(&key1).unwrap().unwrap().get_value(), value2);
 42 | }
 43 | 
 44 | /// All entries come back from `iter()` in key order with their values.
 45 | #[test]
 46 | fn iterate() {
 47 |     const COUNT: u64 = 5_000;
 48 |
 49 |     let (_tmpdir, database) = test_init();
 50 |
 51 |     // Write without fsync to speed up tests
 52 |     let options = WriteOptions { sync: false };
 53 |
 54 |     for idx in 0..COUNT {
 55 |         let key = format!("key_{idx:05}").into_bytes();
 56 |         let value = format!("some_string_{idx}").into_bytes();
 57 |         database.put_opts(key, value, &options).unwrap();
 58 |     }
 59 |
 60 |     // `seen` doubles as the index of the entry that is expected next
 61 |     let mut seen = 0;
 62 |
 63 |     for (key, val) in database.iter() {
 64 |         assert_eq!(format!("key_{seen:05}").into_bytes(), key);
 65 |         assert_eq!(format!("some_string_{seen}").into_bytes(), val.get_value());
 66 |
 67 |         seen += 1;
 68 |     }
 69 |
 70 |     // Every written entry must have been visited
 71 |     assert_eq!(seen, COUNT);
 72 | }
 73 | 
 74 | /// `range_iter` yields the keys from `start` (inclusive) to `end` (exclusive), ascending.
 75 | #[test]
 76 | fn range_iterate() {
 77 |     const COUNT: u64 = 25_000;
 78 |
 79 |     let (_tmpdir, database) = test_init();
 80 |
 81 |     // Write without fsync to speed up tests
 82 |     let options = WriteOptions { sync: false };
 83 |
 84 |     for idx in 0..COUNT {
 85 |         let key = format!("key_{idx:05}").into_bytes();
 86 |         let value = format!("some_string_{idx}").into_bytes();
 87 |         database.put_opts(key, value, &options).unwrap();
 88 |     }
 89 |
 90 |     let start = b"key_00300".to_vec();
 91 |     let end = b"key_10150".to_vec();
 92 |
 93 |     // The first key in range is number 300, so pair the entries with 300, 301, ...
 94 |     let mut found = 0;
 95 |     for ((key, val), real_pos) in database.range_iter(&start, &end).zip(300..) {
 96 |         let expected_key = format!("key_{real_pos:05}").into_bytes();
 97 |         let expected_val = format!("some_string_{real_pos}").into_bytes();
 98 |
 99 |         assert_eq!(expected_key, key);
100 |         assert_eq!(expected_val, val.get_value());
101 |
102 |         found += 1;
103 |     }
104 |
105 |     assert_eq!(found, 9850);
106 |
107 |     database.stop().unwrap();
108 | }
109 | 
110 | /// `reverse_range_iter` yields the keys from `start` (inclusive) down to `end` (exclusive).
111 | #[test]
112 | fn range_iterate_reverse() {
113 |     const COUNT: u64 = 25_000;
114 |
115 |     let (_tmpdir, database) = test_init();
116 |
117 |     // Write without fsync to speed up tests
118 |     let options = WriteOptions { sync: false };
119 |
120 |     for idx in 0..COUNT {
121 |         let key = format!("key_{idx:05}").into_bytes();
122 |         let value = format!("some_string_{idx}").into_bytes();
123 |         database.put_opts(key, value, &options).unwrap();
124 |     }
125 |
126 |     let start = b"key_10150".to_vec();
127 |     let end = b"key_00300".to_vec();
128 |     let iter = database.reverse_range_iter(&start, &end);
129 |
130 |     // The first key is number 10150, so pair the entries with 10150, 10149, ...
131 |     let mut found = 0;
132 |     for ((key, val), real_pos) in iter.zip((0..=10150).rev()) {
133 |         let expected_key = format!("key_{real_pos:05}").into_bytes();
134 |         let expected_val = format!("some_string_{real_pos}").into_bytes();
135 |
136 |         assert_eq!(expected_key, key);
137 |         assert_eq!(expected_val, val.get_value());
138 |
139 |         found += 1;
140 |     }
141 |
142 |     // 10150 down to 301 inclusive
143 |     assert_eq!(found, 9850);
144 |
145 |     database.stop().unwrap();
146 | }
147 | 
148 | /// A range that lies entirely above the written keys must not yield any
149 | /// entry.
150 | #[test]
151 | fn range_iterate_empty() {
152 |     const COUNT: u64 = 5_000;
153 | 
154 |     let (_tmpdir, database) = test_init();
155 | 
156 |     // Skip the fsync on each put; it only slows the test down
157 |     let options = WriteOptions { sync: false };
158 | 
159 |     for idx in 0..COUNT {
160 |         let key = format!("key_{idx:05}").into_bytes();
161 |         let value = format!("some_string_{idx}").into_bytes();
162 |         database.put_opts(key, value, &options).unwrap();
163 |     }
164 | 
165 |     // Both bounds are past the last key written above (key_04999)
166 |     let start = b"key_05300".to_vec();
167 |     let end = b"key_10150".to_vec();
168 |     let mut iter = database.range_iter(&start, &end);
169 | 
170 |     assert!(iter.next().is_none(), "Found a key where there should be none");
171 | 
172 |     database.stop().unwrap();
173 | }
174 | 
175 | /// Writes a large number of entries and reads every one of them back.
176 | #[test]
177 | fn get_put_many() {
178 |     const COUNT: u64 = 100_000;
179 | 
180 |     let key_for = |idx: u64| format!("key_{idx}").into_bytes();
181 |     let value_for = |idx: u64| format!("some_string_{idx}").into_bytes();
182 | 
183 |     let (_tmpdir, database) = test_init();
184 | 
185 |     // Skip the fsync on each put; it only slows the test down
186 |     let options = WriteOptions { sync: false };
187 |     for idx in 0..COUNT {
188 |         database.put_opts(key_for(idx), value_for(idx), &options).unwrap();
189 |     }
190 | 
191 |     for idx in 0..COUNT {
192 |         let key = key_for(idx);
193 |         let stored = database.get(&key).unwrap().unwrap();
194 | 
195 |         assert_eq!(stored.get_value(), value_for(idx));
196 |     }
197 | }
198 | 
199 | /// Writes entries, deletes every one of them, then checks none can be read.
200 | #[test]
201 | fn get_put_delete_many() {
202 |     const COUNT: u64 = 10_000;
203 | 
204 |     let key_for = |idx: u64| format!("key_{idx}").into_bytes();
205 | 
206 |     let (_tmpdir, database) = test_init();
207 | 
208 |     // Skip the fsync on each put; it only slows the test down
209 |     let options = WriteOptions { sync: false };
210 |     for idx in 0..COUNT {
211 |         let value = format!("some_string_{idx}").into_bytes();
212 |         database.put_opts(key_for(idx), value, &options).unwrap();
213 |     }
214 | 
215 |     // Note: `delete` is called without explicit write options
216 |     for idx in 0..COUNT {
217 |         database.delete(key_for(idx)).unwrap();
218 |     }
219 | 
220 |     for idx in 0..COUNT {
221 |         let key = key_for(idx);
222 |         assert!(database.get(&key).unwrap().is_none());
223 |     }
224 | }
225 | 
226 | /// Writes every key twice and checks that reads return the second value.
227 | #[test]
228 | fn override_many() {
229 |     const COUNT: u64 = 100_000;
230 | 
231 |     let key_for = |idx: u64| format!("key_{idx}").into_bytes();
232 |     let replacement_for = |idx: u64| format!("some_other_string_{idx}").into_bytes();
233 | 
234 |     let (_tmpdir, database) = test_init();
235 | 
236 |     // Skip the fsync on each put; it only slows the test down
237 |     let options = WriteOptions { sync: false };
238 | 
239 |     // First generation of values
240 |     for idx in 0..COUNT {
241 |         let value = format!("some_string_{idx}").into_bytes();
242 |         database.put_opts(key_for(idx), value, &options).unwrap();
243 |     }
244 | 
245 |     // Second generation, written over the same keys
246 |     for idx in 0..COUNT {
247 |         database.put_opts(key_for(idx), replacement_for(idx), &options).unwrap();
248 |     }
249 | 
250 |     for idx in 0..COUNT {
251 |         let key = key_for(idx);
252 |         let stored = database.get(&key).unwrap().unwrap();
253 |         assert_eq!(stored.get_value(), replacement_for(idx));
254 |     }
255 | }
256 | 
257 | /// Overwrites a subset of the keys; the rest must keep their original values.
258 | #[test]
259 | fn override_subset() {
260 |     const NCOUNT: u64 = 100_000;
261 |     const COUNT: u64 = 25_000;
262 | 
263 |     let key_for = |idx: u64| format!("key_{idx}").into_bytes();
264 |     let original_for = |idx: u64| format!("some_string_{idx}").into_bytes();
265 |     let replacement_for = |idx: u64| format!("some_other_string_{idx}").into_bytes();
266 | 
267 |     let (_tmpdir, database) = test_init();
268 | 
269 |     // Skip the fsync on each put; it only slows the test down
270 |     let options = WriteOptions { sync: false };
271 |     for idx in 0..NCOUNT {
272 |         database.put_opts(key_for(idx), original_for(idx), &options).unwrap();
273 |     }
274 | 
275 |     for idx in 0..COUNT {
276 |         database.put_opts(key_for(idx), replacement_for(idx), &options).unwrap();
277 |     }
278 | 
279 |     // Overwritten prefix: keys 0..COUNT
280 |     for idx in 0..COUNT {
281 |         let key = key_for(idx);
282 |         let stored = database.get(&key).unwrap().unwrap();
283 |         assert_eq!(stored.get_value(), replacement_for(idx));
284 |     }
285 | 
286 |     // Untouched remainder: keys COUNT..NCOUNT
287 |     for idx in COUNT..NCOUNT {
288 |         let key = key_for(idx);
289 |         let stored = database.get(&key).unwrap().unwrap();
290 |         assert_eq!(stored.get_value(), original_for(idx));
291 |     }
292 | 
293 |     database.stop().unwrap();
294 | }
295 | 
296 | /// Submits many puts as one `WriteBatch` and checks each can be read back.
297 | #[test]
298 | fn batched_write() {
299 |     const COUNT: u64 = 1000;
300 | 
301 |     let key_for = |idx: u64| format!("key{idx}").into_bytes();
302 |     let value_for = |idx: u64| format!("value{idx}").into_bytes();
303 | 
304 |     let (_tmpdir, database) = test_init();
305 | 
306 |     // The batch is filled on its own and handed over in a single `write` call
307 |     let mut batch = WriteBatch::new();
308 |     for idx in 0..COUNT {
309 |         batch.put(key_for(idx), value_for(idx));
310 |     }
311 |     database.write(batch).unwrap();
312 | 
313 |     for idx in 0..COUNT {
314 |         let key = key_for(idx);
315 |         let expected = value_for(idx);
316 |         let entry = database.get(&key).unwrap();
317 | 
318 |         assert!(entry.is_some());
319 |         assert_eq!(entry.unwrap().get_value(), expected.as_slice());
320 |     }
321 | }
322 | 


--------------------------------------------------------------------------------
/sync/tests/reopen.rs:
--------------------------------------------------------------------------------
  1 | use lsm_sync::{Database, Params, StartMode, WriteOptions};
  2 | use tempfile::{Builder, TempDir};
  3 | 
  4 | /// Creates a fresh database inside a new temporary directory.
  5 | /// The `TempDir` and the `Params` are returned next to the database so that
  6 | /// callers can keep the directory around and reopen the same `db_path`.
  7 | fn test_init() -> (TempDir, Params, Database) {
  8 |     let mut builder = Builder::new();
  9 |     builder.prefix("lsm-sync-test-reopen-");
 10 |     let tmp_dir = builder.tempdir().unwrap();
 11 |     // Result ignored on purpose: every test calling this helper attempts it
 12 |     let _ = env_logger::builder().is_test(true).try_init();
 13 | 
 14 |     let params = Params {
 15 |         db_path: tmp_dir.path().join("storage.lsm"),
 16 |         ..Default::default()
 17 |     };
 18 |     let database = Database::new_with_params(StartMode::CreateOrOverride, params.clone())
 19 |         .expect("Failed to create database instance");
 20 | 
 21 |     (tmp_dir, params, database)
 22 | }
 23 | 
 24 | /// A value written before a close must be readable after reopening; this is
 25 | /// checked across two consecutive reopen cycles.
 26 | #[test]
 27 | fn get_put() {
 28 |     let (_tmpdir, params, database) = test_init();
 29 | 
 30 |     let key1 = b"Foo".to_vec();
 31 |     let value1 = b"Bar".to_vec();
 32 |     let value2 = b"Baz".to_vec();
 33 | 
 34 |     // The freshly created database must not know the key yet
 35 |     assert!(database.get(&key1).unwrap().is_none());
 36 | 
 37 |     database.put(key1.clone(), value1.clone()).unwrap();
 38 |     drop(database);
 39 | 
 40 |     // First reopen: the initial value has to be there
 41 |     let database = Database::new_with_params(StartMode::Open, params.clone())
 42 |         .expect("Failed to create database instance");
 43 | 
 44 |     assert_eq!(database.get(&key1).unwrap().unwrap().get_value(), value1);
 45 | 
 46 |     database.put(key1.clone(), value2.clone()).unwrap();
 47 |     drop(database);
 48 | 
 49 |     // Second reopen: the overwritten value has to be there
 50 |     let database = Database::new_with_params(StartMode::Open, params)
 51 |         .expect("Failed to create database instance");
 52 | 
 53 |     assert_eq!(database.get(&key1).unwrap().unwrap().get_value(), value2);
 54 | }
 55 | 
 56 | /// Many unsynced writes must all be readable after the database is reopened.
 57 | #[test]
 58 | fn get_put_many() {
 59 |     const COUNT: u64 = 100_000;
 60 | 
 61 |     let (_tmpdir, params, database) = test_init();
 62 | 
 63 |     // Write without fsync to speed up tests
 64 |     let options = WriteOptions { sync: false };
 65 | 
 66 |     for pos in 0..COUNT {
 67 |         let key = format!("key_{pos}").into_bytes();
 68 |         let value = format!("some_string_{pos}").into_bytes();
 69 |         database.put_opts(key, value, &options).unwrap();
 70 |     }
 71 | 
 72 |     // The puts above skipped fsync, so synchronize explicitly before closing,
 73 |     // as the other reopen tests that write with `sync: false` do
 74 |     database.synchronize().unwrap();
 75 |     drop(database);
 76 | 
 77 |     // Reopen
 78 |     let database = Database::new_with_params(StartMode::Open, params)
 79 |         .expect("Failed to create database instance");
 80 | 
 81 |     for pos in 0..COUNT {
 82 |         let key = format!("key_{pos}").into_bytes();
 83 |         let value = format!("some_string_{pos}").into_bytes();
 84 |         assert_eq!(database.get(&key).unwrap().unwrap().get_value(), value.as_slice());
 85 |     }
 86 | }
 87 | 
 88 | /// Large values must survive a reopen and be iterated completely, in key order.
 89 | #[test]
 90 | fn get_put_large() {
 91 |     const COUNT: usize = 100;
 92 |     const SIZE: usize = 100_000;
 93 | 
 94 |     let (_tmpdir, params, database) = test_init();
 95 | 
 96 |     // Write without fsync to speed up tests
 97 |     let options = WriteOptions { sync: false };
 98 | 
 99 |     for pos in 0..COUNT {
100 |         let key = format!("key_{pos:05}").into_bytes();
101 |         let value = format!("value_{pos}").repeat(SIZE).into_bytes();
102 |         database.put_opts(key, value, &options).unwrap();
103 |     }
104 | 
105 |     database.synchronize().unwrap();
106 |     drop(database);
107 | 
108 |     // Reopen
109 |     let database = Database::new_with_params(StartMode::Open, params)
110 |         .expect("Failed to create database instance");
111 | 
112 |     let mut pos = 0;
113 |     for (key, value) in database.iter() {
114 |         let expected_key = format!("key_{pos:05}").into_bytes();
115 |         let expected_value = format!("value_{pos}").repeat(SIZE).into_bytes();
116 |         assert_eq!(expected_key, key);
117 |         // Value is very long, so don't print
118 |         assert!(expected_value == value.get_value());
119 |         pos += 1;
120 |     }
121 | 
122 |     // Without this check an empty or truncated iteration would pass unnoticed
123 |     assert_eq!(pos, COUNT);
124 | }
125 | 


--------------------------------------------------------------------------------
/tests/reopen.rs:
--------------------------------------------------------------------------------
  1 | use lsm::{Database, Params, StartMode, WriteOptions};
  2 | use tempfile::{Builder, TempDir};
  3 | 
  4 | use futures::stream::StreamExt;
  5 | 
  6 | #[cfg(feature = "tokio-uring")]
  7 | use kioto_uring_executor::test as async_test;
  8 | 
  9 | #[cfg(feature = "monoio")]
 10 | use monoio::test as async_test;
 11 | 
 12 | #[cfg(not(feature = "_async-io"))]
 13 | use tokio::test as async_test;
 14 | 
 15 | /// Creates a fresh database inside a new temporary directory.
 16 | /// The `TempDir` and the `Params` are returned next to the database so that
 17 | /// callers can keep the directory around and reopen the same `db_path`.
 18 | async fn test_init() -> (TempDir, Params, Database) {
 19 |     let mut builder = Builder::new();
 20 |     builder.prefix("lsm-async-test-reopen-");
 21 |     let tmp_dir = builder.tempdir().unwrap();
 22 |     // Result ignored on purpose: every test calling this helper attempts it
 23 |     let _ = env_logger::builder().is_test(true).try_init();
 24 | 
 25 |     let params = Params {
 26 |         db_path: tmp_dir.path().join("storage.lsm"),
 27 |         ..Default::default()
 28 |     };
 29 |     let database = Database::new_with_params(StartMode::CreateOrOverride, params.clone())
 30 |         .await
 31 |         .expect("Failed to create database instance");
 32 | 
 33 |     (tmp_dir, params, database)
 34 | }
 35 | 
 36 | /// A value written before a close must be readable after reopening; this is
 37 | /// checked across two consecutive reopen cycles.
 38 | #[async_test]
 39 | async fn get_put() {
 40 |     let (_tmpdir, params, database) = test_init().await;
 41 | 
 42 |     let key1 = b"Foo".to_vec();
 43 |     let value1 = b"Bar".to_vec();
 44 |     let value2 = b"Baz".to_vec();
 45 | 
 46 |     // The freshly created database must not know the key yet
 47 |     assert!(database.get(&key1).await.unwrap().is_none());
 48 | 
 49 |     database.put(key1.clone(), value1.clone()).await.unwrap();
 50 |     drop(database);
 51 | 
 52 |     // First reopen: the initial value has to be there
 53 |     let database = Database::new_with_params(StartMode::Open, params.clone())
 54 |         .await
 55 |         .expect("Failed to create database instance");
 56 | 
 57 |     let found = database.get(&key1).await.unwrap();
 58 |     assert_eq!(found.unwrap().get_value(), value1);
 59 | 
 60 |     database.put(key1.clone(), value2.clone()).await.unwrap();
 61 | 
 62 |     drop(database);
 63 | 
 64 |     // Second reopen: the overwritten value has to be there
 65 |     let database = Database::new_with_params(StartMode::Open, params)
 66 |         .await
 67 |         .expect("Failed to create database instance");
 68 | 
 69 |     let found = database.get(&key1).await.unwrap();
 70 |     assert_eq!(found.unwrap().get_value(), value2);
 71 | }
 72 | 
 73 | /// Many unsynced writes must be readable, and fully iterable, after a reopen.
 74 | #[async_test]
 75 | async fn get_put_many() {
 76 |     const COUNT: u64 = 100_000;
 77 | 
 78 |     let (_tmpdir, params, database) = test_init().await;
 79 | 
 80 |     // Write without fsync to speed up tests
 81 |     let options = WriteOptions { sync: false };
 82 | 
 83 |     for pos in 0..COUNT {
 84 |         let key = format!("key_{pos:05}").into_bytes();
 85 |         let value = format!("some_string_{pos}").into_bytes();
 86 |         database.put_opts(key, value, &options).await.unwrap();
 87 |     }
 88 | 
 89 |     database.synchronize().await.unwrap();
 90 |     drop(database);
 91 | 
 92 |     // Reopen
 93 |     let database = Database::new_with_params(StartMode::Open, params)
 94 |         .await
 95 |         .expect("Failed to create database instance");
 96 | 
 97 |     for pos in 0..COUNT {
 98 |         let key = format!("key_{pos:05}").into_bytes();
 99 |         let value = format!("some_string_{pos}").into_bytes();
100 |         assert_eq!(database.get(&key).await.unwrap().unwrap().get_value(), value);
101 |     }
102 | 
103 |     // Ensure iteration still works
104 |     let mut iterator = database.iter().await;
105 |     let mut pos = 0;
106 |     while let Some((key, value)) = iterator.next().await {
107 |         assert_eq!(format!("key_{pos:05}").into_bytes(), key);
108 |         assert_eq!(format!("some_string_{pos}").into_bytes(), value.get_value());
109 |         pos += 1;
110 |     }
111 | 
112 |     // An iteration that ends early (or yields nothing) must fail the test
113 |     assert_eq!(pos, COUNT);
114 | }
115 | 
116 | /// Large values must survive a reopen and be iterated completely, in key order.
117 | #[async_test]
118 | async fn get_put_large() {
119 |     const COUNT: usize = 100;
120 |     const SIZE: usize = 100_000;
121 | 
122 |     let (_tmpdir, params, database) = test_init().await;
123 | 
124 |     // Write without fsync to speed up tests
125 |     let options = WriteOptions { sync: false };
126 | 
127 |     for pos in 0..COUNT {
128 |         let key = format!("key_{pos:05}").into_bytes();
129 |         let value = format!("value_{pos}").repeat(SIZE).into_bytes();
130 |         database.put_opts(key, value, &options).await.unwrap();
131 |     }
132 | 
133 |     database.synchronize().await.unwrap();
134 |     drop(database);
135 | 
136 |     // Reopen
137 |     let database = Database::new_with_params(StartMode::Open, params)
138 |         .await
139 |         .expect("Failed to create database instance");
140 | 
141 |     let mut iterator = database.iter().await;
142 |     let mut pos = 0;
143 |     while let Some((key, value)) = iterator.next().await {
144 |         let expected_key = format!("key_{pos:05}").into_bytes();
145 |         let expected_value = format!("value_{pos}").repeat(SIZE).into_bytes();
146 |         assert_eq!(expected_key, key);
147 |         // Value is very long, so don't print
148 |         assert!(expected_value == value.get_value());
149 |         pos += 1;
150 |     }
151 | 
152 |     // Without this check an empty or truncated iteration would pass unnoticed
153 |     assert_eq!(pos, COUNT);
154 | }
155 | 


--------------------------------------------------------------------------------