├── .cargo └── config.toml ├── .github └── workflows │ └── ci.yml ├── .gitignore ├── Cargo.lock ├── Cargo.toml ├── LICENSE ├── NEWS ├── README.md ├── benchmarks └── async.rs ├── bigtest ├── Cargo.toml └── src │ └── main.rs ├── justfile ├── rust-toolchain.toml ├── src ├── data_blocks │ ├── block.rs │ ├── builder.rs │ └── mod.rs ├── database.rs ├── disk.rs ├── index_blocks.rs ├── iterate.rs ├── level.rs ├── level_logger.rs ├── lib.rs ├── logic.rs ├── manifest.rs ├── memtable.rs ├── params.rs ├── sorted_table │ ├── builder.rs │ ├── iterator.rs │ ├── mod.rs │ └── tests.rs ├── tasks.rs ├── values │ ├── batch.rs │ ├── index.rs │ ├── mod.rs │ └── tests.rs ├── wal │ ├── mod.rs │ ├── reader.rs │ ├── tests.rs │ └── writer.rs └── write_batch.rs ├── sync ├── Cargo.toml ├── justfile ├── src │ ├── database.rs │ ├── iterate.rs │ └── lib.rs └── tests │ ├── basic.rs │ └── reopen.rs └── tests ├── basic.rs └── reopen.rs /.cargo/config.toml: -------------------------------------------------------------------------------- 1 | [net] 2 | git-fetch-with-cli = true 3 | -------------------------------------------------------------------------------- /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | name: CI 2 | 3 | on: 4 | push: 5 | branches: [ main ] 6 | pull_request: 7 | branches: [ main ] 8 | 9 | env: 10 | CARGO_TERM_COLOR: always 11 | 12 | jobs: 13 | test: 14 | runs-on: ubuntu-24.04 15 | steps: 16 | - uses: actions/checkout@v4 17 | - name: Install rustc and clippy nightly 18 | uses: dtolnay/rust-toolchain@stable 19 | with: 20 | toolchain: nightly-2025-05-26 21 | components: cargo, rustc, clippy, rustfmt 22 | - name: Install Just 23 | uses: taiki-e/install-action@v2 24 | with: 25 | tool: just 26 | - name: "Test: Async" 27 | run: just async-tests 28 | timeout-minutes: 10 29 | - name: "Test: Tokio-Uring" 30 | run: just tokio-uring-tests 31 | - name: "Test: Monoio" 32 | run: just monoio-tests 33 | - name: "Tests: Sync" 34 | run: 
just sync-tests 35 | timeout-minutes: 10 36 | - name: "Tests: No compression" 37 | run: just no-compression-tests 38 | timeout-minutes: 10 39 | - name: "Tests: Wisckey" 40 | run: just wisckey-tests 41 | - name: "Tests: Wisckey with no compression" 42 | run: just wisckey-no-compression-tests 43 | timeout-minutes: 10 44 | - name: "Tests: Sync Wisckey" 45 | run: just wisckey-sync-tests 46 | - name: "Test: Wisckey and Tokio-Uring" 47 | run: just tokio-uring-wisckey-tests 48 | - name: "Test: Wisckey and Monoio" 49 | run: just monoio-wisckey-tests 50 | timeout-minutes: 10 51 | big-test: 52 | runs-on: ubuntu-24.04 53 | steps: 54 | - uses: actions/checkout@v4 55 | - name: Install rustc and clippy nightly 56 | uses: dtolnay/rust-toolchain@stable 57 | with: 58 | toolchain: nightly 59 | components: cargo, rustc, clippy, rustfmt 60 | - name: Install Just 61 | uses: taiki-e/install-action@v2 62 | with: 63 | tool: just 64 | - name: Insert many entries 65 | run: just bigtest-many 66 | - name: Insert large entries 67 | run: just bigtest-large 68 | lint: 69 | runs-on: ubuntu-24.04 70 | steps: 71 | - uses: actions/checkout@v4 72 | - name: Install rustc and clippy nightly 73 | uses: dtolnay/rust-toolchain@stable 74 | with: 75 | toolchain: nightly 76 | components: cargo, rustc, clippy, rustfmt 77 | - name: Install Just 78 | uses: taiki-e/install-action@v2 79 | with: 80 | tool: just 81 | - name: "Lint Checks: Tokio (with sync FS)" 82 | run: just async-lint 83 | - name: "Lint Checks: Tokio-Uring" 84 | run: just tokio-uring-lint 85 | - name: "Lint Checks: Monoio" 86 | run: just monoio-lint 87 | - name: "Lint Checks: Synchronous API" 88 | run: just sync-lint 89 | - name: "Lint Checks: Wisckey" 90 | run: just wisckey-lint 91 | - name: "Lint Checks: Wisckey with no compression" 92 | run: just wisckey-no-compression-lint 93 | - name: "Lint Checks: Wisckey and Tokio-Uring" 94 | run: just tokio-uring-wisckey-lint 95 | - name: "Lint Checks: Wisckey and Monoio" 96 | run: just monoio-wisckey-lint 97 | - name: 
"Formatting Checks" 98 | run: just check-formatting 99 | - name: Check for unused dependencies 100 | run: | 101 | cargo install cargo-machete 102 | cargo machete 103 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | target 2 | *.lsm 3 | -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "lsm" 3 | version = "0.5.0-dev" 4 | authors = ["Kai Mast "] 5 | edition = "2024" 6 | repository = "https://github.com/kaimast/lsm-rs" 7 | description = "An implementation of log-structured merge trees in pure Rust" 8 | license = "MIT" 9 | readme = "README.md" 10 | keywords = ["storage", "database", "async"] 11 | rust-version = "1.88" 12 | 13 | [dependencies] 14 | lru = "0.14" 15 | parking_lot = "0.12" 16 | memmap2 = "0.9" 17 | byte-slice-cast = "1" 18 | zerocopy = { version="0.8", features=["derive"] } 19 | log = "0.4" 20 | futures = "0.3" 21 | snap = { version="1", optional=true } 22 | async-trait = "0.1" 23 | cfg-if = "1" 24 | tracing = { version="0.1", default-features=false, features=["attributes"] } 25 | csv = "1" 26 | tokio-condvar = { version="0.3", features=["parking_lot"] } 27 | tokio-uring = { version="0.5", optional=true } 28 | bloomfilter = { version="3", optional=true } 29 | monoio = { version="0.2", optional=true, features=["sync"] } 30 | kioto-uring-executor = { version="0.3.0-dev", optional=true, default-features=false, features=["macros", "tokio-uring"] } 31 | bitvec = { version="1", optional=true } 32 | 33 | [dependencies.tokio] 34 | version="1" 35 | default-features=false 36 | features=["io-util", "sync", "macros", "tracing", "time"] 37 | 38 | [dev-dependencies] 39 | clap = { version="4", features=["derive"] } 40 | env_logger = "0.11" 41 | tempfile = "3" 42 | tracing-tracy = "0.11" 43 | 
tokio = { version="1", default-features=false, features=["rt-multi-thread"] } 44 | tracing-subscriber = { version="0.3", default-features=false } 45 | rand = "0.9" 46 | 47 | [lib] 48 | path = "src/lib.rs" 49 | 50 | [features] 51 | default = ["snappy-compression", "bloom-filters"] 52 | monoio = ["dep:monoio", "_async-io"] 53 | snappy-compression = ["dep:snap"] 54 | wisckey = ["dep:bitvec"] 55 | bloom-filters = ["dep:bloomfilter"] 56 | tokio-uring = ["dep:tokio-uring", "dep:kioto-uring-executor", "_async-io"] 57 | _async-io = [] 58 | 59 | [[test]] 60 | name = "basic" 61 | path = "tests/basic.rs" 62 | 63 | [[test]] 64 | name = "reopen" 65 | path = "tests/reopen.rs" 66 | 67 | [[example]] 68 | name = "lsm-benchmark" 69 | path = "benchmarks/async.rs" 70 | 71 | [workspace] 72 | members = ["sync", "bigtest"] 73 | 74 | [patch.crates-io] 75 | kioto-uring-executor = { git = "https://github.com/kaimast/kioto-uring-executor" } 76 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 Kai Mast 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /NEWS: -------------------------------------------------------------------------------- 1 | UNRELEASED (0.5): 2 | - Bump MSRV to 1.88 3 | - More extensive testing 4 | - Add support for monoio 5 | - Codebase moved from Rust 2021 to 2024 6 | - Updated bloomfilter to 3 7 | 8 | 0.4.1: 9 | - Fix an error during compaction 10 | - Ensure everything has been written to the write-ahead log before shutting down 11 | 12 | 0.4.0: 13 | - Move sync API into a separate lsm-sync crate 14 | - Removed KvTrait. The crate now only accepts and returns bytes 15 | - Get operations now return a reference to the data without copying 16 | - Leverage zerocopy wherever possible to reduce serialization cost 17 | - Update tokio-uring and kioto-uring-executor dependencies 18 | 19 | 0.3.0: 20 | - Write-Ahead logging moved to a dedicated thread (or async task) 21 | - Support for io_uring 22 | - Allow iterating in reverse order 23 | - Add bloom filter support 24 | - Various performance improvements 25 | - Use tokio-condvar in more places 26 | 27 | 0.2.0: 28 | - Support for tracing to benchmark the library 29 | - Removed custom Condition Variable implementation 30 | - Databases can be reopened 31 | - WiscKey now uses a more efficient "folding"-based implementation 32 | - Allow disabling key-value separation using the "wisckey" feature flag 33 | - Implemented proper garbage collection for the value log 34 | - The Write-Ahead-Log is properly truncated after writes are flushed to L0 35 | 36 | 0.1.0: 37 | - Basic key-value store functionality 38 | -------------------------------------------------------------------------------- /README.md: 
-------------------------------------------------------------------------------- 1 | # Modular, Asynchronous Implementation of a Log-Structured Merge Tree 2 | 3 | [![ci-badge](https://github.com/kaimast/lsm-rs/actions/workflows/ci.yml/badge.svg)](https://github.com/kaimast/lsm-rs/actions) 4 | [![license-badge](https://img.shields.io/crates/l/lsm)](https://github.com/kaimast/lsm-rs/blob/main/LICENSE) 5 | [![crates-badge](https://img.shields.io/crates/v/lsm)](https://crates.io/crates/lsm) 6 | 7 | **Note: While this implementation is used by us and has not caused major problems, we do not yet recommend it for production environments.** 8 | Please use the [leveldb](https://github.com/skade/leveldb) or [rocksdb](https://github.com/rust-rocksdb/rust-rocksdb) crate for this purpose. 9 | 10 | This implementation does *not* aim to reimplement LevelDB. The major differences are: 11 | * *Separation of keys and values*: Values can be stored separately to increase compaction speed as outlined in the [WiscKey](https://www.usenix.org/system/files/conference/fast16/fast16-papers-lu.pdf) paper 12 | * *Concurrent compaction*: Multiple threads can compact at the same time for higher write throughput 13 | * *Async-support*: All API calls are exposed as async functions 14 | * *io_uring-support*: For async file system access on Linux. Optional and still considered experimental. 15 | * *Bloom filters* for faster lookups 16 | 17 | ## Supported Platforms and Architectures 18 | Currently, the code is only tested with Linux on x86 machines, but it should run on most systems supported by the Rust compiler. 19 | 20 | ## On-Disk Format 21 | LSM stores data using [zerocopy](https://github.com/google/zerocopy) to achieve high performance. 22 | The implementation does not account for endianness so on-disk formats are not portable. 23 | Replication across machines should be handled at a different layer of the system. 
24 | However, we may add a converter tool in the future or an `endianness` feature flag if needed. 25 | 26 | ## Planned Features 27 | * FLSM: Like [PebblesDB](https://github.com/utsaslab/pebblesdb), LSM-rs will fragment the keyspace to reduce write amplification and increase compaction speed 28 | * Custom sorting functions 29 | * More modularity and configuration options 30 | 31 | ## Feature Flags 32 | * `snappy-compression`: Use the [snappy format](https://docs.rs/snap/1.0.5/snap/) to compress data on disk *(enabled by default)* 33 | * `bloom-filters`: Add bloom filters to data blocks for more efficient searching. *(enabled by default)* 34 | * `monoio`: Use `monoio` as async runtime I/O instead of `tokio`. Note, this will not spawn any additional OS threads. *(disabled by default)* 35 | * `tokio-uring`: Use `tokio_uring` as async runtime (using `kioto-uring-executor`) instead of regular `tokio`. *(disabled by default)* 36 | * `wisckey`: Store keys and values separately. This usually results in higher throughput with slightly higher CPU-usage. *(disabled by default)* 37 | 38 | ## Synchronous API 39 | This crate exposes an async API intended to be used with Tokio or a similar runtime. 40 | Alternatively, you can use the lsm-sync crate included in this repo, which internally uses Tokio but exposes a synchronous API. 41 | 42 | ## Sort Order 43 | You need to serialize your data in a way that its byte representation maintains the same ordering as the unserialized data. 44 | For example, you may want to use [big endian](https://en.wikipedia.org/wiki/Endianness) encoding so that numerical values are ordered correctly. 45 | 46 | ## Usage 47 | 48 | You can create or open a new database instance as shown below. 
49 | ```rust 50 | use lsm::{Database, Params}; 51 | 52 | // Set options here, such as the location of the database files 53 | let params = Params { 54 | db_path, 55 | ..Default::default() 56 | }; 57 | 58 | // Instantiate database 59 | let database = Database::new_with_params(SM, params) 60 | .await 61 | .expect("Failed to create database instance"); 62 | ``` 63 | 64 | To write to the database use the `put` call. Note that the crate only supports 65 | writing byte vectors. (De-)serialization is supposed to happen at another layer. 66 | ```rust 67 | let key = String::from("mykey").into_bytes(); 68 | let value = String::from("hello world").into_bytes(); 69 | 70 | database.put(key, value).await.expect("Writing to database failed"); 71 | ``` 72 | 73 | When reading, LSM will return a reference to the data to avoid copying. 74 | ```rust 75 | let value_ref = database.get(&key).await.expect("Reading failed").expect("Key not found"); 76 | 77 | // Returns a slice to the data 78 | let data: &[u8] = value_ref.get_value(); 79 | 80 | // Assuming the put from above worked, this will print "hello world" 81 | println!("{}", std::str::from_utf8(data).unwrap()); 82 | ``` 83 | 84 | Please refer to the tests for more examples of how to use the crate. 85 | 86 | ## Tests 87 | This library ships with several tests. We provide a [justfile](https://github.com/casey/just) for convenience: 88 | 89 | ```sh 90 | just tests #runs all tests for all configurations 91 | just lint #runs cargo clippy 92 | ``` 93 | 94 | ## Notes on io-uring 95 | Currently, the io-uring feature relies on [kioto-uring-executor](https://github.com/kaimast/kioto-uring-executor), a simplistic multi-threaded wrapper around `tokio-uring`. 96 | Eventually `tokio-uring` will [support multiple threads natively](https://github.com/tokio-rs/tokio-uring/issues/258) and this workaround will be removed. 
97 | 98 | I would also like to add support for more mature io_uring runtimes such as [glommio](https://github.com/DataDog/glommio) but only have limited time to work on this crate. Help is very welcome. 99 | 100 | ## Similar Crates 101 | This is an incomplete list of crates that provide similar functionality. Please reach out if you know of others to add. 102 | 103 | ### LSM trees 104 | * [rust-rocksdb](https://github.com/rust-rocksdb/rust-rocksdb): Rust bindings for RocksDB 105 | * [leveldb](https://github.com/skade/leveldb): Rust bindings for LevelDB 106 | * [wickdb](https://github.com/Fullstop000/wickdb): Rust re-implementation of vanilla LevelDB 107 | * [agatedb](https://github.com/tikv/agatedb): A WiscKey implementation in Rust for TiKV 108 | 109 | ### Other Key-Value Stores 110 | These differ significantly in their approach but also provide a key-value store abstraction. 111 | * [redb](https://github.com/cberner/redb) 112 | 113 | -------------------------------------------------------------------------------- /benchmarks/async.rs: -------------------------------------------------------------------------------- 1 | use clap::Parser; 2 | 3 | use tempfile::{Builder, TempDir}; 4 | 5 | use tracing_subscriber::prelude::*; 6 | use tracing_tracy::TracyLayer; 7 | 8 | use lsm::{Database, Params, StartMode, WriteOptions}; 9 | 10 | #[derive(Parser)] 11 | #[clap(author, version, about, long_about = None)] 12 | struct Args { 13 | #[clap(long)] 14 | enable_tracing: bool, 15 | #[clap(long)] 16 | log_level_stats: Option, 17 | #[clap(long, default_value = "100000")] 18 | num_entries: usize, 19 | } 20 | 21 | async fn bench_init(args: &Args) -> (TempDir, Database) { 22 | if args.enable_tracing { 23 | tracing_subscriber::registry() 24 | .with(TracyLayer::default()) 25 | .init(); 26 | } 27 | 28 | let _ = env_logger::builder().is_test(true).try_init(); 29 | let tmp_dir = Builder::new() 30 | .prefix("lsm-async-benchmark-") 31 | .tempdir() 32 | .unwrap(); 33 | 34 | let mut db_path = 
tmp_dir.path().to_path_buf(); 35 | db_path.push("storage.lsm"); 36 | 37 | let params = Params { 38 | db_path, 39 | log_level_stats: args.log_level_stats.clone(), 40 | ..Default::default() 41 | }; 42 | const SM: StartMode = StartMode::CreateOrOverride; 43 | 44 | let database = Database::new_with_params(SM, params) 45 | .await 46 | .expect("Failed to create database instance"); 47 | 48 | (tmp_dir, database) 49 | } 50 | 51 | #[cfg_attr(feature = "tokio-uring", kioto_uring_executor::main)] 52 | #[cfg_attr(feature = "monoio", monoio::main)] 53 | #[cfg_attr(not(feature = "_async-io"), tokio::main)] 54 | async fn main() { 55 | let args = Args::parse(); 56 | 57 | let (_tmpdir, database) = bench_init(&args).await; 58 | 59 | log::info!("Starting read/write benchmark"); 60 | 61 | let options = WriteOptions { sync: false }; 62 | 63 | log::debug!("Writing {} entries", args.num_entries); 64 | 65 | for pos in 0..args.num_entries { 66 | let key = format!("{pos}").into_bytes(); 67 | let value = format!("some_string_{pos}").into_bytes(); 68 | database.put_opts(key, value, &options).await.unwrap(); 69 | } 70 | 71 | log::debug!("Reading {} entries", args.num_entries); 72 | 73 | for pos in 0..args.num_entries { 74 | let key = format!("{pos}").into_bytes(); 75 | let expected = format!("some_string_{pos}").into_bytes(); 76 | 77 | assert_eq!( 78 | database.get(&key).await.unwrap().unwrap().get_value(), 79 | expected, 80 | ); 81 | } 82 | 83 | database.stop().await.unwrap(); 84 | log::info!("Done"); 85 | } 86 | -------------------------------------------------------------------------------- /bigtest/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "lsm-bigtest" 3 | version = "0.5.0-dev" 4 | edition = "2024" 5 | authors = ["Kai Mast "] 6 | license = "MIT" 7 | description = "Runs a longer test with lots of data" 8 | readme = "../README.md" 9 | 10 | [dependencies] 11 | kioto-uring-executor = { version="0.3.0-dev", 
default-features=false, features=["macros"] } 12 | clap = { version="4", features=["derive"] } 13 | env_logger = "0.11" 14 | tempfile = "3" 15 | rand = "0.8" 16 | 17 | [dependencies.lsm] 18 | path = ".." 19 | features = ["tokio-uring"] 20 | -------------------------------------------------------------------------------- /bigtest/src/main.rs: -------------------------------------------------------------------------------- 1 | use std::sync::Arc; 2 | 3 | use clap::Parser; 4 | use rand::Rng; 5 | 6 | use lsm::{Database, Params, StartMode}; 7 | 8 | #[derive(Parser)] 9 | struct Args { 10 | #[clap(long, short = 'n', default_value_t = 100_000)] 11 | #[clap(help = "The number of insertions per thread")] 12 | num_insertions: usize, 13 | 14 | #[clap(long, short = 't', default_value_t = 10)] 15 | num_threads: usize, 16 | 17 | #[clap(long, default_value_t = 1_000_000)] 18 | key_range: usize, 19 | 20 | #[clap(long, default_value_t = 1024)] 21 | entry_size: usize, 22 | 23 | #[clap(long, default_value = "/tmp")] 24 | #[clap( 25 | help = "Where to create the temporary working directory? Note, this is the parent directoy of the directory not the directoy itself. 
26 | It is recommended to use a tmpfs to not wear out a physical disk" 27 | )] 28 | workdir_location: String, 29 | } 30 | 31 | #[kioto_uring_executor::main] 32 | async fn main() { 33 | env_logger::init(); 34 | 35 | let args = Args::parse(); 36 | 37 | if args.num_insertions == 0 { 38 | panic!("Need to insert at least one entry"); 39 | } 40 | 41 | if args.key_range == 0 { 42 | panic!("Key range cannot be zero"); 43 | } 44 | 45 | println!("Creating working directory and empty database"); 46 | let tmp_dir = tempfile::Builder::new() 47 | .prefix("lsm-bigest-") 48 | .tempdir_in(args.workdir_location) 49 | .expect("Failed to create working directory"); 50 | 51 | let mut db_path = tmp_dir.path().to_path_buf(); 52 | db_path.push("storage.lsm"); 53 | 54 | let params = Params { 55 | db_path, 56 | ..Default::default() 57 | }; 58 | 59 | let database = Arc::new( 60 | Database::new_with_params(StartMode::CreateOrOverride, params) 61 | .await 62 | .expect("Failed to create database instance"), 63 | ); 64 | 65 | println!( 66 | "Inserting a total of {} entries of size {} across {} threads", 67 | args.num_insertions * args.num_threads, 68 | args.entry_size, 69 | args.num_threads 70 | ); 71 | 72 | let tasks: Vec<_> = (1..=args.num_threads) 73 | .map(|idx| { 74 | let database = database.clone(); 75 | kioto_uring_executor::spawn_with(move || { 76 | let mut rng = rand::thread_rng(); 77 | Box::pin(async move { 78 | for count in 1..=args.num_insertions { 79 | let key_idx = rng.gen_range(0..args.key_range); 80 | let key = format!("key{key_idx}").as_bytes().to_vec(); 81 | 82 | let mut value = vec![0; args.entry_size]; 83 | rng.fill(value.as_mut_slice()); 84 | 85 | database.put(key, value).await.expect("Insert failed"); 86 | 87 | if count % 10_000 == 0 { 88 | println!( 89 | "Thread #{idx} inserted {count} entries so far ({}%)", 90 | (count as f64) * 100.0 / (args.num_insertions as f64) 91 | ); 92 | } 93 | } 94 | println!("Thread #{idx} is done"); 95 | }) 96 | }) 97 | }) 98 | .collect(); 99 | 
100 | for task in tasks { 101 | task.join().await; 102 | } 103 | } 104 | -------------------------------------------------------------------------------- /justfile: -------------------------------------------------------------------------------- 1 | LOG_LEVEL := "debug" 2 | 3 | # Common prefix for lints 4 | CLIPPY := "cargo clippy --no-default-features --tests" 5 | 6 | all: tests lint 7 | 8 | tests: sync-tests async-tests no-compression-tests \ 9 | tokio-uring-tests wisckey-tests \ 10 | wisckey-no-compression-tests wisckey-sync-tests \ 11 | monoio-tests monoio-wisckey-tests 12 | 13 | sync-tests: 14 | cd sync && just default-tests 15 | 16 | async-tests: 17 | env RUST_BACKTRACE=1 RUST_LOG={{LOG_LEVEL}} cargo test --no-default-features 18 | 19 | tokio-uring-tests: 20 | env RUST_BACKTRACE=1 RUST_LOG={{LOG_LEVEL}} cargo test --no-default-features --features=tokio-uring,bloom-filters -- --test-threads=1 21 | 22 | monoio-tests: 23 | env RUST_BACKTRACE=1 RUST_LOG={{LOG_LEVEL}} cargo test --no-default-features --features=monoio,bloom-filters -- --test-threads=1 24 | 25 | monoio-wisckey-tests: 26 | env RUST_BACKTRACE=1 RUST_LOG={{LOG_LEVEL}} cargo test --no-default-features --features=monoio,wisckey,bloom-filters -- --test-threads=1 27 | 28 | tokio-uring-wisckey-tests: 29 | env RUST_BACKTRACE=1 RUST_LOG={{LOG_LEVEL}} cargo test --no-default-features --features=tokio-uring,wisckey,bloom-filters -- --test-threads=1 30 | 31 | no-compression-tests: 32 | env RUST_BACKTRACE=1 RUST_LOG={{LOG_LEVEL}} cargo test --no-default-features 33 | 34 | wisckey-tests: 35 | env RUST_BACKTRACE=1 RUST_LOG={{LOG_LEVEL}} cargo test --no-default-features --features=snappy-compression,wisckey 36 | 37 | wisckey-no-compression-tests: 38 | env RUST_BACKTRACE=1 RUST_LOG={{LOG_LEVEL}} cargo test --no-default-features --features=wisckey 39 | 40 | wisckey-sync-tests: 41 | cd sync && just wisckey-tests 42 | 43 | lint: sync-lint async-lint wisckey-lint \ 44 | wisckey-no-compression-lint tokio-uring-lint \ 45 
| tokio-uring-wisckey-lint monoio-lint monoio-wisckey-lint \ 46 | bigtest-lint 47 | 48 | fix-formatting: 49 | cargo fmt 50 | cd sync && just fix-formatting 51 | cd bigtest && cargo fmt 52 | 53 | check-formatting: 54 | cargo fmt --check 55 | cd sync && just check-formatting 56 | 57 | clean: 58 | rm -rf target/ 59 | 60 | update-dependencies: 61 | cargo update 62 | cd sync && cargo update 63 | 64 | udeps: 65 | cargo udeps --all-targets --release 66 | cd sync && just udeps 67 | 68 | sync-lint: 69 | cd sync && just lint 70 | 71 | async-lint: 72 | {{CLIPPY}} -- -D warnings 73 | 74 | tokio-uring-lint: 75 | {{CLIPPY}} --features=tokio-uring,bloom-filters -- -D warnings 76 | 77 | monoio-lint: 78 | {{CLIPPY}} --features=monoio,bloom-filters -- -D warnings 79 | 80 | monoio-wisckey-lint: 81 | {{CLIPPY}} --features=monoio,wisckey,bloom-filters -- -D warnings 82 | 83 | wisckey-lint: 84 | {{CLIPPY}} --features=snappy-compression,wisckey -- -D warnings 85 | 86 | wisckey-no-compression-lint: 87 | {{CLIPPY}} --features=wisckey -- -D warnings 88 | 89 | tokio-uring-wisckey-lint: 90 | {{CLIPPY}} --features=tokio-uring,snappy-compression,wisckey -- -D warnings 91 | 92 | bigtest-lint: 93 | {{CLIPPY}} --package=lsm-bigtest 94 | 95 | bigtest-many: 96 | cargo run --release --package=lsm-bigtest -- -n100000 --entry-size=1024 97 | 98 | bigtest-large: 99 | cargo run --release --package=lsm-bigtest -- -n100 --entry-size=100000 100 | 101 | -------------------------------------------------------------------------------- /rust-toolchain.toml: -------------------------------------------------------------------------------- 1 | [toolchain] 2 | channel="nightly-2025-05-26" 3 | components=["cargo", "rustc", "clippy", "rustfmt"] 4 | -------------------------------------------------------------------------------- /src/data_blocks/block.rs: -------------------------------------------------------------------------------- 1 | use std::cmp::Ordering; 2 | use std::sync::Arc; 3 | 4 | use crate::Key; 5 | 6 | 
use super::{DataEntry, SearchResult}; 7 | 8 | use zerocopy::{FromBytes, Immutable, IntoBytes, KnownLayout}; 9 | 10 | #[cfg(feature = "bloom-filters")] 11 | use bloomfilter::Bloom; 12 | 13 | #[cfg(feature = "wisckey")] 14 | use crate::values::{ValueBatchId, ValueOffset}; 15 | 16 | #[cfg(feature = "bloom-filters")] 17 | //TODO change the size of this depending on max_key_block_length 18 | pub(super) const BLOOM_LENGTH: usize = 1024; 19 | 20 | #[cfg(feature = "bloom-filters")] 21 | pub(super) const BLOOM_ITEM_COUNT: usize = 1024; 22 | 23 | #[cfg(feature = "bloom-filters")] 24 | /// Taken from https://github.com/jedisct1/rust-bloom-filter/blob/6b93b922be474998514b696dc84333d6c04ed991/src/bitmap.rs#L5 25 | pub(super) const BLOOM_HEADER_SIZE: usize = 1 + 8 + 4 + 32; 26 | 27 | /** 28 | * Layout of a data block on disk 29 | * 30 | * 1. 4 bytes marking where the restart list starts 31 | * 2. 4 bytes indicating the number of entries in this block 32 | * 3. 1024+45 bytes (BLOOM_LENGTH + BLOOM_HEADER_SIZE) for the bloom filter (if enabled) 33 | * 4. Sequence of variable-length entries 34 | * 5. 
Variable length restart list (each entry is 4 bytes; so we don't need length information) 35 | */ 36 | #[derive(IntoBytes, Immutable, KnownLayout, FromBytes)] 37 | #[repr(C, packed)] 38 | pub(super) struct DataBlockHeader { 39 | pub(super) restart_list_start: u32, 40 | pub(super) number_of_entries: u32, 41 | #[cfg(feature = "bloom-filters")] 42 | pub(super) bloom_filter: [u8; BLOOM_LENGTH + BLOOM_HEADER_SIZE], 43 | } 44 | 45 | /** 46 | * For WiscKey an entry contains: 47 | * 48 | * Header: 49 | * - Key prefix len (4 bytes) 50 | * - Key suffix len (4 bytes) 51 | * - Entry type (1 byte) 52 | * - Sequence number (8 bytes) 53 | * - Value reference (batch id and offset) 54 | * 55 | * Content (not part of the header): 56 | * - Variable length key suffix 57 | * 58 | * When not using WiscKey an entry is variable length and contains the following 59 | * 60 | * Header: 61 | * - Key prefix len (4 bytes) 62 | * - Key suffix len (4 bytes) 63 | * - Entry type (1 byte) 64 | * - Sequence number (8 bytes) 65 | * - Value length (8 bytes) 66 | * 67 | * Content (not part of the header): 68 | * - Variable length key suffix 69 | * - Variable length value 70 | */ 71 | #[derive(IntoBytes, Immutable, FromBytes, KnownLayout)] 72 | #[repr(C, packed)] 73 | pub(super) struct EntryHeader { 74 | pub(super) prefix_len: u32, 75 | pub(super) suffix_len: u32, 76 | pub(super) entry_type: u8, 77 | pub(super) seq_number: u64, 78 | #[cfg(feature = "wisckey")] 79 | pub(super) value_batch: ValueBatchId, 80 | #[cfg(feature = "wisckey")] 81 | pub(super) value_offset: ValueOffset, 82 | #[cfg(not(feature = "wisckey"))] 83 | pub(super) value_length: u64, 84 | } 85 | 86 | //TODO support data block layouts without prefixed keys 87 | pub struct DataBlock { 88 | pub(super) restart_list_start: usize, 89 | pub(super) num_entries: u32, 90 | pub(super) restart_interval: u32, 91 | pub(super) data: Vec, 92 | #[cfg(feature = "bloom-filters")] 93 | pub(super) bloom_filter: Bloom<[u8]>, 94 | } 95 | 96 | impl DataBlock { 97 | pub 
fn new_from_data(data: Vec, restart_interval: u32) -> Self { 98 | assert!(!data.is_empty(), "No data?"); 99 | 100 | let header = DataBlockHeader::ref_from_bytes(&data[..Self::header_length()]).unwrap(); 101 | 102 | #[cfg(feature = "bloom-filters")] 103 | let bloom_filter = Bloom::from_bytes(header.bloom_filter.as_slice().to_vec()) 104 | .expect("Failed to load bloom filter"); 105 | 106 | log::trace!("Created new data block from existing data"); 107 | 108 | Self { 109 | num_entries: header.number_of_entries, 110 | restart_list_start: header.restart_list_start as usize, 111 | data, 112 | restart_interval, 113 | #[cfg(feature = "bloom-filters")] 114 | bloom_filter, 115 | } 116 | } 117 | 118 | fn header_length() -> usize { 119 | std::mem::size_of::() 120 | } 121 | 122 | /// Get the key and entry at the specified offset in bytes (must be valid!) 123 | /// The third entry in this result is the new offset after the entry 124 | #[tracing::instrument(skip(self_ptr, previous_key))] 125 | pub fn get_entry_at_offset( 126 | self_ptr: Arc, 127 | offset: u32, 128 | previous_key: &[u8], 129 | ) -> (Key, DataEntry) { 130 | let mut offset = (offset as usize) + Self::header_length(); 131 | 132 | let header_len = std::mem::size_of::(); 133 | 134 | if offset + header_len > self_ptr.restart_list_start { 135 | panic!("Invalid offset {offset}"); 136 | } 137 | 138 | let header = EntryHeader::ref_from_bytes(&self_ptr.data[offset..offset + header_len]) 139 | .expect("Failed to read entry header"); 140 | let entry_offset = offset; 141 | 142 | offset += std::mem::size_of::(); 143 | 144 | let kdata = [ 145 | &previous_key[..(header.prefix_len as usize)], 146 | &self_ptr.data[offset..offset + (header.suffix_len as usize)], 147 | ] 148 | .concat(); 149 | offset += header.suffix_len as usize; 150 | 151 | // Move offset to after the entry 152 | #[cfg(not(feature = "wisckey"))] 153 | { 154 | offset += header.value_length as usize; 155 | } 156 | 157 | let next_offset = offset - Self::header_length(); 
158 | 159 | let entry = DataEntry { 160 | block: self_ptr, 161 | offset: entry_offset, 162 | next_offset: next_offset as u32, 163 | }; 164 | 165 | (kdata, entry) 166 | } 167 | 168 | /// How many entries are in this data block? 169 | pub fn get_num_entries(&self) -> u32 { 170 | self.num_entries 171 | } 172 | 173 | /// Get the entry at the specified index 174 | /// (the index is in entries, not bytes) 175 | #[tracing::instrument(skip(self_ptr))] 176 | pub fn get_entry_at_index(self_ptr: &Arc, index: u32) -> (Key, DataEntry) { 177 | // First, get the closest restart offset 178 | let restart_pos = index / self_ptr.restart_interval; 179 | 180 | let restart_offset = self_ptr.get_restart_offset(restart_pos); 181 | let (mut key, mut entry) = Self::get_entry_at_offset(self_ptr.clone(), restart_offset, &[]); 182 | 183 | let mut current_idx = restart_pos * self_ptr.restart_interval; 184 | 185 | while current_idx < index { 186 | (key, entry) = 187 | Self::get_entry_at_offset(self_ptr.clone(), entry.get_next_offset(), &key); 188 | current_idx += 1; 189 | } 190 | 191 | (key, entry) 192 | } 193 | 194 | /// Length of this block in bytes without the header and restart list 195 | pub fn byte_len(&self) -> u32 { 196 | // "Cut-off" the beginning and end 197 | let rl_len = self.data.len() - self.restart_list_start; 198 | (self.data.len() - Self::header_length() - rl_len) as u32 199 | } 200 | 201 | #[inline(always)] 202 | fn restart_list_len(&self) -> usize { 203 | let offset_len = std::mem::size_of::(); 204 | let rl_len = self.data.len() - self.restart_list_start; 205 | 206 | assert!(rl_len % offset_len == 0); 207 | rl_len / offset_len 208 | } 209 | 210 | /// Get the byte offset of a restart entry 211 | #[inline(always)] 212 | fn get_restart_offset(&self, pos: u32) -> u32 { 213 | let offset_len = std::mem::size_of::(); 214 | let pos = self.restart_list_start + (pos as usize) * offset_len; 215 | 216 | u32::read_from_bytes(&self.data[pos..pos + offset_len]).unwrap() 217 | -
Self::header_length() as u32 218 | } 219 | 220 | #[tracing::instrument(skip(self_ptr, key))] 221 | fn binary_search(self_ptr: &Arc, key: &[u8]) -> SearchResult { 222 | let rl_len = self_ptr.restart_list_len(); 223 | 224 | let mut start: u32 = 0; 225 | let mut end = (rl_len as u32) - 1; 226 | 227 | // binary search 228 | while end - start > 1 { 229 | let mid = start + (end - start) / 2; 230 | 231 | // We always perform the search at the restart positions for efficiency 232 | let offset = self_ptr.get_restart_offset(mid); 233 | let (this_key, entry) = Self::get_entry_at_offset(self_ptr.clone(), offset, &[]); 234 | 235 | match this_key.as_slice().cmp(key) { 236 | Ordering::Equal => { 237 | // Exact match 238 | return SearchResult::ExactMatch(entry); 239 | } 240 | Ordering::Less => { 241 | // continue with right half 242 | start = mid; 243 | } 244 | Ordering::Greater => { 245 | // continue with left half 246 | end = mid; 247 | } 248 | } 249 | } 250 | 251 | // There is no reset at the very end so we need to include 252 | // that part in the sequential search 253 | let end = if end + 1 == rl_len as u32 { 254 | self_ptr.byte_len() 255 | } else { 256 | self_ptr.get_restart_offset(end) 257 | }; 258 | 259 | SearchResult::Range(start, end) 260 | } 261 | 262 | /// Get the entry for the specified key 263 | /// Will return None if no such entry exists 264 | #[tracing::instrument(skip(self_ptr, key))] 265 | pub fn get_by_key(self_ptr: &Arc, key: &[u8]) -> Option { 266 | #[cfg(feature = "bloom-filters")] 267 | if !self_ptr.bloom_filter.check(key) { 268 | return None; 269 | } 270 | 271 | let (start, end) = match Self::binary_search(self_ptr, key) { 272 | SearchResult::ExactMatch(entry) => { 273 | return Some(entry); 274 | } 275 | SearchResult::Range(start, end) => (start, end), 276 | }; 277 | 278 | let mut pos = self_ptr.get_restart_offset(start); 279 | 280 | let mut last_key = vec![]; 281 | while pos < end { 282 | let (this_key, entry) = Self::get_entry_at_offset(self_ptr.clone(), 
pos, &last_key); 283 | 284 | if key == this_key { 285 | return Some(entry); 286 | } 287 | 288 | pos = entry.get_next_offset(); 289 | last_key = this_key; 290 | } 291 | 292 | // Not found 293 | None 294 | } 295 | } 296 | -------------------------------------------------------------------------------- /src/data_blocks/builder.rs: -------------------------------------------------------------------------------- 1 | use cfg_if::cfg_if; 2 | 3 | use std::sync::Arc; 4 | 5 | use crate::manifest::SeqNumber; 6 | use crate::{Error, disk}; 7 | 8 | use zerocopy::IntoBytes; 9 | 10 | use super::block::{DataBlockHeader, EntryHeader}; 11 | use super::{DataBlock, DataBlockId, DataBlocks, PrefixedKey}; 12 | 13 | #[cfg(feature = "bloom-filters")] 14 | use bloomfilter::Bloom; 15 | 16 | #[cfg(feature = "bloom-filters")] 17 | use super::block::{BLOOM_HEADER_SIZE, BLOOM_ITEM_COUNT, BLOOM_LENGTH}; 18 | 19 | #[cfg(feature = "wisckey")] 20 | use crate::data_blocks::ValueId; 21 | 22 | pub struct DataBlockBuilder { 23 | data_blocks: Arc, 24 | data: Vec, 25 | 26 | /// The position/index of the next entry 27 | /// This is also the current number of entries in this block builder 28 | position: u32, 29 | 30 | /// The restart list keeps track of when the keys are fully reset 31 | /// This enables using binary search in get() instead of seeking linearly 32 | restart_list: Vec, 33 | 34 | #[cfg(feature = "bloom-filters")] 35 | bloom_filter: Bloom<[u8]>, 36 | } 37 | 38 | impl DataBlockBuilder { 39 | #[tracing::instrument(skip(data_blocks))] 40 | pub(super) fn new(data_blocks: Arc) -> Self { 41 | // Reserve space for the header 42 | let data = vec![0u8; std::mem::size_of::()]; 43 | 44 | Self { 45 | data_blocks, 46 | data, 47 | position: 0, 48 | restart_list: vec![], 49 | #[cfg(feature = "bloom-filters")] 50 | bloom_filter: Bloom::new(BLOOM_LENGTH, BLOOM_ITEM_COUNT) 51 | .expect("Failed to create bloom filter"), 52 | } 53 | } 54 | 55 | pub fn add_entry( 56 | &mut self, 57 | mut key: PrefixedKey, 58 | 
full_key: &[u8], 59 | seq_number: SeqNumber, 60 | entry_type: u8, 61 | #[cfg(not(feature = "wisckey"))] entry_data: &[u8], 62 | #[cfg(feature = "wisckey")] value_ref: ValueId, 63 | ) { 64 | if self.position % self.data_blocks.params.block_restart_interval == 0 { 65 | assert!(key.prefix_len == 0); 66 | self.restart_list.push(self.data.len() as u32); 67 | } 68 | 69 | cfg_if! { 70 | if #[cfg(feature="bloom-filters")] { 71 | self.bloom_filter.set(full_key); 72 | } else { 73 | let _ = full_key; 74 | } 75 | } 76 | 77 | let header = EntryHeader { 78 | prefix_len: key.prefix_len, 79 | suffix_len: key.suffix.len() as u32, 80 | seq_number, 81 | entry_type, 82 | #[cfg(feature = "wisckey")] 83 | value_batch: value_ref.0, 84 | #[cfg(feature = "wisckey")] 85 | value_offset: value_ref.1, 86 | #[cfg(not(feature = "wisckey"))] 87 | value_length: entry_data.len() as u64, 88 | }; 89 | 90 | self.data.extend_from_slice(header.as_bytes()); 91 | 92 | self.data.append(&mut key.suffix); 93 | 94 | #[cfg(not(feature = "wisckey"))] 95 | self.data.extend_from_slice(entry_data); 96 | 97 | self.position += 1; 98 | } 99 | 100 | /// Finish building and return the identifier of the new data block 101 | /// 102 | /// This will return Ok(None) if the builder did not have any entries 103 | /// An error might be generated if we failed to write to disk 104 | #[tracing::instrument(skip(self))] 105 | pub async fn finish(mut self) -> Result, Error> { 106 | if self.position == 0 { 107 | return Ok(None); 108 | } 109 | 110 | let identifier = self 111 | .data_blocks 112 | .manifest 113 | .generate_next_data_block_id() 114 | .await; 115 | 116 | #[cfg(feature = "bloom-filters")] 117 | let bloom_filter: &[u8; BLOOM_LENGTH + BLOOM_HEADER_SIZE] = 118 | self.bloom_filter.as_slice().try_into().unwrap(); 119 | 120 | let header = DataBlockHeader { 121 | #[cfg(feature = "bloom-filters")] 122 | bloom_filter: *bloom_filter, 123 | number_of_entries: self.position, 124 | restart_list_start: self.data.len() as u32, 125 | }; 126 | 127 | // Write
header 128 | self.data[..std::mem::size_of::()].copy_from_slice(header.as_bytes()); 129 | 130 | // Write restart list 131 | for restart_offset in self.restart_list.drain(..) { 132 | self.data.extend_from_slice(restart_offset.as_bytes()); 133 | } 134 | 135 | let block = Arc::new(DataBlock { 136 | data: self.data, 137 | num_entries: header.number_of_entries, 138 | restart_interval: self.data_blocks.params.block_restart_interval, 139 | restart_list_start: header.restart_list_start as usize, 140 | #[cfg(feature = "bloom-filters")] 141 | bloom_filter: self.bloom_filter, 142 | }); 143 | let shard_id = DataBlocks::block_to_shard_id(identifier); 144 | 145 | // Store on disk before grabbing the lock 146 | let block_data = &block.data; 147 | let fpath = self.data_blocks.get_file_path(&identifier); 148 | 149 | disk::write(&fpath, block_data).await.map_err(|err| { 150 | Error::from_io_error(format!("Failed to write data block at `{fpath:?}`"), err) 151 | })?; 152 | 153 | self.data_blocks.block_caches[shard_id] 154 | .lock() 155 | .put(identifier, block); 156 | 157 | Ok(Some(identifier)) 158 | } 159 | 160 | /// How big is the block now? 
161 | pub fn current_size(&self) -> usize { 162 | self.data.len() 163 | } 164 | } 165 | -------------------------------------------------------------------------------- /src/data_blocks/mod.rs: -------------------------------------------------------------------------------- 1 | /// Data blocks hold the actual contents of a sorted table 2 | /// (In the case of WiscKey the content is only the key and the value reference) 3 | use std::num::NonZeroUsize; 4 | use std::sync::Arc; 5 | 6 | use parking_lot::Mutex; 7 | 8 | use lru::LruCache; 9 | 10 | use zerocopy::FromBytes; 11 | 12 | use crate::Params; 13 | use crate::manifest::Manifest; 14 | use crate::{WriteOp, disk}; 15 | 16 | mod builder; 17 | pub use builder::DataBlockBuilder; 18 | 19 | mod block; 20 | pub use block::DataBlock; 21 | 22 | use block::EntryHeader; 23 | 24 | #[cfg(feature = "wisckey")] 25 | use crate::values::ValueId; 26 | 27 | pub type DataBlockId = u64; 28 | 29 | /// The minimum valid data block identifier 30 | pub const MIN_DATA_BLOCK_ID: DataBlockId = 1; 31 | 32 | const NUM_SHARDS: NonZeroUsize = NonZeroUsize::new(64).unwrap(); 33 | 34 | #[derive(Debug)] 35 | pub struct PrefixedKey { 36 | prefix_len: u32, 37 | suffix: Vec, 38 | } 39 | 40 | impl PrefixedKey { 41 | pub fn new(prefix_len: usize, suffix: Vec) -> Self { 42 | Self { 43 | prefix_len: prefix_len as u32, 44 | suffix, 45 | } 46 | } 47 | } 48 | 49 | type BlockShard = LruCache>; 50 | 51 | pub enum DataEntryType { 52 | Put, 53 | Delete, 54 | } 55 | 56 | #[derive(Clone)] 57 | pub struct DataEntry { 58 | /// The block containing this entry 59 | block: Arc, 60 | 61 | /// The offset of this entry in the block's buffer 62 | offset: usize, 63 | 64 | /// The end of this entry 65 | next_offset: u32, 66 | } 67 | 68 | enum SearchResult { 69 | ExactMatch(DataEntry), 70 | Range(u32, u32), 71 | } 72 | 73 | impl DataEntry { 74 | fn get_header(&self) -> &EntryHeader { 75 | let header_len = std::mem::size_of::(); 76 | let header_data = &self.block.data[self.offset..self.offset +
header_len]; 77 | EntryHeader::ref_from_bytes(header_data).expect("Failed to read entry header") 78 | } 79 | 80 | pub fn get_sequence_number(&self) -> u64 { 81 | self.get_header().seq_number 82 | } 83 | 84 | /// The offset of the next entry 85 | pub fn get_next_offset(&self) -> u32 { 86 | self.next_offset 87 | } 88 | 89 | pub fn get_type(&self) -> DataEntryType { 90 | let header = self.get_header(); 91 | 92 | if header.entry_type == WriteOp::PUT_OP { 93 | DataEntryType::Put 94 | } else if header.entry_type == WriteOp::DELETE_OP { 95 | DataEntryType::Delete 96 | } else { 97 | panic!("Unknown data entry type"); 98 | } 99 | } 100 | 101 | #[cfg(not(feature = "wisckey"))] 102 | pub fn get_value(&self) -> Option<&[u8]> { 103 | let header = self.get_header(); 104 | let value_offset = 105 | self.offset + std::mem::size_of::() + (header.suffix_len as usize); 106 | 107 | if header.entry_type == WriteOp::PUT_OP { 108 | let end = value_offset + (header.value_length as usize); 109 | Some(&self.block.data[value_offset..end]) 110 | } else if header.entry_type == WriteOp::DELETE_OP { 111 | None 112 | } else { 113 | panic!("Unknown write op"); 114 | } 115 | } 116 | 117 | #[cfg(feature = "wisckey")] 118 | pub fn get_value_id(&self) -> Option { 119 | let header = self.get_header(); 120 | 121 | if header.entry_type == WriteOp::PUT_OP { 122 | Some((header.value_batch, header.value_offset)) 123 | } else if header.entry_type == WriteOp::DELETE_OP { 124 | None 125 | } else { 126 | panic!("Unknown write op"); 127 | } 128 | } 129 | } 130 | 131 | /// Keeps track of all in-memory data blocks 132 | pub struct DataBlocks { 133 | params: Arc, 134 | block_caches: Vec>, 135 | manifest: Arc, 136 | } 137 | 138 | impl DataBlocks { 139 | pub fn new(params: Arc, manifest: Arc) -> Self { 140 | let max_data_files = NonZeroUsize::new(params.max_open_files / 2) 141 | .expect("Max open files needs to be greater than 2"); 142 | 143 | let shard_size = NonZeroUsize::new(max_data_files.get() / NUM_SHARDS) 144 | 
.expect("Not enough open files to support the number of shards"); 145 | 146 | let mut block_caches = Vec::new(); 147 | for _ in 0..NUM_SHARDS.get() { 148 | block_caches.push(Mutex::new(BlockShard::new(shard_size))); 149 | } 150 | 151 | Self { 152 | params, 153 | block_caches, 154 | manifest, 155 | } 156 | } 157 | 158 | #[inline] 159 | fn block_to_shard_id(block_id: DataBlockId) -> usize { 160 | (block_id as usize) % NUM_SHARDS 161 | } 162 | 163 | /// The path where the block with the given id 164 | /// will be stored at. 165 | #[inline] 166 | fn get_file_path(&self, block_id: &DataBlockId) -> std::path::PathBuf { 167 | self.params.db_path.join(format!("key{block_id:08}.data")) 168 | } 169 | 170 | /// Start creation of a new block 171 | #[tracing::instrument(skip(self_ptr))] 172 | pub fn build_block(self_ptr: Arc) -> DataBlockBuilder { 173 | DataBlockBuilder::new(self_ptr) 174 | } 175 | 176 | /// Get a block by its id 177 | /// Will either return the block from cache or load it from disk 178 | #[tracing::instrument(skip(self))] 179 | pub async fn get_block(&self, id: &DataBlockId) -> Arc { 180 | let shard_id = Self::block_to_shard_id(*id); 181 | let cache = &self.block_caches[shard_id]; 182 | 183 | if let Some(block) = cache.lock().get(id) { 184 | return block.clone(); 185 | } 186 | 187 | // Do not hold the lock while loading from disk for better concurrency 188 | // Worst case this means we load the same block multiple times...
189 | let fpath = self.get_file_path(id); 190 | log::trace!("Loading data block from disk at {fpath:?}"); 191 | let data = disk::read(&fpath, 0).await.unwrap_or_else(|err| { 192 | panic!("Failed to load data block from disk at {fpath:?}: {err}") 193 | }); 194 | let block = Arc::new(DataBlock::new_from_data( 195 | data, 196 | self.params.block_restart_interval, 197 | )); 198 | 199 | cache.lock().put(*id, block.clone()); 200 | log::trace!("Stored new block in cache"); 201 | block 202 | } 203 | } 204 | 205 | #[cfg(test)] 206 | mod tests { 207 | use super::*; 208 | use tempfile::tempdir; 209 | 210 | #[cfg(feature = "tokio-uring")] 211 | use kioto_uring_executor::test as async_test; 212 | 213 | #[cfg(feature = "monoio")] 214 | use monoio::test as async_test; 215 | 216 | #[cfg(not(feature = "_async-io"))] 217 | use tokio::test as async_test; 218 | 219 | #[cfg(feature = "wisckey")] 220 | #[async_test] 221 | async fn store_and_load() { 222 | let dir = tempdir().unwrap(); 223 | let params = Arc::new(Params { 224 | db_path: dir.path().to_path_buf(), 225 | ..Default::default() 226 | }); 227 | 228 | let manifest = Arc::new(Manifest::new(params.clone()).await); 229 | 230 | let data_blocks = Arc::new(DataBlocks::new(params.clone(), manifest)); 231 | let mut builder = DataBlocks::build_block(data_blocks.clone()); 232 | 233 | let key1 = PrefixedKey { 234 | prefix_len: 0, 235 | suffix: vec![5], 236 | }; 237 | let seq1 = 14234524; 238 | let val1 = (4, 2); 239 | builder.add_entry(key1, &[5], seq1, WriteOp::PUT_OP, val1); 240 | 241 | let key2 = PrefixedKey { 242 | prefix_len: 1, 243 | suffix: vec![2], 244 | }; 245 | let seq2 = 424234; 246 | let val2 = (4, 5); 247 | builder.add_entry(key2, &[5, 2], seq2, WriteOp::PUT_OP, val2); 248 | 249 | let id = builder.finish().await.unwrap().unwrap(); 250 | let data_block1 = data_blocks.get_block(&id).await; 251 | let data_block2 = Arc::new(DataBlock::new_from_data( 252 | data_block1.data.clone(), 253 | params.block_restart_interval, 254 | )); 255 
| 256 | let prev_key = vec![]; 257 | let (key, entry) = DataBlock::get_entry_at_offset(data_block2.clone(), 0, &prev_key); 258 | 259 | assert_eq!(key, vec![5]); 260 | assert_eq!(entry.get_value_id(), Some(val1)); 261 | 262 | let (key, entry) = 263 | DataBlock::get_entry_at_offset(data_block2.clone(), entry.get_next_offset(), &key); 264 | 265 | assert_eq!(key, vec![5, 2]); 266 | assert_eq!(entry.get_value_id(), Some(val2)); 267 | assert_eq!(entry.get_next_offset(), data_block2.byte_len()); 268 | } 269 | 270 | #[cfg(not(feature = "wisckey"))] 271 | #[async_test] 272 | async fn store_and_load() { 273 | let dir = tempdir().unwrap(); 274 | let params = Arc::new(Params { 275 | db_path: dir.path().to_path_buf(), 276 | ..Default::default() 277 | }); 278 | 279 | let manifest = Arc::new(Manifest::new(params.clone()).await); 280 | 281 | let data_blocks = Arc::new(DataBlocks::new(params.clone(), manifest)); 282 | let mut builder = DataBlocks::build_block(data_blocks.clone()); 283 | 284 | let key1 = PrefixedKey { 285 | prefix_len: 0, 286 | suffix: vec![5], 287 | }; 288 | let seq1 = 14234524; 289 | let val1 = vec![4, 2]; 290 | builder.add_entry(key1, &[5u8], seq1, WriteOp::PUT_OP, &val1); 291 | 292 | let key2 = PrefixedKey { 293 | prefix_len: 1, 294 | suffix: vec![2], 295 | }; 296 | let seq2 = 424234; 297 | let val2 = vec![24, 50]; 298 | builder.add_entry(key2, &[5u8, 2u8], seq2, WriteOp::PUT_OP, &val2); 299 | 300 | let id = builder.finish().await.unwrap().unwrap(); 301 | let data_block1 = data_blocks.get_block(&id).await; 302 | let data_block2 = Arc::new(DataBlock::new_from_data( 303 | data_block1.data.clone(), 304 | params.block_restart_interval, 305 | )); 306 | 307 | let prev_key = vec![]; 308 | let (key, entry) = DataBlock::get_entry_at_offset(data_block2.clone(), 0, &prev_key); 309 | 310 | assert_eq!(key, vec![5]); 311 | assert_eq!(entry.get_value(), Some(&val1[..])); 312 | 313 | let (key, entry) = 314 | DataBlock::get_entry_at_offset(data_block2.clone(), 
entry.get_next_offset(), &key); 315 | 316 | assert_eq!(key, vec![5, 2]); 317 | assert_eq!(entry.get_value(), Some(&val2[..])); 318 | assert_eq!(entry.get_next_offset(), data_block2.byte_len()); 319 | } 320 | } 321 | -------------------------------------------------------------------------------- /src/database.rs: -------------------------------------------------------------------------------- 1 | use crate::iterate::DbIterator; 2 | use crate::logic::{DbLogic, EntryRef}; 3 | use crate::tasks::{TaskManager, TaskType}; 4 | use crate::{Error, Key, Params, StartMode, Value, WriteBatch, WriteOptions}; 5 | 6 | use std::sync::Arc; 7 | 8 | /// The main database structure 9 | /// This struct can be accessed concurrently and you should 10 | /// never instantiate it more than once for the same on-disk files 11 | pub struct Database { 12 | inner: Arc, 13 | tasks: Arc, 14 | } 15 | 16 | impl Database { 17 | /// Create a new database instance with default parameters 18 | pub async fn new(mode: StartMode) -> Result { 19 | let params = Params::default(); 20 | Self::new_with_params(mode, params).await 21 | } 22 | 23 | /// Create a new database instance with specific parameters 24 | pub async fn new_with_params(mode: StartMode, params: Params) -> Result { 25 | let compaction_concurrency = params.compaction_concurrency; 26 | 27 | let inner = Arc::new(DbLogic::new(mode, params).await?); 28 | let tasks = Arc::new(TaskManager::new(inner.clone(), compaction_concurrency).await); 29 | 30 | Ok(Self { inner, tasks }) 31 | } 32 | 33 | /// Will deserialize V from the raw data (avoids an additional data copy) 34 | #[tracing::instrument(skip(self, key))] 35 | pub async fn get(&self, key: &[u8]) -> Result, Error> { 36 | match self.inner.get(key).await { 37 | Ok((needs_compaction, data)) => { 38 | if needs_compaction { 39 | self.tasks.wake_up(&TaskType::LevelCompaction); 40 | } 41 | 42 | Ok(data) 43 | } 44 | Err(err) => Err(err), 45 | } 46 | } 47 | 48 | /// Delete an existing entry 49 | /// For 
efficiency, the datastore does not check whether the key actually existed 50 | /// Instead, it will just mark the most recent version (which could be the first one) as deleted 51 | #[tracing::instrument(skip(self, key))] 52 | pub async fn delete(&self, key: Key) -> Result<(), Error> { 53 | let mut batch = WriteBatch::new(); 54 | batch.delete(key); 55 | 56 | self.write_opts(batch, &WriteOptions::default()).await 57 | } 58 | 59 | /// Ensure all data is written to disk 60 | /// Only has an effect if there were previous writes with sync=false 61 | pub async fn synchronize(&self) -> Result<(), Error> { 62 | self.inner.synchronize().await 63 | } 64 | 65 | /// Delete an existing entry (with additional options) 66 | pub async fn delete_opts(&self, key: Key, opts: &WriteOptions) -> Result<(), Error> { 67 | let mut batch = WriteBatch::new(); 68 | batch.delete(key); 69 | self.write_opts(batch, opts).await 70 | } 71 | 72 | /// Insert or update a single entry 73 | pub async fn put(&self, key: Key, value: Value) -> Result<(), Error> { 74 | const OPTS: WriteOptions = WriteOptions::new(); 75 | self.put_opts(key, value, &OPTS).await 76 | } 77 | 78 | /// Insert or update a single entry (with additional options) 79 | #[tracing::instrument(skip(self))] 80 | pub async fn put_opts(&self, key: Key, value: Value, opts: &WriteOptions) -> Result<(), Error> { 81 | let mut batch = WriteBatch::new(); 82 | batch.put(key, value); 83 | self.write_opts(batch, opts).await 84 | } 85 | 86 | /// Iterate over all entries in the database 87 | pub async fn iter(&self) -> DbIterator { 88 | let (mem_iters, table_iters, min_key, max_key) = self.inner.prepare_iter(None, None).await; 89 | 90 | DbIterator::new( 91 | mem_iters, 92 | table_iters, 93 | min_key, 94 | max_key, 95 | false, 96 | #[cfg(feature = "wisckey")] 97 | self.inner.get_value_log(), 98 | ) 99 | } 100 | 101 | /// Like iter(), but will only include entries with keys in [min_key;max_key) 102 | pub async fn range_iter(&self, min_key: &[u8], 
max_key: &[u8]) -> DbIterator { 103 | let (mem_iters, table_iters, min_key, max_key) = 104 | self.inner.prepare_iter(Some(min_key), Some(max_key)).await; 105 | 106 | DbIterator::new( 107 | mem_iters, 108 | table_iters, 109 | min_key, 110 | max_key, 111 | false, 112 | #[cfg(feature = "wisckey")] 113 | self.inner.get_value_log(), 114 | ) 115 | } 116 | 117 | /// Like range_iter(), but in reverse. 118 | /// It will only include entries with keys in (min_key;max_key] 119 | pub async fn reverse_range_iter(&self, max_key: &[u8], min_key: &[u8]) -> DbIterator { 120 | let (mem_iters, table_iters, min_key, max_key) = self 121 | .inner 122 | .prepare_reverse_iter(Some(max_key), Some(min_key)) 123 | .await; 124 | 125 | DbIterator::new( 126 | mem_iters, 127 | table_iters, 128 | min_key.map(|k| k.to_vec()), 129 | max_key.map(|k| k.to_vec()), 130 | true, 131 | #[cfg(feature = "wisckey")] 132 | self.inner.get_value_log(), 133 | ) 134 | } 135 | 136 | /// Write a batch of updates to the database 137 | /// 138 | /// If you only want to write to a single key, use `Database::put` instead 139 | pub async fn write(&self, write_batch: WriteBatch) -> Result<(), Error> { 140 | const OPTS: WriteOptions = WriteOptions::new(); 141 | self.write_opts(write_batch, &OPTS).await 142 | } 143 | 144 | /// Write a batch of updates to the database 145 | /// This version of write allows you to specify options such as "synchronous" 146 | #[tracing::instrument(skip(self, write_batch, opts))] 147 | pub async fn write_opts( 148 | &self, 149 | write_batch: WriteBatch, 150 | opts: &WriteOptions, 151 | ) -> Result<(), Error> { 152 | let needs_compaction = self.inner.write_opts(write_batch, opts).await?; 153 | 154 | if needs_compaction { 155 | self.tasks.wake_up(&TaskType::MemtableCompaction); 156 | } 157 | 158 | Ok(()) 159 | } 160 | 161 | /// Stop all background tasks gracefully 162 | pub async fn stop(&self) -> Result<(), Error> { 163 | self.inner.stop().await?; 164 | self.tasks.stop_all().await 165 | } 166 | 
} 167 | 168 | impl Drop for Database { 169 | fn drop(&mut self) { 170 | self.tasks.terminate(); 171 | } 172 | } 173 | -------------------------------------------------------------------------------- /src/disk.rs: -------------------------------------------------------------------------------- 1 | #[cfg(feature = "tokio-uring")] 2 | use tokio_uring::fs; 3 | 4 | #[cfg(feature = "monoio")] 5 | use monoio::fs; 6 | 7 | #[cfg(not(feature = "_async-io"))] 8 | use std::fs; 9 | 10 | #[cfg(not(feature = "_async-io"))] 11 | use std::io::{Read, Seek, Write}; 12 | 13 | use std::path::Path; 14 | 15 | use cfg_if::cfg_if; 16 | 17 | /// Read from the offset to the end of the file 18 | /// 19 | /// - This is not supported by tokio-uring yet, so it is added as a helper function here 20 | #[cfg(feature = "_async-io")] 21 | #[inline(always)] 22 | #[tracing::instrument] 23 | pub async fn read_uncompressed(fpath: &Path, offset: u64) -> Result, std::io::Error> { 24 | let file = fs::File::open(fpath).await?; 25 | let mut buffer = vec![0u8; 4096]; 26 | let mut result = vec![]; 27 | let mut pos = offset; 28 | 29 | loop { 30 | let (res, buf) = file.read_at(buffer, pos).await; 31 | 32 | match res { 33 | Ok(0) => return Ok(result), 34 | Ok(n) => { 35 | buffer = buf; 36 | result.extend_from_slice(&buffer[..n]); 37 | pos += n as u64; 38 | } 39 | Err(err) => return Err(err), 40 | } 41 | } 42 | } 43 | 44 | #[cfg(not(feature = "_async-io"))] 45 | #[inline(always)] 46 | #[tracing::instrument] 47 | pub async fn read_uncompressed(fpath: &Path, offset: u64) -> Result, std::io::Error> { 48 | let mut file = fs::File::open(fpath)?; 49 | 50 | if offset > 0 { 51 | file.seek(std::io::SeekFrom::Start(offset))?; 52 | } 53 | 54 | let mut buf = vec![]; 55 | file.read_to_end(&mut buf)?; 56 | 57 | Ok(buf) 58 | } 59 | 60 | /// Read the contents of the file from the given offset to 61 | /// its end. 
62 | #[inline(always)] 63 | #[tracing::instrument] 64 | pub async fn read(fpath: &Path, offset: u64) -> Result, std::io::Error> { 65 | let compressed = read_uncompressed(fpath, offset).await?; 66 | 67 | cfg_if! { 68 | if #[ cfg(feature="snappy-compression") ] { 69 | let mut decoder = snap::raw::Decoder::new(); 70 | Ok(decoder.decompress_vec(&compressed)?) 71 | } else { 72 | Ok(compressed) 73 | } 74 | } 75 | } 76 | 77 | /// Writes the data to the specified file path 78 | /// 79 | /// This will create the file if it does not exist yet. 80 | /// It will also compress the data, if enabled. 81 | #[tracing::instrument(skip(data))] 82 | #[inline(always)] 83 | pub async fn write(fpath: &Path, data: &[u8]) -> Result<(), std::io::Error> { 84 | //TODO it might be worth investigating if encoding/decoding 85 | // chunks is more efficient 86 | 87 | cfg_if! { 88 | if #[cfg(feature="snappy-compression") ] { 89 | let mut encoder = snap::raw::Encoder::new(); 90 | let compressed = encoder.compress_vec(data) 91 | .expect("Failed to compress data"); 92 | } else { 93 | let mut compressed = vec![]; 94 | compressed.extend_from_slice(data); 95 | } 96 | } 97 | 98 | write_uncompressed(fpath, compressed).await 99 | } 100 | 101 | /// Writes the uncompressed (even if the feature is enabled) 102 | /// to the specified file path 103 | /// 104 | /// This will create the file if it does not exist yet. 105 | #[tracing::instrument(skip(data))] 106 | #[inline(always)] 107 | pub async fn write_uncompressed(fpath: &Path, data: Vec) -> Result<(), std::io::Error> { 108 | cfg_if! 
{ 109 | if #[ cfg(feature="_async-io") ] { 110 | let file = fs::OpenOptions::new().create(true) 111 | .truncate(true).write(true) 112 | .open(fpath).await?; 113 | 114 | let (res, _buf) = file.write_all_at(data, 0).await; 115 | res?; 116 | file.sync_all().await?; 117 | } else { 118 | let mut file = fs::OpenOptions::new().create(true) 119 | .truncate(true).write(true) 120 | .open(fpath)?; 121 | 122 | file.write_all(&data)?; 123 | file.sync_all()?; 124 | } 125 | } 126 | 127 | Ok(()) 128 | } 129 | 130 | pub async fn remove_file(fpath: &Path) -> Result<(), std::io::Error> { 131 | cfg_if! { 132 | if #[ cfg(feature="tokio-uring") ] { 133 | tokio_uring::fs::remove_file(fpath).await 134 | } else { 135 | std::fs::remove_file(fpath)?; 136 | Ok(()) 137 | } 138 | } 139 | } 140 | -------------------------------------------------------------------------------- /src/index_blocks.rs: -------------------------------------------------------------------------------- 1 | use std::cmp::Ordering; 2 | use std::mem::size_of; 3 | use std::path::Path; 4 | 5 | use crate::data_blocks::DataBlockId; 6 | use crate::sorted_table::TableId; 7 | use crate::{Error, disk}; 8 | use crate::{Key, Params}; 9 | 10 | use zerocopy::{FromBytes, Immutable, IntoBytes, KnownLayout}; 11 | 12 | #[derive(Debug, IntoBytes, KnownLayout, Immutable, FromBytes)] 13 | #[repr(C, packed)] 14 | struct IndexBlockHeader { 15 | size: u64, 16 | min_key_len: u32, 17 | max_key_len: u32, 18 | num_data_blocks: u32, 19 | _padding: u32, 20 | } 21 | 22 | #[derive(IntoBytes, KnownLayout, Immutable, FromBytes)] 23 | #[repr(C, packed)] 24 | struct IndexEntryHeader { 25 | block_id: DataBlockId, 26 | key_len: u32, 27 | _padding: u32, 28 | } 29 | 30 | /** Index blocks hold metadata about a sorted table 31 | * Each table has exactly one index block 32 | * 33 | * Layout: 34 | * - Header 35 | * - Min key bytes 36 | * - Max key bytes 37 | * - Offset list 38 | * - Index Entries 39 | **/ 40 | pub struct IndexBlock { 41 | data: Vec, 42 | } 43 | 44 
| impl IndexBlock { 45 | pub async fn new( 46 | params: &Params, 47 | id: TableId, 48 | index: Vec<(Key, DataBlockId)>, 49 | size: u64, 50 | min_key: Key, 51 | max_key: Key, 52 | ) -> Result { 53 | let header = IndexBlockHeader { 54 | size, 55 | min_key_len: min_key.len() as u32, 56 | max_key_len: max_key.len() as u32, 57 | num_data_blocks: index.len() as u32, 58 | _padding: 0, 59 | }; 60 | 61 | let mut block_data = header.as_bytes().to_vec(); 62 | block_data.extend_from_slice(&min_key); 63 | block_data.extend_from_slice(&max_key); 64 | 65 | crate::add_padding(&mut block_data); 66 | 67 | // Reserve space for offsets 68 | let offset_start = block_data.len(); 69 | let offset_len = crate::pad_offset(index.len()); 70 | block_data.append(&mut vec![0u8; offset_len * size_of::()]); 71 | 72 | for (pos, (key, block_id)) in index.into_iter().enumerate() { 73 | let header = IndexEntryHeader { 74 | block_id, 75 | key_len: key.len() as u32, 76 | _padding: 0, 77 | }; 78 | 79 | let entry_offset = block_data.len() as u32; 80 | 81 | block_data[offset_start + pos * size_of::() 82 | ..offset_start + (pos + 1) * size_of::()] 83 | .copy_from_slice(entry_offset.as_bytes()); 84 | 85 | block_data.extend_from_slice(header.as_bytes()); 86 | block_data.extend_from_slice(&key); 87 | } 88 | 89 | // Store on disk before grabbing the lock 90 | let fpath = Self::get_file_path(params, &id); 91 | disk::write(&fpath, &block_data) 92 | .await 93 | .map_err(|err| Error::from_io_error("Failed to write index block", err))?; 94 | 95 | Ok(IndexBlock { data: block_data }) 96 | } 97 | 98 | pub async fn load(params: &Params, id: TableId) -> Result { 99 | log::trace!("Loading index block from disk"); 100 | let fpath = Self::get_file_path(params, &id); 101 | let data = disk::read(&fpath, 0) 102 | .await 103 | .map_err(|err| Error::from_io_error("Failed to read index block", err))?; 104 | 105 | Ok(IndexBlock { data }) 106 | } 107 | 108 | /// where is this index block located on disk? 
109 | #[inline] 110 | fn get_file_path(params: &Params, block_id: &TableId) -> std::path::PathBuf { 111 | let fname = format!("idx{block_id:08}.data"); 112 | params.db_path.join(Path::new(&fname)) 113 | } 114 | 115 | fn get_header(&self) -> &IndexBlockHeader { 116 | IndexBlockHeader::ref_from_prefix(&self.data[..]).unwrap().0 117 | } 118 | 119 | fn get_entry_offset(&self, pos: usize) -> usize { 120 | let header = self.get_header(); 121 | assert!((pos as u32) < header.num_data_blocks); 122 | 123 | let offset = size_of::() 124 | + header.min_key_len as usize 125 | + header.max_key_len as usize; 126 | 127 | let offset_offset = crate::pad_offset(offset) + pos * size_of::(); 128 | *u32::ref_from_prefix(&self.data[offset_offset..]).unwrap().0 as usize 129 | } 130 | 131 | /// Get the unique id for the data block at the specified index 132 | pub fn get_block_id(&self, pos: usize) -> DataBlockId { 133 | let offset = self.get_entry_offset(pos); 134 | 135 | let entry_header = IndexEntryHeader::ref_from_bytes( 136 | &self.data[offset..offset + size_of::()], 137 | ) 138 | .unwrap(); 139 | 140 | entry_header.block_id 141 | } 142 | 143 | /// Get the key for the data block at the specified index 144 | pub fn get_block_key(&self, pos: usize) -> &[u8] { 145 | let offset = self.get_entry_offset(pos); 146 | 147 | let (entry_header, _) = IndexEntryHeader::ref_from_prefix(&self.data[offset..]).unwrap(); 148 | 149 | let key_start = offset + size_of::(); 150 | &self.data[key_start..key_start + (entry_header.key_len as usize)] 151 | } 152 | 153 | /// How many data blocks does this table have? 154 | pub fn num_data_blocks(&self) -> usize { 155 | self.get_header().num_data_blocks as usize 156 | } 157 | 158 | /// The size of this table in bytes 159 | /// (for WiscKey this just counts the references, not the values themselves) 160 | pub fn get_size(&self) -> usize { 161 | self.get_header().size as usize 162 | } 163 | 164 | /// Whats the minimum key in this table? 
165 | pub fn get_min(&self) -> &[u8] { 166 | let header = self.get_header(); 167 | let key_offset = size_of::(); 168 | 169 | &self.data[key_offset..key_offset + (header.min_key_len as usize)] 170 | } 171 | 172 | /// What is the maximum key in this table? 173 | pub fn get_max(&self) -> &[u8] { 174 | let header = self.get_header(); 175 | let key_offset = size_of::() + (header.min_key_len as usize); 176 | 177 | &self.data[key_offset..key_offset + (header.max_key_len as usize)] 178 | } 179 | 180 | /// Search for a specific key 181 | /// This will a return a data block id that *might* hold this entry or None 182 | #[tracing::instrument(skip(self, key))] 183 | pub fn binary_search(&self, key: &[u8]) -> Option { 184 | if key < self.get_min() || key > self.get_max() { 185 | return None; 186 | } 187 | 188 | let header = self.get_header(); 189 | 190 | let mut start = 0; 191 | let mut end = (header.num_data_blocks as usize) - 1; 192 | 193 | while end - start > 1 { 194 | let mid = (end - start) / 2 + start; 195 | let mid_key = self.get_block_key(mid); 196 | 197 | match mid_key.cmp(key) { 198 | Ordering::Equal => { 199 | return Some(self.get_block_id(mid)); 200 | } 201 | Ordering::Greater => { 202 | end = mid; 203 | } 204 | Ordering::Less => { 205 | start = mid; 206 | } 207 | } 208 | } 209 | 210 | assert!(key >= self.get_block_key(start)); 211 | 212 | if key >= self.get_block_key(end) { 213 | Some(self.get_block_id(end)) 214 | } else { 215 | Some(self.get_block_id(start)) 216 | } 217 | } 218 | } 219 | -------------------------------------------------------------------------------- /src/iterate.rs: -------------------------------------------------------------------------------- 1 | use std::cmp::Ordering; 2 | use std::future::Future; 3 | use std::pin::Pin; 4 | use std::task::{Context, Poll}; 5 | 6 | #[cfg(feature = "wisckey")] 7 | use std::sync::Arc; 8 | 9 | #[cfg(feature = "wisckey")] 10 | use crate::values::ValueLog; 11 | 12 | use crate::logic::EntryRef; 13 | use 
crate::memtable::MemtableIterator; 14 | use crate::sorted_table::{InternalIterator, TableIterator}; 15 | use crate::{Error, Key}; 16 | 17 | use futures::stream::Stream; 18 | 19 | #[cfg(feature = "_async-io")] 20 | type IterFuture = dyn Future), Error>>; 21 | 22 | #[cfg(not(feature = "_async-io"))] 23 | type IterFuture = 24 | dyn Future), Error>> + Send; 25 | 26 | pub struct DbIterator { 27 | state: Option>>, 28 | } 29 | 30 | impl DbIterator { 31 | pub(crate) fn new( 32 | mem_iters: Vec, 33 | table_iters: Vec, 34 | min_key: Option>, 35 | max_key: Option>, 36 | reverse: bool, 37 | #[cfg(feature = "wisckey")] value_log: Arc, 38 | ) -> Self { 39 | let inner = DbIteratorInner::new( 40 | mem_iters, 41 | table_iters, 42 | min_key, 43 | max_key, 44 | reverse, 45 | #[cfg(feature = "wisckey")] 46 | value_log, 47 | ); 48 | let state = Box::pin(DbIteratorInner::next(inner)); 49 | 50 | Self { state: Some(state) } 51 | } 52 | } 53 | 54 | impl Stream for DbIterator { 55 | type Item = (Key, EntryRef); 56 | 57 | fn poll_next(mut self: Pin<&mut Self>, ctx: &mut Context) -> Poll> { 58 | let (inner, res) = if let Some(mut fut) = self.state.take() { 59 | match Future::poll(fut.as_mut(), ctx) { 60 | // return and keep waiting for result 61 | Poll::Pending => { 62 | self.state = Some(fut); 63 | return Poll::Pending; 64 | } 65 | // item computation complete 66 | Poll::Ready(result) => { 67 | let (inner, res) = result.expect("iteration failed"); 68 | (inner, res) 69 | } 70 | } 71 | } else { 72 | // no items left 73 | return Poll::Ready(None); 74 | }; 75 | 76 | // Prepare next state? 
77 | if res.is_some() { 78 | self.state = Some(Box::pin(DbIteratorInner::next(inner))); 79 | } else { 80 | self.state = None; 81 | } 82 | 83 | // return item 84 | Poll::Ready(res) 85 | } 86 | } 87 | 88 | struct DbIteratorInner { 89 | last_key: Option>, 90 | iterators: Vec>, 91 | 92 | reverse: bool, 93 | 94 | min_key: Option>, 95 | max_key: Option>, 96 | 97 | #[cfg(feature = "wisckey")] 98 | value_log: Arc, 99 | } 100 | 101 | type NextKV = Option<(crate::manifest::SeqNumber, usize)>; 102 | 103 | impl DbIteratorInner { 104 | fn new( 105 | mem_iters: Vec, 106 | table_iters: Vec, 107 | min_key: Option>, 108 | max_key: Option>, 109 | reverse: bool, 110 | #[cfg(feature = "wisckey")] value_log: Arc, 111 | ) -> Self { 112 | let mut iterators: Vec> = vec![]; 113 | for iter in mem_iters.into_iter() { 114 | iterators.push(Box::new(iter)); 115 | } 116 | for iter in table_iters.into_iter() { 117 | iterators.push(Box::new(iter)); 118 | } 119 | 120 | Self { 121 | iterators, 122 | last_key: None, 123 | min_key, 124 | max_key, 125 | reverse, 126 | #[cfg(feature = "wisckey")] 127 | value_log, 128 | } 129 | } 130 | 131 | /// Tries to pick the next value from the specified iterator 132 | async fn parse_iter(&mut self, pos: usize, next_kv: NextKV) -> (bool, NextKV) { 133 | // Split slices to make the borrow checker happy 134 | let (prev, cur) = self.iterators[..].split_at_mut(pos); 135 | let iter = &mut *cur[0]; 136 | 137 | if self.reverse { 138 | // This iterator might be "behind" other iterators 139 | if let Some(last_key) = &self.last_key { 140 | while !iter.at_end() && iter.get_key() >= last_key.as_slice() { 141 | iter.step().await; 142 | } 143 | } 144 | 145 | // Don't pick a key that is greater than the maximum 146 | if let Some(max_key) = &self.max_key { 147 | while !iter.at_end() && iter.get_key() > max_key.as_slice() { 148 | iter.step().await; 149 | } 150 | 151 | // There might be no key in this iterator that is <=max_key 152 | if iter.at_end() || iter.get_key() > 
max_key.as_slice() { 153 | return (false, next_kv); 154 | } 155 | } 156 | 157 | if iter.at_end() { 158 | return (false, next_kv); 159 | } 160 | 161 | let key = iter.get_key(); 162 | 163 | // Don't pick a key that is less or equal to the minimum 164 | if let Some(min_key) = &self.min_key 165 | && iter.get_key() <= min_key.as_slice() 166 | { 167 | return (false, next_kv); 168 | } 169 | 170 | let seq_number = iter.get_seq_number(); 171 | 172 | if let Some((max_seq_number, max_pos)) = next_kv { 173 | let max_iter = &*prev[max_pos]; 174 | let max_key = max_iter.get_key(); 175 | 176 | match key.cmp(max_key) { 177 | Ordering::Greater => (true, Some((seq_number, pos))), 178 | Ordering::Equal => { 179 | if seq_number > max_seq_number { 180 | (true, Some((seq_number, pos))) 181 | } else { 182 | (false, next_kv) 183 | } 184 | } 185 | Ordering::Less => (false, next_kv), 186 | } 187 | } else { 188 | (true, Some((seq_number, pos))) 189 | } 190 | } else { 191 | // This iterator might be "behind" other iterators 192 | if let Some(last_key) = &self.last_key { 193 | while !iter.at_end() && iter.get_key() <= last_key.as_slice() { 194 | iter.step().await; 195 | } 196 | } 197 | 198 | // Don't pick a key that is smaller than the minimum 199 | if let Some(min_key) = &self.min_key { 200 | while !iter.at_end() && iter.get_key() < min_key.as_slice() { 201 | iter.step().await; 202 | } 203 | 204 | // There might be no key in this iterator that is >=min_key 205 | if iter.at_end() || iter.get_key() < min_key.as_slice() { 206 | return (false, next_kv); 207 | } 208 | } 209 | 210 | if iter.at_end() { 211 | return (false, next_kv); 212 | } 213 | 214 | let key = iter.get_key(); 215 | 216 | // Don't pick a key that is greater or equal to the maximum 217 | if let Some(max_key) = &self.max_key 218 | && iter.get_key() >= max_key.as_slice() 219 | { 220 | return (false, next_kv); 221 | } 222 | 223 | let seq_number = iter.get_seq_number(); 224 | 225 | if let Some((min_seq_number, min_pos)) = next_kv { 226 
| let min_iter = &*prev[min_pos]; 227 | let min_key = min_iter.get_key(); 228 | 229 | match key.cmp(min_key) { 230 | Ordering::Less => (true, Some((seq_number, pos))), 231 | Ordering::Equal => { 232 | if seq_number > min_seq_number { 233 | (true, Some((seq_number, pos))) 234 | } else { 235 | (false, next_kv) 236 | } 237 | } 238 | Ordering::Greater => (false, next_kv), 239 | } 240 | } else { 241 | (true, Some((seq_number, pos))) 242 | } 243 | } 244 | } 245 | 246 | async fn next(mut self) -> Result<(Self, Option<(Key, EntryRef)>), Error> { 247 | let mut result = None; 248 | 249 | while result.is_none() { 250 | let mut next_kv = None; 251 | let num_iterators = self.iterators.len(); 252 | 253 | for pos in 0..num_iterators { 254 | let (change, kv) = self.parse_iter(pos, next_kv).await; 255 | 256 | if change { 257 | next_kv = kv; 258 | } 259 | } 260 | 261 | if let Some((_, pos)) = next_kv.take() { 262 | let iter = &*self.iterators[pos]; 263 | 264 | let res_key = iter.get_key(); 265 | self.last_key = Some(iter.get_key().to_vec()); 266 | 267 | #[cfg(feature = "wisckey")] 268 | let entry = iter.get_entry(&self.value_log).await; 269 | #[cfg(not(feature = "wisckey"))] 270 | let entry = iter.get_entry(); 271 | 272 | if let Some(entry) = entry { 273 | result = Some(Some((res_key.to_vec(), entry))); 274 | } else { 275 | // this is a deletion... 
skip 276 | } 277 | } else { 278 | // at end 279 | result = Some(None); 280 | }; 281 | } 282 | 283 | let (key, result) = match result.unwrap() { 284 | Some(inner) => inner, 285 | None => { 286 | return Ok((self, None)); 287 | } 288 | }; 289 | 290 | Ok((self, Some((key, result)))) 291 | } 292 | } 293 | -------------------------------------------------------------------------------- /src/level_logger.rs: -------------------------------------------------------------------------------- 1 | use std::fs::File; 2 | use std::time::Instant; 3 | 4 | use crate::manifest::LevelId; 5 | 6 | use parking_lot::Mutex; 7 | 8 | struct Inner { 9 | start: Instant, 10 | outfile: csv::Writer, 11 | num_tables: Vec, 12 | } 13 | 14 | /// Locks changes to the number of tables in a level 15 | pub(crate) struct LevelLogger { 16 | inner: Mutex, 17 | } 18 | 19 | impl LevelLogger { 20 | pub fn new(path: &str, num_levels: usize) -> Self { 21 | let outfile = csv::Writer::from_path(path).expect("Failed to create log file"); 22 | 23 | let inner = Inner::new(outfile, num_levels); 24 | 25 | Self { 26 | inner: Mutex::new(inner), 27 | } 28 | } 29 | 30 | pub fn l0_table_added(&self) { 31 | let mut inner = self.inner.lock(); 32 | inner.num_tables[0] += 1; 33 | 34 | inner.write(); 35 | } 36 | 37 | pub fn compaction(&self, level: LevelId, added: usize, removed: usize) { 38 | let mut inner = self.inner.lock(); 39 | inner.num_tables[level as usize] -= removed; 40 | inner.num_tables[level as usize + 1] += added; 41 | 42 | inner.write(); 43 | } 44 | } 45 | 46 | impl Inner { 47 | fn new(mut outfile: csv::Writer, num_levels: usize) -> Self { 48 | let num_tables = vec![0; num_levels]; 49 | 50 | let mut header = vec![format!("time")]; 51 | for idx in 0..num_levels { 52 | header.push(format!("level{idx}")); 53 | } 54 | 55 | outfile.write_record(&header).unwrap(); 56 | 57 | Self { 58 | outfile, 59 | num_tables, 60 | start: Instant::now(), 61 | } 62 | } 63 | 64 | fn write(&mut self) { 65 | let mut record = vec![]; 66 | 
record.push(format!("{}", self.start.elapsed().as_millis())); 67 | 68 | for count in self.num_tables.iter() { 69 | record.push(format!("{count}")); 70 | } 71 | 72 | self.outfile.write_record(&record).unwrap(); 73 | } 74 | } 75 | -------------------------------------------------------------------------------- /src/lib.rs: -------------------------------------------------------------------------------- 1 | #![feature(get_mut_unchecked)] 2 | // Temporary workaround for the io_uring code 3 | #![allow(clippy::arc_with_non_send_sync)] 4 | 5 | pub mod iterate; 6 | 7 | #[cfg(feature = "wisckey")] 8 | pub mod values; 9 | 10 | mod params; 11 | pub use params::Params; 12 | 13 | mod write_batch; 14 | pub use write_batch::{WriteBatch, WriteOp, WriteOptions}; 15 | 16 | pub mod sorted_table; 17 | 18 | mod level_logger; 19 | 20 | pub mod memtable; 21 | pub mod tasks; 22 | 23 | pub mod logic; 24 | pub use logic::EntryRef; 25 | 26 | pub mod manifest; 27 | 28 | mod data_blocks; 29 | mod database; 30 | mod disk; 31 | mod index_blocks; 32 | mod level; 33 | mod wal; 34 | 35 | pub type Key = Vec; 36 | pub type Value = Vec; 37 | 38 | /// Shorthand for a list of key-value pairs 39 | #[cfg(feature = "wisckey")] 40 | type EntryList = Vec<(Key, Value)>; 41 | 42 | pub use database::Database; 43 | 44 | /// How many bytes do we align by? 
45 | const WORD_SIZE: usize = 8; 46 | 47 | fn pad_offset(offset: usize) -> usize { 48 | offset + compute_padding(offset) 49 | } 50 | 51 | fn compute_padding(offset: usize) -> usize { 52 | let remainder = offset % WORD_SIZE; 53 | if remainder == 0 { 54 | 0 55 | } else { 56 | WORD_SIZE - remainder 57 | } 58 | } 59 | 60 | fn add_padding(data: &mut Vec) { 61 | let padding = compute_padding(data.len()); 62 | if padding > 0 { 63 | data.resize(data.len() + padding, 0u8); 64 | } 65 | } 66 | 67 | #[derive(Clone, Debug)] 68 | pub enum Error { 69 | Io { context: String, message: String }, 70 | InvalidParams(String), 71 | Serialization(String), 72 | } 73 | 74 | impl std::fmt::Display for Error { 75 | fn fmt(&self, fmt: &mut std::fmt::Formatter) -> Result<(), std::fmt::Error> { 76 | match self { 77 | Self::Io { context, message } => { 78 | fmt.write_fmt(format_args!("{context}: {message}"))?; 79 | } 80 | Self::InvalidParams(msg) => { 81 | fmt.write_fmt(format_args!("Invalid Parameter: {msg}"))?; 82 | } 83 | Self::Serialization(msg) => { 84 | fmt.write_fmt(format_args!("Serialization Error: {msg}"))?; 85 | } 86 | } 87 | 88 | Ok(()) 89 | } 90 | } 91 | 92 | impl Error { 93 | fn from_io_error(context: S, inner: std::io::Error) -> Self { 94 | Self::Io { 95 | context: context.to_string(), 96 | message: format!("{inner}"), 97 | } 98 | } 99 | } 100 | 101 | /// Allow specifying how the datastore behaves during startup 102 | #[derive(Debug, Clone)] 103 | pub enum StartMode { 104 | /// Reuse existing database, or create if non-existent 105 | CreateOrOpen, 106 | /// Open existing database, or fail if non-existent 107 | Open, 108 | /// Create a new, or override an existing, database 109 | CreateOrOverride, 110 | } 111 | -------------------------------------------------------------------------------- /src/memtable.rs: -------------------------------------------------------------------------------- 1 | use std::cmp::Ordering; 2 | use std::sync::Arc; 3 | 4 | use async_trait::async_trait; 5 | 6 
| use crate::data_blocks::DataEntryType; 7 | use crate::manifest::SeqNumber; 8 | use crate::sorted_table::InternalIterator; 9 | use crate::{EntryRef, Key, Params}; 10 | 11 | #[cfg(feature = "wisckey")] 12 | use crate::values::ValueLog; 13 | 14 | #[derive(Debug, Clone)] 15 | pub struct MemtableRef { 16 | inner: Arc, 17 | } 18 | 19 | /// A reference to a memtable that cannot modify it 20 | #[derive(Debug, Clone)] 21 | pub struct ImmMemtableRef { 22 | inner: Arc, 23 | } 24 | 25 | #[derive(Clone, Debug, PartialEq, Eq)] 26 | pub enum MemtableEntry { 27 | Value { seq_number: u64, value: Vec }, 28 | Deletion { seq_number: u64 }, 29 | } 30 | 31 | /// Reference to an entry in the memtable 32 | /// TODO: make this zerocopy somehow 33 | pub struct MemtableEntryRef { 34 | entry: MemtableEntry, 35 | } 36 | 37 | impl MemtableEntryRef { 38 | pub fn get_type(&self) -> DataEntryType { 39 | match &self.entry { 40 | MemtableEntry::Value { .. } => DataEntryType::Put, 41 | MemtableEntry::Deletion { .. } => DataEntryType::Delete, 42 | } 43 | } 44 | 45 | pub fn get_value(&self) -> Option<&[u8]> { 46 | match &self.entry { 47 | MemtableEntry::Value { value, .. } => Some(value), 48 | MemtableEntry::Deletion { .. } => None, 49 | } 50 | } 51 | } 52 | 53 | impl MemtableEntry { 54 | pub fn get_value(&self) -> Option<&[u8]> { 55 | match self { 56 | MemtableEntry::Value { value, .. } => Some(value), 57 | MemtableEntry::Deletion { .. 
} => None, 58 | } 59 | } 60 | } 61 | 62 | /// Iterates over a memtable and returns its contents in order 63 | #[derive(Debug)] 64 | pub struct MemtableIterator { 65 | inner: Arc, 66 | next_index: i64, 67 | key: Option, 68 | entry: Option, 69 | reverse: bool, 70 | } 71 | 72 | impl MemtableIterator { 73 | pub async fn new(inner: Arc, reverse: bool) -> Self { 74 | let next_index = if reverse { 75 | (inner.entries.len() as i64) - 1 76 | } else { 77 | 0 78 | }; 79 | 80 | let mut obj = Self { 81 | inner, 82 | reverse, 83 | key: None, 84 | entry: None, 85 | next_index, 86 | }; 87 | 88 | obj.step().await; 89 | 90 | obj 91 | } 92 | } 93 | 94 | #[cfg_attr(feature="_async-io", async_trait(?Send))] 95 | #[cfg_attr(not(feature = "_async-io"), async_trait)] 96 | impl InternalIterator for MemtableIterator { 97 | #[tracing::instrument] 98 | async fn step(&mut self) { 99 | let entries = &self.inner.entries; 100 | let num_entries = entries.len() as i64; 101 | 102 | if self.reverse { 103 | match self.next_index.cmp(&(-1)) { 104 | Ordering::Less => { 105 | panic!("Cannot step(); already at end"); 106 | } 107 | Ordering::Equal => { 108 | self.next_index -= 1; 109 | } 110 | Ordering::Greater => { 111 | let (key, entry) = entries[self.next_index as usize].clone(); 112 | self.key = Some(key); 113 | self.entry = Some(entry); 114 | self.next_index -= 1; 115 | } 116 | } 117 | } else { 118 | match self.next_index.cmp(&num_entries) { 119 | Ordering::Greater => { 120 | panic!("Cannot step(); already at end"); 121 | } 122 | Ordering::Equal => { 123 | self.next_index += 1; 124 | } 125 | Ordering::Less => { 126 | let (key, entry) = entries[self.next_index as usize].clone(); 127 | self.key = Some(key); 128 | self.entry = Some(entry); 129 | self.next_index += 1; 130 | } 131 | } 132 | } 133 | } 134 | 135 | fn at_end(&self) -> bool { 136 | if self.reverse { 137 | self.next_index < -1 138 | } else { 139 | let len = self.inner.entries.len() as i64; 140 | self.next_index > len 141 | } 142 | } 143 | 144 | 
fn get_key(&self) -> &[u8] { 145 | self.key.as_ref().expect("Not a valid iterator") 146 | } 147 | 148 | #[cfg(feature = "wisckey")] 149 | async fn get_entry(&self, _value_log: &ValueLog) -> Option { 150 | self.entry.clone().map(|entry| EntryRef::Memtable { 151 | entry: MemtableEntryRef { entry }, 152 | }) 153 | } 154 | 155 | #[cfg(not(feature = "wisckey"))] 156 | fn get_entry(&self) -> Option { 157 | self.entry.clone().map(|entry| EntryRef::Memtable { 158 | entry: MemtableEntryRef { entry }, 159 | }) 160 | } 161 | 162 | fn get_seq_number(&self) -> SeqNumber { 163 | match self.entry.as_ref().unwrap() { 164 | MemtableEntry::Value { seq_number, .. } | MemtableEntry::Deletion { seq_number } => { 165 | *seq_number 166 | } 167 | } 168 | } 169 | 170 | fn get_entry_type(&self) -> DataEntryType { 171 | match self.entry.as_ref().unwrap() { 172 | MemtableEntry::Value { .. } => DataEntryType::Put, 173 | MemtableEntry::Deletion { .. } => DataEntryType::Delete, 174 | } 175 | } 176 | } 177 | 178 | impl ImmMemtableRef { 179 | pub fn get(&self) -> &Memtable { 180 | &self.inner 181 | } 182 | 183 | pub async fn into_iter(self, reverse: bool) -> MemtableIterator { 184 | MemtableIterator::new(self.inner, reverse).await 185 | } 186 | } 187 | 188 | impl MemtableRef { 189 | pub fn wrap(inner: Memtable) -> Self { 190 | Self { 191 | inner: Arc::new(inner), 192 | } 193 | } 194 | 195 | /// Get an immutable reference to the same memtable 196 | pub fn clone_immutable(&self) -> ImmMemtableRef { 197 | ImmMemtableRef { 198 | inner: self.inner.clone(), 199 | } 200 | } 201 | 202 | /// Make the current contents into an immutable memtable 203 | /// and create a new mutable one 204 | pub fn take(&mut self, next_seq_number: u64) -> ImmMemtableRef { 205 | let mut inner = Arc::new(Memtable::new(next_seq_number)); 206 | std::mem::swap(&mut inner, &mut self.inner); 207 | 208 | ImmMemtableRef { inner } 209 | } 210 | 211 | pub fn get(&self) -> &Memtable { 212 | &self.inner 213 | } 214 | 215 | /// This is only 
safe to call from the DbLogic while holding the memtable lock 216 | pub(crate) unsafe fn get_mut(&mut self) -> &mut Memtable { 217 | unsafe { Arc::get_mut_unchecked(&mut self.inner) } 218 | } 219 | } 220 | 221 | /// In-memory representation of state that has not been written to level 0 yet. 222 | /// This data structure does not exist on disk, but can be recreated from the write-ahead log 223 | #[derive(Debug)] 224 | pub struct Memtable { 225 | /// Sorted updates 226 | entries: Vec<(Vec, MemtableEntry)>, 227 | size: usize, 228 | next_seq_number: SeqNumber, 229 | } 230 | 231 | impl Memtable { 232 | pub fn new(next_seq_number: SeqNumber) -> Self { 233 | let entries = Vec::new(); 234 | let size = 0; 235 | 236 | Self { 237 | entries, 238 | size, 239 | next_seq_number, 240 | } 241 | } 242 | 243 | #[inline] 244 | pub fn get_next_seq_number(&self) -> u64 { 245 | self.next_seq_number 246 | } 247 | 248 | pub fn get_min_max_key(&self) -> (&[u8], &[u8]) { 249 | let len = self.entries.len(); 250 | 251 | if len == 0 { 252 | panic!("Memtable is empty"); 253 | } 254 | 255 | (&self.entries[0].0, &self.entries[len - 1].0) 256 | } 257 | 258 | #[tracing::instrument(skip(self, key))] 259 | pub fn get(&self, key: &[u8]) -> Option { 260 | match self.entries.binary_search_by_key(&key, |t| t.0.as_slice()) { 261 | Ok(pos) => Some(MemtableEntryRef { 262 | entry: self.entries[pos].1.clone(), 263 | }), 264 | Err(_) => None, 265 | } 266 | } 267 | 268 | /// Get position were to insert the key 269 | /// Will remove existing entries with the same key 270 | fn get_key_pos(&mut self, key: &[u8]) -> usize { 271 | match self.entries.binary_search_by_key(&key, |t| t.0.as_slice()) { 272 | Ok(pos) => { 273 | // remove old entry 274 | let entry_len = { 275 | let (_, entry) = self.entries.remove(pos); 276 | match entry { 277 | MemtableEntry::Value { value, .. } => key.len() + value.len(), 278 | MemtableEntry::Deletion { .. 
} => key.len(), 279 | } 280 | }; 281 | 282 | self.size -= entry_len; 283 | pos 284 | } 285 | Err(pos) => pos, 286 | } 287 | } 288 | 289 | #[tracing::instrument(skip(self, key, value))] 290 | pub fn put(&mut self, key: Vec, value: Vec) { 291 | let pos = self.get_key_pos(key.as_slice()); 292 | let entry_len = key.len() + value.len(); 293 | 294 | self.entries.insert( 295 | pos, 296 | ( 297 | key, 298 | MemtableEntry::Value { 299 | value, 300 | seq_number: self.next_seq_number, 301 | }, 302 | ), 303 | ); 304 | 305 | self.size += entry_len; 306 | self.next_seq_number += 1; 307 | } 308 | 309 | #[tracing::instrument(skip(self, key))] 310 | pub fn delete(&mut self, key: Vec) { 311 | let pos = self.get_key_pos(key.as_slice()); 312 | let entry_len = key.len(); 313 | 314 | self.entries.insert( 315 | pos, 316 | ( 317 | key, 318 | MemtableEntry::Deletion { 319 | seq_number: self.next_seq_number, 320 | }, 321 | ), 322 | ); 323 | 324 | self.size += entry_len; 325 | self.next_seq_number += 1; 326 | } 327 | 328 | #[inline] 329 | pub fn is_full(&self, params: &Params) -> bool { 330 | self.size >= params.max_memtable_size 331 | } 332 | 333 | //FIXME avoid this copy somehow without breaking seek consistency 334 | pub fn get_entries(&self) -> Vec<(Key, MemtableEntry)> { 335 | self.entries.clone() 336 | } 337 | } 338 | 339 | #[cfg(test)] 340 | mod tests { 341 | use super::*; 342 | 343 | #[test] 344 | fn get_put() { 345 | let mut mem = Memtable::new(1); 346 | 347 | let key1 = vec![5, 2, 4]; 348 | let key2 = vec![3, 8, 1]; 349 | 350 | let val1 = vec![5, 1]; 351 | let val2 = vec![1, 8]; 352 | 353 | mem.put(key1.clone(), val1.clone()); 354 | mem.put(key2.clone(), val2.clone()); 355 | 356 | assert_eq!(mem.get(&key1).unwrap().get_value().unwrap(), &val1); 357 | assert_eq!(mem.get(&key2).unwrap().get_value().unwrap(), &val2); 358 | } 359 | 360 | #[test] 361 | fn delete() { 362 | let mut mem = Memtable::new(1); 363 | 364 | assert_eq!(mem.entries.len(), 0); 365 | 366 | let key = vec![5, 2, 4]; 
367 | let val = vec![5, 1]; 368 | 369 | mem.put(key.clone(), val.clone()); 370 | mem.delete(key.clone()); 371 | 372 | assert_eq!(mem.entries.len(), 1); 373 | assert_eq!(mem.get(&key).unwrap().get_value(), None); 374 | } 375 | 376 | #[test] 377 | fn override_entry() { 378 | let mut mem = Memtable::new(1); 379 | 380 | let key1 = vec![5, 2, 4]; 381 | 382 | let val1 = vec![5, 1]; 383 | let val2 = vec![1, 8]; 384 | 385 | mem.put(key1.clone(), val1.clone()); 386 | mem.put(key1.clone(), val2.clone()); 387 | 388 | assert_eq!(mem.get(&key1).unwrap().get_value().unwrap(), &val2); 389 | } 390 | } 391 | -------------------------------------------------------------------------------- /src/params.rs: -------------------------------------------------------------------------------- 1 | use std::path::{Path, PathBuf}; 2 | 3 | /// Parameters to customize the creation of the database 4 | #[derive(Debug, Clone)] 5 | pub struct Params { 6 | /// Where in the filesystem should the database be stored? 7 | pub db_path: PathBuf, 8 | /// Maximum size of a memtable (keys+values), 9 | /// This indirectly also defines how large a value block can be 10 | pub max_memtable_size: usize, 11 | /// How many levels does this store have (default: 5) 12 | pub num_levels: usize, 13 | /// How many open files should be held in memory? 14 | pub max_open_files: usize, 15 | /// Maximum number of entries in a key block 16 | pub max_key_block_size: usize, 17 | /// How often should the full key be stored in a data block? 18 | /// Larger numbers result in smaller on-disk files, but seeks will be slower 19 | pub block_restart_interval: u32, 20 | /// Write the size of each level to a csv file 21 | pub log_level_stats: Option, 22 | /// How many concurrent compaction tasks should there be 23 | pub compaction_concurrency: usize, 24 | /// How many seeks (per kb) before compaction is triggered? 
25 | pub seek_based_compaction: Option, 26 | } 27 | 28 | impl Default for Params { 29 | fn default() -> Self { 30 | Self { 31 | db_path: Path::new("./storage.lsm").to_path_buf(), 32 | max_memtable_size: 5 * 1024 * 1024, 33 | num_levels: 5, 34 | max_open_files: 1_000_000, 35 | max_key_block_size: 512, 36 | block_restart_interval: 16, 37 | log_level_stats: None, 38 | compaction_concurrency: 4, 39 | seek_based_compaction: Some(10), 40 | } 41 | } 42 | } 43 | -------------------------------------------------------------------------------- /src/sorted_table/builder.rs: -------------------------------------------------------------------------------- 1 | use std::sync::Arc; 2 | use std::sync::atomic::{AtomicBool, AtomicI32}; 3 | 4 | use crate::data_blocks::{DataBlockBuilder, DataBlockId, DataBlocks, PrefixedKey}; 5 | use crate::index_blocks::IndexBlock; 6 | use crate::manifest::SeqNumber; 7 | use crate::{Error, Key, Params, WriteOp}; 8 | 9 | #[cfg(feature = "wisckey")] 10 | use crate::values::ValueId; 11 | 12 | use super::{SortedTable, TableId}; 13 | 14 | /// Helper class to construct a table 15 | /// only used during compaction 16 | pub struct TableBuilder<'a> { 17 | identifier: TableId, 18 | params: &'a Params, 19 | data_blocks: Arc, 20 | min_key: Key, 21 | max_key: Key, 22 | 23 | data_block: DataBlockBuilder, 24 | block_index: Vec<(Key, DataBlockId)>, 25 | last_key: Key, 26 | block_entry_count: usize, 27 | size: u64, 28 | restart_count: u32, 29 | index_key: Option, 30 | } 31 | 32 | impl<'a> TableBuilder<'a> { 33 | #[tracing::instrument(skip(params, data_blocks, min_key, max_key))] 34 | pub fn new( 35 | identifier: TableId, 36 | params: &'a Params, 37 | data_blocks: Arc, 38 | min_key: Key, 39 | max_key: Key, 40 | ) -> TableBuilder<'a> { 41 | let block_index = vec![]; 42 | let last_key = vec![]; 43 | let block_entry_count = 0; 44 | let size = 0; 45 | let restart_count = 0; 46 | let index_key = None; 47 | let data_block = DataBlocks::build_block(data_blocks.clone()); 48 | 
49 | Self { 50 | identifier, 51 | params, 52 | data_blocks, 53 | block_index, 54 | data_block, 55 | last_key, 56 | block_entry_count, 57 | size, 58 | restart_count, 59 | index_key, 60 | min_key, 61 | max_key, 62 | } 63 | } 64 | 65 | #[cfg(feature = "wisckey")] 66 | #[tracing::instrument(skip(self, key, seq_number, value_ref))] 67 | pub async fn add_value( 68 | &mut self, 69 | key: &[u8], 70 | seq_number: SeqNumber, 71 | value_ref: ValueId, 72 | ) -> Result<(), Error> { 73 | self.add_entry(key, seq_number, WriteOp::PUT_OP, value_ref) 74 | .await 75 | } 76 | 77 | #[cfg(feature = "wisckey")] 78 | #[tracing::instrument(skip(self, key, seq_number))] 79 | pub async fn add_deletion(&mut self, key: &[u8], seq_number: SeqNumber) -> Result<(), Error> { 80 | self.add_entry(key, seq_number, WriteOp::DELETE_OP, ValueId::default()) 81 | .await 82 | } 83 | 84 | #[cfg(not(feature = "wisckey"))] 85 | #[tracing::instrument(skip(self, key, seq_number, value))] 86 | pub async fn add_value( 87 | &mut self, 88 | key: &[u8], 89 | seq_number: SeqNumber, 90 | value: &[u8], 91 | ) -> Result<(), Error> { 92 | self.add_entry(key, seq_number, WriteOp::PUT_OP, value) 93 | .await 94 | } 95 | 96 | #[cfg(not(feature = "wisckey"))] 97 | #[tracing::instrument(skip(self, key, seq_number))] 98 | pub async fn add_deletion(&mut self, key: &[u8], seq_number: SeqNumber) -> Result<(), Error> { 99 | self.add_entry(key, seq_number, WriteOp::DELETE_OP, &[]) 100 | .await 101 | } 102 | 103 | async fn add_entry( 104 | &mut self, 105 | key: &[u8], 106 | seq_number: SeqNumber, 107 | op_type: u8, 108 | #[cfg(feature = "wisckey")] value: ValueId, 109 | #[cfg(not(feature = "wisckey"))] value: &[u8], 110 | ) -> Result<(), Error> { 111 | if self.index_key.is_none() { 112 | self.index_key = Some(key.to_vec()); 113 | } 114 | let mut prefix_len = 0; 115 | 116 | // After a certain interval we reset the prefixed keys 117 | // So that it is possible to binary search blocks 118 | if self.restart_count == 
self.params.block_restart_interval { 119 | self.restart_count = 0; 120 | } else { 121 | // Calculate key prefix length 122 | while prefix_len < key.len() 123 | && prefix_len < self.last_key.len() 124 | && key[prefix_len] == self.last_key[prefix_len] 125 | { 126 | prefix_len += 1; 127 | } 128 | } 129 | 130 | let suffix = key[prefix_len..].to_vec(); 131 | 132 | self.block_entry_count += 1; 133 | self.restart_count += 1; 134 | 135 | let pkey = PrefixedKey::new(prefix_len, suffix); 136 | 137 | self.last_key = key.to_vec(); 138 | 139 | self.data_block 140 | .add_entry(pkey, key, seq_number, op_type, value); 141 | 142 | if self.block_entry_count >= self.params.max_key_block_size { 143 | self.size += self.data_block.current_size() as u64; 144 | 145 | let mut next_block = DataBlocks::build_block(self.data_blocks.clone()); 146 | std::mem::swap(&mut next_block, &mut self.data_block); 147 | 148 | let id = next_block.finish().await?.unwrap(); 149 | self.block_index.push((self.index_key.take().unwrap(), id)); 150 | self.block_entry_count = 0; 151 | self.restart_count = 0; 152 | self.last_key.clear(); 153 | } 154 | 155 | Ok(()) 156 | } 157 | 158 | #[tracing::instrument(skip(self))] 159 | pub async fn finish(mut self) -> Result { 160 | let block_size = self.data_block.current_size(); 161 | 162 | // Block will only be created if it contained entries 163 | if let Some(id) = self.data_block.finish().await? 
{ 164 | self.size += block_size as u64; 165 | self.block_index.push((self.index_key.take().unwrap(), id)); 166 | } 167 | 168 | log::debug!("Created new table with {} blocks", self.block_index.len()); 169 | 170 | let index = IndexBlock::new( 171 | self.params, 172 | self.identifier, 173 | self.block_index, 174 | self.size, 175 | self.min_key, 176 | self.max_key, 177 | ) 178 | .await?; 179 | 180 | let allowed_seeks = if let Some(count) = self.params.seek_based_compaction { 181 | ((index.get_size() / 1024).max(1) as i32) * (count as i32) 182 | } else { 183 | 0 184 | }; 185 | 186 | Ok(SortedTable { 187 | index, 188 | allowed_seeks: AtomicI32::new(allowed_seeks), 189 | identifier: self.identifier, 190 | compaction_flag: AtomicBool::new(false), 191 | data_blocks: self.data_blocks, 192 | }) 193 | } 194 | } 195 | -------------------------------------------------------------------------------- /src/sorted_table/iterator.rs: -------------------------------------------------------------------------------- 1 | use std::cmp::Ordering; 2 | use std::sync::Arc; 3 | 4 | use async_trait::async_trait; 5 | 6 | use crate::data_blocks::{DataBlock, DataEntry, DataEntryType}; 7 | use crate::manifest::SeqNumber; 8 | use crate::{EntryRef, Key}; 9 | 10 | use super::SortedTable; 11 | 12 | #[cfg(feature = "wisckey")] 13 | use crate::values::{ValueId, ValueLog}; 14 | 15 | #[cfg_attr(feature="_async-io", async_trait(?Send))] 16 | #[cfg_attr(not(feature = "_async-io"), async_trait)] 17 | pub trait InternalIterator: Send { 18 | fn at_end(&self) -> bool; 19 | async fn step(&mut self); 20 | 21 | /// Returns None if this refers to a deletion 22 | #[cfg(feature = "wisckey")] 23 | async fn get_entry(&self, value_log: &ValueLog) -> Option; 24 | #[cfg(not(feature = "wisckey"))] 25 | fn get_entry(&self) -> Option; 26 | 27 | fn get_key(&self) -> &[u8]; 28 | fn get_seq_number(&self) -> SeqNumber; 29 | fn get_entry_type(&self) -> DataEntryType; 30 | } 31 | 32 | /// Returns the entries within a table in order 
33 | pub struct TableIterator { 34 | block_pos: i64, 35 | block_offset: u32, 36 | key: Key, 37 | entry: DataEntry, 38 | table: Arc, 39 | reverse: bool, 40 | } 41 | 42 | impl TableIterator { 43 | pub async fn new(table: Arc, reverse: bool) -> Self { 44 | let last_key = vec![]; 45 | 46 | if reverse { 47 | let num_blocks = table.index.num_data_blocks() as i64; 48 | assert!(num_blocks > 0); // tables must have at least one data block 49 | 50 | let block_id = table.index.get_block_id((num_blocks - 1) as usize); 51 | let first_block = table.data_blocks.get_block(&block_id).await; 52 | 53 | let len = first_block.get_num_entries(); 54 | assert!(len > 0); 55 | let (key, entry) = DataBlock::get_entry_at_index(&first_block, len - 1); 56 | 57 | // Are we already at the end of the first block? 58 | let (block_pos, block_offset) = if len == 1 { 59 | let next_pos = num_blocks - 2; 60 | if next_pos >= 0 { 61 | let block_id = table.index.get_block_id(next_pos as usize); 62 | let next_block = table.data_blocks.get_block(&block_id).await; 63 | let len = next_block.get_num_entries(); 64 | assert!(len > 0); 65 | (next_pos, len - 1) 66 | } else { 67 | (next_pos, 0) 68 | } 69 | } else { 70 | (num_blocks - 1, len - 2) 71 | }; 72 | 73 | Self { 74 | block_pos, 75 | block_offset, 76 | key, 77 | entry, 78 | table, 79 | reverse, 80 | } 81 | } else { 82 | let block_id = table.index.get_block_id(0); 83 | let first_block = table.data_blocks.get_block(&block_id).await; 84 | let byte_len = first_block.byte_len(); 85 | let (key, entry) = DataBlock::get_entry_at_offset(first_block, 0, &last_key); 86 | 87 | let next_offset = entry.get_next_offset(); 88 | 89 | // Are we already at the end of the first block? 
90 | let (block_pos, block_offset) = if byte_len == next_offset { 91 | (1, 0) 92 | } else { 93 | (0, next_offset) 94 | }; 95 | 96 | Self { 97 | block_pos, 98 | block_offset, 99 | key, 100 | entry, 101 | table, 102 | reverse, 103 | } 104 | } 105 | } 106 | 107 | #[cfg(feature = "wisckey")] 108 | pub fn get_value_id(&self) -> Option { 109 | self.entry.get_value_id() 110 | } 111 | } 112 | 113 | #[cfg_attr(feature="_async-io", async_trait(?Send))] 114 | #[cfg_attr(not(feature = "_async-io"), async_trait)] 115 | impl InternalIterator for TableIterator { 116 | fn at_end(&self) -> bool { 117 | if self.reverse { 118 | self.block_pos < -1 119 | } else { 120 | self.block_pos > self.table.index.num_data_blocks() as i64 121 | } 122 | } 123 | 124 | fn get_key(&self) -> &[u8] { 125 | &self.key 126 | } 127 | 128 | fn get_seq_number(&self) -> SeqNumber { 129 | self.entry.get_sequence_number() 130 | } 131 | 132 | #[cfg(feature = "wisckey")] 133 | async fn get_entry(&self, value_log: &ValueLog) -> Option { 134 | match self.entry.get_type() { 135 | DataEntryType::Put => Some(EntryRef::SortedTable { 136 | value_ref: value_log 137 | .get_ref(self.entry.get_value_id().unwrap()) 138 | .await 139 | .expect("No such value?"), 140 | entry: self.entry.clone(), 141 | }), 142 | DataEntryType::Delete => None, 143 | } 144 | } 145 | 146 | #[cfg(not(feature = "wisckey"))] 147 | fn get_entry(&self) -> Option { 148 | match self.entry.get_type() { 149 | DataEntryType::Put => Some(EntryRef::SortedTable { 150 | entry: self.entry.clone(), 151 | }), 152 | DataEntryType::Delete => None, 153 | } 154 | } 155 | 156 | fn get_entry_type(&self) -> DataEntryType { 157 | self.entry.get_type() 158 | } 159 | 160 | #[tracing::instrument(skip(self))] 161 | async fn step(&mut self) { 162 | if self.reverse { 163 | match self.block_pos.cmp(&(-1)) { 164 | Ordering::Less => { 165 | panic!("Cannot step(); already at end"); 166 | } 167 | Ordering::Equal => self.block_pos -= 1, 168 | Ordering::Greater => { 169 | let block_id 
= self.table.index.get_block_id(self.block_pos as usize); 170 | let block = self.table.data_blocks.get_block(&block_id).await; 171 | 172 | let (key, entry) = DataBlock::get_entry_at_index(&block, self.block_offset); 173 | 174 | self.key = key; 175 | self.entry = entry; 176 | 177 | // At the end of the block? 178 | if self.block_offset == 0 { 179 | self.block_pos -= 1; 180 | 181 | if self.block_pos >= 0 { 182 | let block_id = self.table.index.get_block_id(self.block_pos as usize); 183 | let block = self.table.data_blocks.get_block(&block_id).await; 184 | self.block_offset = block.get_num_entries() - 1; 185 | } else { 186 | self.block_offset = 0; 187 | } 188 | } else { 189 | self.block_offset -= 1; 190 | } 191 | } 192 | } 193 | } else { 194 | let num_blocks = self.table.index.num_data_blocks() as i64; 195 | match self.block_pos.cmp(&num_blocks) { 196 | Ordering::Equal => { 197 | self.block_pos += 1; 198 | return; 199 | } 200 | Ordering::Greater => { 201 | panic!("Cannot step(); already at end"); 202 | } 203 | Ordering::Less => { 204 | let block_id = self.table.index.get_block_id(self.block_pos as usize); 205 | let block = self.table.data_blocks.get_block(&block_id).await; 206 | let byte_len = block.byte_len(); 207 | 208 | let (key, entry) = 209 | DataBlock::get_entry_at_offset(block, self.block_offset, &self.key); 210 | 211 | let next_offset = entry.get_next_offset(); 212 | 213 | self.key = key; 214 | self.entry = entry; 215 | 216 | // At the end of the block? 
217 | if next_offset >= byte_len { 218 | self.block_pos += 1; 219 | self.block_offset = 0; 220 | } else { 221 | self.block_offset = next_offset; 222 | } 223 | } 224 | } 225 | } 226 | } 227 | } 228 | -------------------------------------------------------------------------------- /src/sorted_table/mod.rs: -------------------------------------------------------------------------------- 1 | use std::sync::Arc; 2 | use std::sync::atomic::{AtomicBool, AtomicI32, Ordering as AtomicOrdering}; 3 | 4 | use crate::data_blocks::{DataBlock, DataBlocks, DataEntry}; 5 | use crate::index_blocks::IndexBlock; 6 | use crate::{Error, Params}; 7 | 8 | mod iterator; 9 | pub use iterator::{InternalIterator, TableIterator}; 10 | 11 | mod builder; 12 | pub use builder::TableBuilder; 13 | 14 | #[cfg(test)] 15 | mod tests; 16 | 17 | pub type TableId = u64; 18 | 19 | /// Entries at each level are grouped into sorted tables 20 | /// These tables contain an ordered set of key/value-pairs 21 | /// 22 | /// Except for level 0, sorted tables do not overlap others on the same level 23 | pub struct SortedTable { 24 | identifier: TableId, 25 | /// The index of the table; it holds all relevant metadata 26 | index: IndexBlock, 27 | data_blocks: Arc, 28 | /// Is this table currently being compacted 29 | compaction_flag: AtomicBool, 30 | /// The number of seek operations still allowed on this table before compaction is triggered 31 | /// This improves read performance for heavily queried keys 32 | allowed_seeks: AtomicI32, 33 | } 34 | 35 | impl SortedTable { 36 | pub async fn load( 37 | identifier: TableId, 38 | data_blocks: Arc, 39 | params: &Params, 40 | ) -> Result { 41 | let index = IndexBlock::load(params, identifier).await?; 42 | 43 | let allowed_seeks = if let Some(count) = params.seek_based_compaction { 44 | ((index.get_size() / 1024).max(1) as i32) * (count as i32) 45 | } else { 46 | 0 47 | }; 48 | 49 | Ok(Self { 50 | identifier, 51 | index, 52 | data_blocks, 53 | allowed_seeks: AtomicI32::new(allowed_seeks), 54
| compaction_flag: AtomicBool::new(false), 55 | }) 56 | } 57 | 58 | /// Checks if seek-based compaction should be triggered for this table 59 | pub fn has_maximum_seeks(&self) -> bool { 60 | self.allowed_seeks.load(AtomicOrdering::SeqCst) <= 0 61 | } 62 | 63 | /// Returns false if another task is already compacting this table 64 | pub fn maybe_start_compaction(&self) -> bool { 65 | let order = AtomicOrdering::SeqCst; 66 | let result = self 67 | .compaction_flag 68 | .compare_exchange(false, true, order, order); 69 | 70 | result.is_ok() 71 | } 72 | 73 | /// Compaction has failed, e.g., due to lock contention 74 | /// Remove the compaction flag 75 | pub fn abort_compaction(&self) { 76 | let prev = self.compaction_flag.swap(false, AtomicOrdering::SeqCst); 77 | assert!(prev, "Compaction flag was not set!"); 78 | } 79 | 80 | /// The table has moved to a new level during compaction and will be 81 | /// reused. Remove the compaction marker. 82 | pub fn finish_fast_compaction(&self) { 83 | let prev = self.compaction_flag.swap(false, AtomicOrdering::SeqCst); 84 | assert!(prev, "Compaction flag was not set!"); 85 | } 86 | 87 | pub fn get_id(&self) -> TableId { 88 | self.identifier 89 | } 90 | 91 | /// Get the size of this table (in bytes) 92 | pub fn get_size(&self) -> usize { 93 | self.index.get_size() 94 | } 95 | 96 | /// Get the minimum key of this table 97 | pub fn get_min(&self) -> &[u8] { 98 | self.index.get_min() 99 | } 100 | 101 | /// Get the maximum key of this table 102 | pub fn get_max(&self) -> &[u8] { 103 | self.index.get_max() 104 | } 105 | 106 | /// Gets an entry for particular key in this table 107 | /// Returns None if no entry for the key exists 108 | #[tracing::instrument(skip(self, key))] 109 | pub async fn get(&self, key: &[u8]) -> Option { 110 | self.allowed_seeks.fetch_sub(1, AtomicOrdering::Relaxed); 111 | 112 | let block_id = self.index.binary_search(key)?; 113 | let block = self.data_blocks.get_block(&block_id).await; 114 | 115 | 
DataBlock::get_by_key(&block, key) 116 | } 117 | 118 | /// Check if this table overlaps with the specified range 119 | /// 120 | /// min and max are both inclusive 121 | #[inline(always)] 122 | pub fn overlaps(&self, min: &[u8], max: &[u8]) -> bool { 123 | self.get_max() >= min && self.get_min() <= max 124 | } 125 | } 126 | -------------------------------------------------------------------------------- /src/sorted_table/tests.rs: -------------------------------------------------------------------------------- 1 | use super::*; 2 | 3 | use crate::manifest::Manifest; 4 | 5 | use tempfile::tempdir; 6 | 7 | #[cfg(feature = "tokio-uring")] 8 | use kioto_uring_executor::test as async_test; 9 | 10 | #[cfg(feature = "monoio")] 11 | use monoio::test as async_test; 12 | 13 | #[cfg(not(feature = "_async-io"))] 14 | use tokio::test as async_test; 15 | /// Forward iteration over a two-entry table whose values are referenced by `ValueId` 16 | #[cfg(feature = "wisckey")] 17 | #[async_test] 18 | async fn iterate() { 19 | let dir = tempdir().unwrap(); 20 | let params = Params { 21 | db_path: dir.path().to_path_buf(), 22 | ..Default::default() 23 | }; 24 | 25 | let params = Arc::new(params); 26 | let manifest = Arc::new(Manifest::new(params.clone()).await); 27 | 28 | let data_blocks = Arc::new(DataBlocks::new(params.clone(), manifest)); 29 | 30 | let key1 = vec![5]; 31 | let key2 = vec![15]; 32 | 33 | let value_id1 = (4, 2); 34 | let value_id2 = (4, 50); 35 | 36 | let id = 124234; 37 | let mut builder = TableBuilder::new(id, &params, data_blocks, key1.clone(), key2.clone()); 38 | 39 | builder.add_value(&key1, 1, value_id1).await.unwrap(); 40 | 41 | builder.add_value(&key2, 4, value_id2).await.unwrap(); 42 | 43 | let table = builder.finish().await.unwrap(); 44 | 45 | let mut iter = TableIterator::new(Arc::new(table), false).await; 46 | 47 | assert!(!iter.at_end()); 48 | assert_eq!(iter.get_key(), &key1); 49 | assert_eq!(iter.get_value_id(), Some(value_id1)); 50 | 51 | iter.step().await; 52 | 53 | assert!(!iter.at_end()); 54 | assert_eq!(iter.get_key(), &key2); 55 |
assert_eq!(iter.get_value_id(), Some(value_id2)); 56 | 57 | iter.step().await; 58 | 59 | assert!(iter.at_end()); 60 | } 61 | /// Forward iteration over a two-entry table whose values are stored inline (no value log) 62 | #[cfg(not(feature = "wisckey"))] 63 | #[async_test] 64 | async fn iterate() { 65 | let dir = tempdir().unwrap(); 66 | let params = Params { 67 | db_path: dir.path().to_path_buf(), 68 | ..Default::default() 69 | }; 70 | 71 | let params = Arc::new(params); 72 | let manifest = Arc::new(Manifest::new(params.clone()).await); 73 | 74 | let data_blocks = Arc::new(DataBlocks::new(params.clone(), manifest)); 75 | 76 | let key1 = vec![5, 10, 3]; 77 | let key2 = vec![15, 10, 3]; 78 | 79 | let value1 = vec![4, 2]; 80 | let value2 = vec![4, 50]; 81 | 82 | let id = 124234; 83 | let mut builder = TableBuilder::new(id, &params, data_blocks, key1.clone(), key2.clone()); 84 | 85 | builder.add_value(&key1, 1, &value1).await.unwrap(); 86 | 87 | builder.add_value(&key2, 4, &value2).await.unwrap(); 88 | 89 | let table = Arc::new(builder.finish().await.unwrap()); 90 | 91 | let mut iter = TableIterator::new(table, false).await; 92 | 93 | assert!(!iter.at_end()); 94 | assert_eq!(iter.get_key(), &key1); 95 | assert_eq!(iter.get_entry().unwrap().get_value(), &value1); 96 | 97 | iter.step().await; 98 | 99 | assert!(!iter.at_end()); 100 | assert_eq!(iter.get_key(), &key2); 101 | assert_eq!(iter.get_entry().unwrap().get_value(), &value2); 102 | 103 | iter.step().await; 104 | 105 | assert!(iter.at_end()); 106 | } 107 | 108 | #[cfg(not(feature = "wisckey"))] 109 | #[async_test] 110 | async fn reverse_iterate() { 111 | let dir = tempdir().unwrap(); 112 | let params = Params { 113 | db_path: dir.path().to_path_buf(), 114 | ..Default::default() 115 | }; 116 | 117 | let params = Arc::new(params); 118 | let manifest = Arc::new(Manifest::new(params.clone()).await); 119 | 120 | let data_blocks = Arc::new(DataBlocks::new(params.clone(), manifest)); 121 | 122 | let key1 = vec![5, 10, 3]; 123 | let key2 = vec![15, 10, 3]; 124 | 125 | let value1 = vec![4, 2]; 126 | let value2 =
vec![4, 50]; 127 | 128 | let id = 124234; 129 | let mut builder = TableBuilder::new(id, ¶ms, data_blocks, key1.clone(), key2.clone()); 130 | 131 | builder.add_value(&key1, 1, &value1).await.unwrap(); 132 | 133 | builder.add_value(&key2, 4, &value2).await.unwrap(); 134 | 135 | let table = Arc::new(builder.finish().await.unwrap()); 136 | 137 | let mut iter = TableIterator::new(table, true).await; 138 | 139 | assert!(!iter.at_end()); 140 | assert_eq!(iter.get_key(), &key2); 141 | assert_eq!(iter.get_entry().unwrap().get_value(), &value2); 142 | 143 | iter.step().await; 144 | 145 | assert!(!iter.at_end()); 146 | assert_eq!(iter.get_key(), &key1); 147 | assert_eq!(iter.get_entry().unwrap().get_value(), &value1); 148 | 149 | iter.step().await; 150 | 151 | assert!(iter.at_end()); 152 | } 153 | 154 | #[cfg(feature = "wisckey")] 155 | #[async_test] 156 | async fn iterate_many() { 157 | const COUNT: u32 = 5_000; 158 | 159 | let dir = tempdir().unwrap(); 160 | let params = Params { 161 | db_path: dir.path().to_path_buf(), 162 | ..Default::default() 163 | }; 164 | 165 | let params = Arc::new(params); 166 | let manifest = Arc::new(Manifest::new(params.clone()).await); 167 | 168 | let data_blocks = Arc::new(DataBlocks::new(params.clone(), manifest)); 169 | 170 | let min_key = (0u32).to_le_bytes().to_vec(); 171 | let max_key = COUNT.to_le_bytes().to_vec(); 172 | 173 | let id = 1; 174 | let mut builder = TableBuilder::new(id, ¶ms, data_blocks, min_key, max_key); 175 | 176 | for pos in 0..COUNT { 177 | let key = (pos).to_le_bytes().to_vec(); 178 | let seq_num = (500 + pos) as u64; 179 | 180 | builder.add_value(&key, seq_num, (100, pos)).await.unwrap(); 181 | } 182 | 183 | let table = Arc::new(builder.finish().await.unwrap()); 184 | 185 | let mut iter = TableIterator::new(table, false).await; 186 | 187 | for pos in 0..COUNT { 188 | assert!(!iter.at_end()); 189 | 190 | assert_eq!(iter.get_key(), &pos.to_le_bytes().to_vec()); 191 | assert_eq!(iter.get_value_id(), Some((100, pos))); 
192 | assert_eq!(iter.get_seq_number(), 500 + pos as u64); 193 | 194 | iter.step().await; 195 | } 196 | 197 | assert!(iter.at_end()); 198 | } 199 | -------------------------------------------------------------------------------- /src/tasks.rs: -------------------------------------------------------------------------------- 1 | use std::collections::HashMap; 2 | use std::sync::Arc; 3 | use std::sync::atomic::{AtomicBool, Ordering}; 4 | use std::time::Instant; 5 | 6 | use parking_lot::RwLock; 7 | 8 | use tokio::sync::Notify; 9 | 10 | use crate::Error; 11 | use crate::logic::DbLogic; 12 | 13 | use async_trait::async_trait; 14 | /// A unit of background work; the result of `run` tells the work loop whether any work was done 15 | #[cfg(feature = "_async-io")] 16 | #[async_trait(?Send)] 17 | pub trait Task { 18 | async fn run(&self) -> Result; 19 | } 20 | /// A unit of background work; the result of `run` tells the work loop whether any work was done 21 | #[cfg(not(feature = "_async-io"))] 22 | #[async_trait] 23 | pub trait Task: Sync + Send { 24 | async fn run(&self) -> Result; 25 | } 26 | 27 | #[derive(Debug, PartialEq, Eq, Hash)] 28 | pub enum TaskType { 29 | MemtableCompaction, 30 | LevelCompaction, 31 | } 32 | 33 | struct TaskHandle { 34 | stop_flag: Arc, 35 | task: Box, 36 | update_cond: Arc, 37 | } 38 | 39 | /// This structure manages background tasks 40 | /// Currently there is only compaction, but there might be more in the future 41 | pub struct TaskManager { 42 | stop_flag: Arc, 43 | tasks: HashMap, 44 | } 45 | 46 | /// Holds a group of tasks that do the same thing 47 | /// e.g., all compaction tasks 48 | struct TaskGroup { 49 | condition: Arc, 50 | } 51 | 52 | /// Keeps track of a condition variable shared within a task group 53 | struct UpdateCond { 54 | last_change: RwLock, 55 | condition: Notify, 56 | } 57 | 58 | struct MemtableCompactionTask { 59 | datastore: Arc, 60 | level_update_cond: Arc, 61 | } 62 | 63 | struct LevelCompactionTask { 64 | datastore: Arc, 65 | } 66 | 67 | impl MemtableCompactionTask { 68 | fn new_boxed(datastore: Arc, level_update_cond: Arc) -> Box { 69 | Box::new(Self { 70 | datastore, 71 | level_update_cond, 72 | }) 73 | }
74 | } 75 | 76 | impl LevelCompactionTask { 77 | fn new_boxed(datastore: Arc) -> Box { 78 | Box::new(Self { datastore }) 79 | } 80 | } 81 | 82 | #[cfg_attr(feature="_async-io", async_trait(?Send))] 83 | #[cfg_attr(not(feature = "_async-io"), async_trait)] 84 | impl Task for MemtableCompactionTask { 85 | async fn run(&self) -> Result { 86 | let did_work = self.datastore.do_memtable_compaction().await?; 87 | if did_work { 88 | self.level_update_cond.wake_up(); 89 | } 90 | Ok(did_work) 91 | } 92 | } 93 | 94 | #[cfg_attr(feature="_async-io", async_trait(?Send))] 95 | #[cfg_attr(not(feature = "_async-io"), async_trait)] 96 | impl Task for LevelCompactionTask { 97 | async fn run(&self) -> Result { 98 | Ok(self.datastore.do_level_compaction().await?) 99 | } 100 | } 101 | 102 | impl UpdateCond { 103 | fn new() -> Self { 104 | Self { 105 | last_change: RwLock::new(Instant::now()), 106 | condition: Default::default(), 107 | } 108 | } 109 | 110 | /// Notify the task that there is new work to do 111 | fn wake_up(&self) { 112 | let mut last_change = self.last_change.write(); 113 | *last_change = Instant::now(); 114 | self.condition.notify_one(); 115 | } 116 | } 117 | 118 | impl TaskHandle { 119 | fn new(stop_flag: Arc, update_cond: Arc, task: Box) -> Self { 120 | Self { 121 | stop_flag, 122 | update_cond, 123 | task, 124 | } 125 | } 126 | 127 | #[inline(always)] 128 | fn is_running(&self) -> bool { 129 | !self.stop_flag.load(Ordering::SeqCst) 130 | } 131 | 132 | async fn work_loop(&self) { 133 | log::trace!("Task work loop started"); 134 | let mut last_update = Instant::now(); 135 | 136 | // Indicates whether work was done in the last iteration 137 | let mut idle = false; 138 | 139 | loop { 140 | let now = Instant::now(); 141 | 142 | loop { 143 | let fut = self.update_cond.condition.notified(); 144 | tokio::pin!(fut); 145 | 146 | { 147 | let lchange = self.update_cond.last_change.read(); 148 | 149 | if !self.is_running() || !idle || *lchange > last_update { 150 | break; 151 | } 
152 | 153 | // wait for change to queue and retry 154 | fut.as_mut().enable(); 155 | } 156 | 157 | fut.await; 158 | } 159 | 160 | if !self.is_running() { 161 | break; 162 | } 163 | 164 | let did_work = self.task.run().await.expect("Task failed"); 165 | last_update = now; 166 | 167 | if did_work { 168 | idle = false; 169 | } else { 170 | log::trace!("Task did not do any work"); 171 | idle = true; 172 | } 173 | } 174 | 175 | log::trace!("Task work loop ended"); 176 | } 177 | } 178 | 179 | impl TaskManager { 180 | pub async fn new(datastore: Arc, num_compaction_tasks: usize) -> Self { 181 | let mut tasks = HashMap::default(); 182 | let stop_flag = Arc::new(AtomicBool::new(false)); 183 | 184 | let memtable_update_cond = Arc::new(UpdateCond::new()); 185 | let level_update_cond = Arc::new(UpdateCond::new()); 186 | 187 | #[cfg(feature = "tokio-uring")] 188 | let mut sring = kioto_uring_executor::new_spawn_ring(); 189 | 190 | { 191 | let stop_flag = stop_flag.clone(); 192 | let memtable_update_cond = memtable_update_cond.clone(); 193 | let datastore = datastore.clone(); 194 | let level_update_cond = level_update_cond.clone(); 195 | 196 | #[cfg(feature = "tokio-uring")] 197 | { 198 | kioto_uring_executor::spawn_with(move || { 199 | let hdl = TaskHandle::new( 200 | stop_flag, 201 | memtable_update_cond, 202 | MemtableCompactionTask::new_boxed(datastore, level_update_cond), 203 | ); 204 | Box::pin(async move { hdl.work_loop().await }) 205 | }); 206 | } 207 | 208 | #[cfg(not(feature = "tokio-uring"))] 209 | { 210 | let hdl = TaskHandle::new( 211 | stop_flag, 212 | memtable_update_cond, 213 | MemtableCompactionTask::new_boxed(datastore, level_update_cond), 214 | ); 215 | 216 | cfg_if::cfg_if! 
{ 217 | if #[cfg(feature="monoio")] { 218 | monoio::spawn(async move { hdl.work_loop().await }); 219 | } else { 220 | tokio::spawn(async move { hdl.work_loop().await }); 221 | } 222 | } 223 | } 224 | } 225 | 226 | let task_group = TaskGroup { 227 | condition: memtable_update_cond, 228 | }; 229 | 230 | tasks.insert(TaskType::MemtableCompaction, task_group); 231 | 232 | { 233 | for _ in 0..num_compaction_tasks { 234 | let stop_flag = stop_flag.clone(); 235 | let level_update_cond = level_update_cond.clone(); 236 | let datastore = datastore.clone(); 237 | 238 | #[cfg(feature = "tokio-uring")] 239 | { 240 | sring.spawn_with(move || { 241 | let hdl = TaskHandle::new( 242 | stop_flag, 243 | level_update_cond, 244 | LevelCompactionTask::new_boxed(datastore), 245 | ); 246 | Box::pin(async move { hdl.work_loop().await }) 247 | }); 248 | } 249 | 250 | #[cfg(not(feature = "tokio-uring"))] 251 | { 252 | let hdl = TaskHandle::new( 253 | stop_flag, 254 | level_update_cond, 255 | LevelCompactionTask::new_boxed(datastore), 256 | ); 257 | 258 | cfg_if::cfg_if! 
{ 259 | if #[cfg(feature="monoio")] { 260 | monoio::spawn(async move { hdl.work_loop().await }); 261 | } else { 262 | tokio::spawn(async move { hdl.work_loop().await }); 263 | } 264 | } 265 | } 266 | } 267 | 268 | let task_group = TaskGroup { 269 | condition: level_update_cond, 270 | }; 271 | 272 | tasks.insert(TaskType::LevelCompaction, task_group); 273 | } 274 | 275 | Self { stop_flag, tasks } 276 | } 277 | 278 | #[tracing::instrument(skip(self))] 279 | pub fn wake_up(&self, task_type: &TaskType) { 280 | let task_group = self.tasks.get(task_type).expect("No such task"); 281 | task_group.condition.wake_up(); 282 | } 283 | /// Synchronously asks every background task to shut down (non-async counterpart of `stop_all`) 284 | pub fn terminate(&self) { 285 | self.stop_flag.store(true, Ordering::SeqCst); 286 | // Wake all waiters, not just one: a group may hold several tasks and each must observe the stop flag 287 | for (_, task_group) in self.tasks.iter() { 288 | task_group.condition.condition.notify_waiters(); 289 | } 290 | } 291 | /// Sets the stop flag and wakes every waiting task so that the work loops can exit 292 | pub async fn stop_all(&self) -> Result<(), Error> { 293 | log::trace!("Stopping all background tasks"); 294 | 295 | self.stop_flag.store(true, Ordering::SeqCst); 296 | 297 | for (_, task_group) in self.tasks.iter() { 298 | task_group.condition.condition.notify_waiters(); 299 | } 300 | 301 | Ok(()) 302 | } 303 | } 304 | -------------------------------------------------------------------------------- /src/values/batch.rs: -------------------------------------------------------------------------------- 1 | use std::sync::Arc; 2 | 3 | use crate::Error; 4 | use crate::disk; 5 | use crate::values::{ValueBatchId, ValueId, ValueLog, ValueOffset, ValueRef}; 6 | 7 | use zerocopy::{FromBytes, Immutable, IntoBytes, KnownLayout}; 8 | 9 | /** 10 | * The layout is as follows: 11 | * - value batch header 12 | * - offsets (padded to WORD_SIZE) 13 | * - value entries 14 | */ 15 | #[derive(Debug)] 16 | pub(super) struct ValueBatch { 17 | data: Vec, 18 | } 19 | 20 | pub struct ValueBatchBuilder<'a> { 21 | vlog: &'a ValueLog, 22 | identifier: ValueBatchId, 23 | /// The locations of the values within this block 24 | offsets: Vec, 25 | /// The value data
26 | value_data: Vec, 27 | } 28 | 29 | #[derive(Debug, KnownLayout, Immutable, IntoBytes, FromBytes)] 30 | #[repr(C, packed)] 31 | pub(super) struct ValueBatchHeader { 32 | pub num_values: u32, 33 | } 34 | 35 | pub const BATCH_HEADER_LEN: usize = std::mem::size_of::(); 36 | 37 | #[derive(Debug, KnownLayout, Immutable, IntoBytes, FromBytes)] 38 | #[repr(C, packed)] 39 | pub(super) struct ValueEntryHeader { 40 | pub key_length: u32, 41 | pub value_length: u32, 42 | } 43 | 44 | impl<'a> ValueBatchBuilder<'a> { 45 | pub fn new(identifier: ValueBatchId, vlog: &'a ValueLog) -> Self { 46 | Self { 47 | identifier, 48 | vlog, 49 | value_data: vec![], 50 | offsets: vec![], 51 | } 52 | } 53 | 54 | /// Add another value to this batch 55 | pub async fn add_entry(&mut self, key: &[u8], val: &[u8]) -> ValueId { 56 | // Add padding (if needed) 57 | let offset = crate::pad_offset(self.value_data.len()); 58 | assert!(offset >= self.value_data.len()); 59 | self.value_data.resize(offset, 0u8); 60 | 61 | self.offsets.extend_from_slice((offset as u32).as_bytes()); 62 | 63 | let entry_header = ValueEntryHeader { 64 | key_length: key.len().try_into().expect("Key is too long"), 65 | value_length: val.len().try_into().expect("Value is too long"), 66 | }; 67 | 68 | self.value_data.extend_from_slice(entry_header.as_bytes()); 69 | self.value_data.extend_from_slice(key); 70 | self.value_data.extend_from_slice(val); 71 | 72 | (self.identifier, offset as u32) 73 | } 74 | 75 | /// Create the batch and write it to disk 76 | pub async fn finish(mut self) -> Result { 77 | let num_values = (self.offsets.len() / size_of::()) as u32; 78 | 79 | let header = ValueBatchHeader { num_values }; 80 | 81 | crate::add_padding(&mut self.offsets); 82 | 83 | let mut data = header.as_bytes().to_vec(); 84 | let file_path = self.vlog.get_batch_file_path(&self.identifier); 85 | 86 | data.extend_from_slice(&self.value_data); 87 | disk::write(&file_path, &data) 88 | .await 89 | .map_err(|err| Error::from_io_error("Failed 
to write value log batch", err))?; 90 | 91 | let batch = Arc::new(ValueBatch { data }); 92 | 93 | // Store in the cache so we don't have to load immediately 94 | { 95 | let shard_id = ValueLog::batch_to_shard_id(self.identifier); 96 | let mut shard = self.vlog.batch_caches[shard_id].lock().await; 97 | shard.put(self.identifier, batch); 98 | } 99 | 100 | self.vlog 101 | .index 102 | .add_batch(self.identifier, num_values as usize) 103 | .await?; 104 | 105 | log::trace!("Created value batch #{}", self.identifier); 106 | Ok(self.identifier) 107 | } 108 | } 109 | 110 | impl ValueBatch { 111 | pub fn from_existing(data: Vec) -> Self { 112 | Self { data } 113 | } 114 | 115 | pub fn get_ref(self_ptr: Arc, pos: ValueOffset) -> ValueRef { 116 | let mut offset = pos as usize; 117 | let data = &self_ptr.get_value_data()[offset..]; 118 | 119 | let (vheader, _) = ValueEntryHeader::ref_from_prefix(data).unwrap(); 120 | 121 | offset += size_of::(); 122 | offset += vheader.key_length as usize; 123 | 124 | ValueRef { 125 | length: vheader.value_length as usize, 126 | batch: self_ptr, 127 | offset, 128 | } 129 | } 130 | /// Returns owned `(key, value)` copies of the entries starting at `offsets` (positions within the value data); panics if an offset does not point at a complete entry 131 | pub fn get_entries(&self, offsets: &[ValueOffset]) -> Vec<(Vec<u8>, Vec<u8>)> { 132 | offsets 133 | .iter() 134 | .map(|offset| { 135 | // `entry` begins at the entry header, so every range below is relative to it 136 | let entry = &self.get_value_data()[*offset as usize..]; 137 | 138 | let (vheader, _) = ValueEntryHeader::ref_from_prefix(entry).unwrap(); 139 | 140 | let key_start = size_of::<ValueEntryHeader>(); 141 | let key_end = key_start + vheader.key_length as usize; 142 | let value_end = key_end + vheader.value_length as usize; 143 | 144 | let key = entry[key_start..key_end].to_vec(); 145 | let value = entry[key_end..value_end].to_vec(); 146 | 147 | (key, value) 148 | }) 149 | .collect() 150 | } 151 | 152 | /// Access the raw data of this batch 153 | #[inline] 154 | pub(super) fn get_value_data(&self) -> &[u8] { 155 | &self.data[BATCH_HEADER_LEN..]
156 | } 157 | 158 | #[inline] 159 | fn get_header(&self) -> &ValueBatchHeader { 160 | ValueBatchHeader::ref_from_prefix(&self.data[..]).unwrap().0 161 | } 162 | 163 | /// The number of all values in this batch, even deleted ones 164 | #[allow(dead_code)] 165 | pub fn total_num_values(&self) -> u32 { 166 | self.get_header().num_values 167 | } 168 | } 169 | -------------------------------------------------------------------------------- /src/values/mod.rs: -------------------------------------------------------------------------------- 1 | use std::num::NonZeroUsize; 2 | use std::sync::Arc; 3 | 4 | use tokio::sync::Mutex; 5 | 6 | use crate::Error; 7 | 8 | use lru::LruCache; 9 | 10 | use crate::Params; 11 | use crate::disk; 12 | use crate::manifest::Manifest; 13 | 14 | pub type ValueOffset = u32; 15 | pub type ValueBatchId = u64; 16 | 17 | pub const MIN_VALUE_BATCH_ID: ValueBatchId = 1; 18 | 19 | pub type ValueId = (ValueBatchId, ValueOffset); 20 | 21 | const NUM_SHARDS: NonZeroUsize = NonZeroUsize::new(16).unwrap(); 22 | 23 | pub const GARBAGE_COLLECT_THRESHOLD: f64 = 0.2; 24 | 25 | type BatchShard = LruCache>; 26 | 27 | #[cfg(test)] 28 | mod tests; 29 | 30 | mod index; 31 | pub use index::{MIN_VALUE_INDEX_PAGE_ID, ValueIndex, ValueIndexPageId}; 32 | 33 | mod batch; 34 | use batch::ValueBatch; 35 | pub use batch::ValueBatchBuilder; 36 | 37 | use crate::EntryList; 38 | use crate::wal::{LogEntry, WriteAheadLog}; 39 | 40 | pub struct ValueLog { 41 | /// The value log uses the write-ahead log 42 | /// to batch updates to its index 43 | wal: Arc, 44 | 45 | /// The value_index keeps track of all used entries within 46 | /// the value log and helps to garbage collect and 47 | /// defragment 48 | index: ValueIndex, 49 | 50 | /// Sharded LRU caches of value batches 51 | batch_caches: Vec>, 52 | 53 | params: Arc, 54 | manifest: Arc, 55 | } 56 | 57 | pub struct ValueRef { 58 | batch: Arc, 59 | offset: usize, 60 | length: usize, 61 | } 62 | 63 | impl ValueRef { 64 | pub fn
get_value(&self) -> &[u8] { 65 | &self.batch.get_value_data()[self.offset..self.offset + self.length] 66 | } 67 | } 68 | 69 | impl ValueLog { 70 | fn init_caches(params: &Params) -> Vec> { 71 | let max_value_files = NonZeroUsize::new(params.max_open_files / 2) 72 | .expect("Max open files needs to be greater than 2"); 73 | 74 | let shard_size = NonZeroUsize::new(max_value_files.get() / NUM_SHARDS) 75 | .expect("Not enough open files to support the number of shards"); 76 | 77 | (0..NUM_SHARDS.get()) 78 | .map(|_| Mutex::new(BatchShard::new(shard_size))) 79 | .collect() 80 | } 81 | 82 | pub async fn new( 83 | wal: Arc, 84 | params: Arc, 85 | manifest: Arc, 86 | ) -> Result { 87 | let batch_caches = Self::init_caches(¶ms); 88 | let index = ValueIndex::new(params.clone(), manifest.clone()).await?; 89 | 90 | Ok(Self { 91 | wal, 92 | index, 93 | params, 94 | manifest, 95 | batch_caches, 96 | }) 97 | } 98 | 99 | pub async fn open( 100 | wal: Arc, 101 | params: Arc, 102 | manifest: Arc, 103 | index: ValueIndex, 104 | to_delete: Vec, 105 | ) -> Result { 106 | let batch_caches = Self::init_caches(¶ms); 107 | let obj = Self { 108 | wal, 109 | index, 110 | params, 111 | manifest, 112 | batch_caches, 113 | }; 114 | 115 | for batch_id in to_delete.into_iter() { 116 | obj.remove_batch_from_disk(batch_id).await?; 117 | } 118 | 119 | Ok(obj) 120 | } 121 | 122 | /// Marks a value as unused and, potentially, removes old value batches 123 | /// On success, this might return a list of entries to reinsert in order to defragment the log 124 | #[tracing::instrument(skip(self))] 125 | pub async fn mark_value_deleted(&self, vid: ValueId) -> Result { 126 | let (page_id, page_offset) = self.index.mark_value_as_deleted(vid).await?; 127 | self.wal 128 | .store(&[LogEntry::DeleteValue(page_id, page_offset)]) 129 | .await?; 130 | 131 | let res = if self.try_to_remove(page_id).await? 
{ 132 | vec![] 133 | } else { 134 | self.try_to_compact(page_id).await?.unwrap_or_else(Vec::new) 135 | }; 136 | 137 | Ok(res) 138 | } 139 | 140 | /// Attempts to delete empty batches 141 | #[tracing::instrument(skip(self))] 142 | async fn try_to_remove(&self, batch_id: ValueBatchId) -> Result { 143 | log::trace!("Checking if value batch #{batch_id} can be removed"); 144 | 145 | let num_active = self.index.count_active_entries(batch_id).await; 146 | 147 | // Can only remove if no values in this batch are active 148 | if num_active > 0 { 149 | return Ok(false); 150 | } 151 | 152 | log::trace!("Deleting empty batch #{batch_id}"); 153 | self.index.mark_batch_as_deleted(batch_id).await?; 154 | 155 | // Hold lock so nobody else messes with the file while we do this 156 | self.remove_batch_from_disk(batch_id).await?; 157 | Ok(true) 158 | } 159 | 160 | async fn remove_batch_from_disk(&self, batch_id: ValueBatchId) -> Result<(), Error> { 161 | let shard_id = Self::batch_to_shard_id(batch_id); 162 | let mut cache = self.batch_caches[shard_id].lock().await; 163 | let fpath = self.get_batch_file_path(&batch_id); 164 | disk::remove_file(&fpath) 165 | .await 166 | .map_err(|err| Error::from_io_error("Failed to remove value log batch", err))?; 167 | cache.pop(&batch_id); 168 | 169 | // Can we remove entries entirely? 
170 | let min_batch = self 171 | .manifest 172 | .get_minimum_value_batch() 173 | .await 174 | .max(MIN_VALUE_BATCH_ID); 175 | 176 | // We can only completely remove batches starting from the oldest one 177 | // Instead, we "empty" the batch, reducing its size to a single on-disk page 178 | if batch_id > min_batch { 179 | return Ok(()); 180 | } 181 | 182 | let most_recent = self.manifest.get_most_recent_value_batch_id().await; 183 | let mut new_minimum = batch_id; 184 | 185 | while new_minimum < most_recent { 186 | if self.index.count_active_entries(batch_id).await > 0 { 187 | break; 188 | } 189 | new_minimum += 1; 190 | } 191 | 192 | log::debug!("Full removed {} value batches", new_minimum - batch_id + 1); 193 | self.manifest.set_minimum_value_batch_id(new_minimum).await; 194 | 195 | Ok(()) 196 | } 197 | 198 | /// Check if we should reinsert entries from this batch 199 | #[tracing::instrument(skip(self))] 200 | async fn try_to_compact(&self, batch_id: ValueBatchId) -> Result, Error> { 201 | log::trace!("Checking if value batch #{batch_id} should be compacted (reinserted)"); 202 | 203 | let batch = self.get_batch(batch_id).await?; 204 | let num_entries = batch.total_num_values() as usize; 205 | let num_active = self.index.count_active_entries(batch_id).await; 206 | let active_ratio = (num_active * 100) / (num_entries * 100); 207 | 208 | if active_ratio < 25 { 209 | log::trace!("Re-inserting sparse value batch #{batch_id}"); 210 | let offsets = self.index.get_active_entries(batch_id).await; 211 | self.index.mark_batch_as_compacted(batch_id).await?; 212 | 213 | Ok(Some(batch.get_entries(&offsets))) 214 | } else { 215 | Ok(None) 216 | } 217 | } 218 | 219 | #[inline] 220 | fn batch_to_shard_id(batch_id: ValueBatchId) -> usize { 221 | (batch_id as usize) % NUM_SHARDS 222 | } 223 | 224 | #[inline] 225 | fn get_batch_file_path(&self, batch_id: &ValueBatchId) -> std::path::PathBuf { 226 | self.params.db_path.join(format!("val{batch_id:08}.data")) 227 | } 228 | 229 | pub 
async fn make_batch(&self) -> ValueBatchBuilder<'_> { 230 | let identifier = self.manifest.generate_next_value_batch_id().await; 231 | ValueBatchBuilder::new(identifier, self) 232 | } 233 | 234 | #[tracing::instrument(skip(self))] 235 | async fn get_batch(&self, identifier: ValueBatchId) -> Result, Error> { 236 | let shard_id = Self::batch_to_shard_id(identifier); 237 | let mut cache = self.batch_caches[shard_id].lock().await; 238 | 239 | if let Some(batch) = cache.get(&identifier) { 240 | Ok(batch.clone()) 241 | } else { 242 | log::trace!("Loading value batch #{identifier} from disk"); 243 | 244 | let data = disk::read(&self.get_batch_file_path(&identifier), 0) 245 | .await 246 | .map_err(|err| Error::from_io_error("Failed to read value log batch", err))?; 247 | 248 | let obj = Arc::new(ValueBatch::from_existing(data)); 249 | cache.put(identifier, obj.clone()); 250 | 251 | Ok(obj) 252 | } 253 | } 254 | 255 | /// Return the reference to a value 256 | pub async fn get_ref(&self, value_ref: ValueId) -> Result { 257 | log::trace!("Getting value at {value_ref:?}"); 258 | 259 | let (id, offset) = value_ref; 260 | let batch = self.get_batch(id).await?; 261 | 262 | Ok(ValueBatch::get_ref(batch, offset)) 263 | } 264 | 265 | pub async fn sync(&self) -> Result<(), Error> { 266 | self.index.sync().await 267 | } 268 | } 269 | -------------------------------------------------------------------------------- /src/values/tests.rs: -------------------------------------------------------------------------------- 1 | #[cfg(feature = "tokio-uring")] 2 | use kioto_uring_executor::test as async_test; 3 | 4 | #[cfg(feature = "monoio")] 5 | use monoio::test as async_test; 6 | 7 | #[cfg(not(feature = "_async-io"))] 8 | use tokio::test as async_test; 9 | 10 | use super::*; 11 | 12 | use tempfile::{Builder, TempDir}; 13 | 14 | async fn test_init() -> (TempDir, ValueLog) { 15 | let tmp_dir = Builder::new() 16 | .prefix("lsm-value-log-test-") 17 | .tempdir() 18 | .unwrap(); 19 | let _ = 
env_logger::builder().is_test(true).try_init(); 20 | 21 | let params = Params { 22 | db_path: tmp_dir.path().to_path_buf(), 23 | ..Default::default() 24 | }; 25 | 26 | let params = Arc::new(params); 27 | let wal = Arc::new(WriteAheadLog::new(params.clone()).await.unwrap()); 28 | let manifest = Arc::new(Manifest::new(params.clone()).await); 29 | 30 | (tmp_dir, ValueLog::new(wal, params, manifest).await.unwrap()) 31 | } 32 | 33 | #[async_test] 34 | async fn delete_batch() { 35 | const SIZE: usize = 1_000; 36 | 37 | let (_tmpdir, values) = test_init().await; 38 | let mut builder = values.make_batch().await; 39 | 40 | let key = "hello".as_bytes().to_vec(); 41 | let value = vec![b'a'; SIZE]; 42 | 43 | let vid = builder.add_entry(&key, &value).await; 44 | 45 | let batch_id = builder.finish().await.unwrap(); 46 | let batch = values.get_batch(batch_id).await.unwrap(); 47 | 48 | assert_eq!(batch.total_num_values(), 1); 49 | 50 | values.mark_value_deleted(vid).await.unwrap(); 51 | 52 | let result = values.get_batch(batch_id).await; 53 | assert!(result.is_err()); 54 | } 55 | 56 | #[async_test] 57 | async fn get_put_many() { 58 | let (_tmpdir, values) = test_init().await; 59 | 60 | let mut builder = values.make_batch().await; 61 | let mut vids = vec![]; 62 | 63 | for pos in 0..1000u32 { 64 | let key = format!("key_{pos}").as_bytes().to_vec(); 65 | let value = format!("Number {pos}").into_bytes(); 66 | let vid = builder.add_entry(&key, &value).await; 67 | vids.push(vid); 68 | } 69 | 70 | builder.finish().await.unwrap(); 71 | 72 | for (pos, vid) in vids.iter().enumerate() { 73 | let value = format!("Number {pos}").into_bytes(); 74 | 75 | let result = values.get_ref(*vid).await.unwrap(); 76 | assert_eq!(result.get_value(), value); 77 | } 78 | } 79 | 80 | #[async_test] 81 | async fn get_put_large_value() { 82 | let (_tmpdir, values) = test_init().await; 83 | 84 | const SIZE: usize = 1_000_000; 85 | let mut builder = values.make_batch().await; 86 | 87 | let key = 
/// Replays the log from the reader's current position (wisckey build).
///
/// Every entry starts with a one-byte type tag. Write entries are re-applied
/// to `memtable`; value and batch deletions are re-applied to `value_index`.
/// Replay stops as soon as no further tag can be read.
///
/// # Panics
/// Panics on an unknown entry type tag.
#[cfg(feature = "wisckey")]
pub async fn run(
    &mut self,
    memtable: &mut Memtable,
    value_index: &mut ValueIndex,
) -> Result<RecoveryResult, Error> {
    const WRITE: u8 = LogEntryType::Write as u8;
    const DELETE_VALUE: u8 = LogEntryType::DeleteValue as u8;
    const DELETE_BATCH: u8 = LogEntryType::DeleteBatch as u8;

    let mut result = RecoveryResult::default();
    let mut tag = [0u8; 1];

    // Re-insert ops into memtable
    // (`maybe = true`: a missing next log file at this point just ends the replay)
    while self.read_from_log(&mut tag[..], true).await? {
        match tag[0] {
            WRITE => self.parse_write_entry(memtable).await?,
            DELETE_VALUE => self.parse_value_deletion_entry(value_index).await?,
            DELETE_BATCH => self.parse_batch_deletion_entry(value_index).await?,
            other => panic!("Unexpected log entry type! {}", other),
        }

        result.entries_recovered += 1;
    }

    log::debug!(
        "Found {} entries in write-ahead log",
        result.entries_recovered
    );
    result.new_position = self.position;
    Ok(result)
}
100 | } else { 101 | panic!("Unexpected log entry type!"); 102 | } 103 | 104 | result.entries_recovered += 1; 105 | } 106 | 107 | log::debug!( 108 | "Found {} entries in write-ahead log", 109 | result.entries_recovered 110 | ); 111 | result.new_position = self.position; 112 | Ok(result) 113 | } 114 | 115 | async fn parse_write_entry(&mut self, memtable: &mut Memtable) -> Result<(), Error> { 116 | let op_type: u8 = self.read_value().await?; 117 | let key_len: u64 = self.read_value().await?; 118 | 119 | let mut key = vec![0; key_len as usize]; 120 | self.read_from_log(&mut key, false).await?; 121 | 122 | if op_type == WriteOp::PUT_OP { 123 | let val_len: u64 = self.read_value().await?; 124 | let mut value = vec![0; val_len as usize]; 125 | self.read_from_log(&mut value, false).await?; 126 | memtable.put(key, value); 127 | } else if op_type == WriteOp::DELETE_OP { 128 | memtable.delete(key); 129 | } else { 130 | panic!("Unexpected op type!"); 131 | } 132 | 133 | Ok(()) 134 | } 135 | 136 | /// Fetches a value from the current position at the log 137 | /// and advances the position 138 | /// 139 | /// This might open the next page of the log, if needed 140 | async fn read_value(&mut self) -> Result { 141 | let mut data = vec![0u8; std::mem::size_of::()]; 142 | self.read_from_log(&mut data, false).await?; 143 | Ok(T::read_from_bytes(&data).unwrap()) 144 | } 145 | 146 | #[cfg(feature = "wisckey")] 147 | async fn parse_value_deletion_entry( 148 | &mut self, 149 | value_index: &mut ValueIndex, 150 | ) -> Result<(), Error> { 151 | let page_id = self.read_value().await?; 152 | let offset = self.read_value().await?; 153 | value_index.mark_value_as_deleted_at(page_id, offset).await; 154 | 155 | Ok(()) 156 | } 157 | 158 | #[cfg(feature = "wisckey")] 159 | async fn parse_batch_deletion_entry( 160 | &mut self, 161 | value_index: &mut ValueIndex, 162 | ) -> Result<(), Error> { 163 | let page_id = self.read_value().await?; 164 | let offset = self.read_value().await?; 165 | 
value_index 166 | .mark_batch_as_deleted_at(page_id, offset) 167 | .await?; 168 | Ok(()) 169 | } 170 | 171 | /// Read the next entry from the log 172 | /// (only used during recovery) 173 | /// 174 | /// TODO: Change this to just fetch an entire page at a time 175 | async fn read_from_log(&mut self, out: &mut [u8], maybe: bool) -> Result { 176 | let buffer_len = out.len(); 177 | let mut buffer_pos = 0; 178 | assert!(buffer_len > 0); 179 | 180 | while buffer_pos < buffer_len { 181 | let offset = self.position % PAGE_SIZE; 182 | let file_remaining = self 183 | .current_page 184 | .len() 185 | .checked_sub(offset) 186 | .expect("Invalid offset. Page too small?"); 187 | let buffer_remaining = buffer_len - buffer_pos; 188 | 189 | let len = buffer_remaining.min(file_remaining); 190 | 191 | if len > 0 { 192 | out[buffer_pos..buffer_pos + len] 193 | .copy_from_slice(&self.current_page[offset..offset + len]); 194 | buffer_pos += len; 195 | self.position += len; 196 | } else if self.position % PAGE_SIZE != 0 { 197 | log::trace!( 198 | "WAL reader is done. Current file was not full; assuming it is the most recent." 199 | ); 200 | assert!(self.current_page.len() < PAGE_SIZE); 201 | return Ok(false); 202 | } 203 | 204 | // Move to next file? 205 | if self.position % PAGE_SIZE == 0 { 206 | let fpos = self.position / PAGE_SIZE; 207 | let fpath = WalWriter::get_file_path(&self.params, fpos); 208 | log::trace!("Opening next log file at {fpath:?}"); 209 | 210 | self.current_page = match disk::read_uncompressed(&fpath, 0).await { 211 | Ok(data) => data, 212 | Err(err) => { 213 | if maybe && err.kind() == std::io::ErrorKind::NotFound { 214 | // At last file but it is still exactly 215 | // one page 216 | log::trace!("WAL reader is done. 
No next log file found"); 217 | return Ok(false); 218 | } else { 219 | return Err(Error::from_io_error("Failed to open WAL file", err)); 220 | } 221 | } 222 | } 223 | } 224 | } 225 | 226 | Ok(true) 227 | } 228 | } 229 | -------------------------------------------------------------------------------- /src/wal/tests.rs: -------------------------------------------------------------------------------- 1 | /// Tests for the write-ahead log, especially its behavior during recovery 2 | use tempfile::TempDir; 3 | 4 | use super::*; 5 | 6 | #[cfg(feature = "wisckey")] 7 | use crate::{manifest::Manifest, values::ValueIndex}; 8 | 9 | #[cfg(feature = "tokio-uring")] 10 | use kioto_uring_executor::test as async_test; 11 | 12 | #[cfg(feature = "monoio")] 13 | use monoio::test as async_test; 14 | 15 | #[cfg(not(feature = "_async-io"))] 16 | use tokio::test as async_test; 17 | 18 | async fn test_init() -> (TempDir, Arc, WriteAheadLog) { 19 | let _ = env_logger::builder().is_test(true).try_init(); 20 | 21 | let tempdir = tempfile::Builder::new() 22 | .prefix("lsm-wal-test-") 23 | .tempdir() 24 | .expect("Failed to create temporary directory"); 25 | 26 | log::debug!("Created tempdir at {:?}", tempdir.path()); 27 | 28 | let params = Arc::new(Params { 29 | db_path: tempdir.path().to_path_buf(), 30 | ..Default::default() 31 | }); 32 | 33 | let wal = WriteAheadLog::new(params.clone()).await.unwrap(); 34 | (tempdir, params, wal) 35 | } 36 | 37 | async fn test_cleanup(tempdir: TempDir, wal: WriteAheadLog) { 38 | // Finish all writes before we stop the tests 39 | wal.stop().await.expect("WAL sync failed"); 40 | 41 | // Ensure that the tempdir is dropped last 42 | drop(wal); 43 | 44 | log::trace!("Removing tempdir at {:?}", tempdir.path()); 45 | drop(tempdir); 46 | } 47 | 48 | #[cfg(feature = "wisckey")] 49 | async fn reopen_wal(params: Arc, offset: u64) -> (Memtable, WriteAheadLog) { 50 | let mut memtable = Memtable::new(0); 51 | 52 | let manifest = 
Arc::new(Manifest::new(params.clone()).await); 53 | let mut freelist = ValueIndex::new(params.clone(), manifest).await.unwrap(); 54 | let (wal, _) = WriteAheadLog::open(params, offset, &mut memtable, &mut freelist) 55 | .await 56 | .unwrap(); 57 | 58 | (memtable, wal) 59 | } 60 | 61 | #[cfg(not(feature = "wisckey"))] 62 | async fn reopen_wal(params: Arc, offset: u64) -> (Memtable, WriteAheadLog) { 63 | let mut memtable = Memtable::new(0); 64 | 65 | let (wal, _) = WriteAheadLog::open(params, offset, &mut memtable) 66 | .await 67 | .unwrap(); 68 | 69 | (memtable, wal) 70 | } 71 | 72 | #[async_test] 73 | async fn empty_sync() { 74 | let (tempdir, _, wal) = test_init().await; 75 | 76 | assert_eq!(wal.inner.status.read().sync_pos, 0); 77 | assert_eq!(wal.inner.status.read().write_pos, 0); 78 | 79 | test_cleanup(tempdir, wal).await; 80 | } 81 | 82 | #[async_test] 83 | async fn write_and_sync() { 84 | let (tempdir, _, wal) = test_init().await; 85 | 86 | let key = vec![1, 2]; 87 | let value = vec![2, 3]; 88 | let op = WriteOp::Put(key.clone(), value.clone()); 89 | 90 | wal.store(&[LogEntry::Write(&op)]).await.unwrap(); 91 | wal.sync().await.unwrap(); 92 | 93 | assert_eq!(wal.inner.status.read().sync_pos, 22); 94 | assert_eq!(wal.inner.status.read().write_pos, 22); 95 | 96 | test_cleanup(tempdir, wal).await; 97 | } 98 | 99 | #[async_test] 100 | async fn write_large_value() { 101 | let (tempdir, _, wal) = test_init().await; 102 | 103 | let key = vec![1, 2]; 104 | let value = vec![1; 2 * PAGE_SIZE]; 105 | let op = WriteOp::Put(key.clone(), value.clone()); 106 | 107 | wal.store(&[LogEntry::Write(&op)]).await.unwrap(); 108 | wal.sync().await.unwrap(); 109 | 110 | assert_eq!(wal.inner.status.read().sync_pos, 8212); 111 | assert_eq!(wal.inner.status.read().write_pos, 8212); 112 | 113 | test_cleanup(tempdir, wal).await; 114 | } 115 | 116 | #[async_test] 117 | async fn reopen() { 118 | let (tempdir, params, wal) = test_init().await; 119 | 120 | let key = vec![1, 2]; 121 | let value 
= vec![2, 3]; 122 | let op = WriteOp::Put(key.clone(), value.clone()); 123 | 124 | wal.store(&[LogEntry::Write(&op)]).await.unwrap(); 125 | wal.sync().await.unwrap(); 126 | drop(wal); 127 | 128 | let (memtable, wal) = reopen_wal(params, 0).await; 129 | assert_eq!(wal.inner.status.read().sync_pos, 22); 130 | assert_eq!(wal.inner.status.read().write_pos, 22); 131 | 132 | let entry = memtable.get(&key).unwrap(); 133 | assert_eq!(entry.get_value(), Some(value).as_deref()); 134 | 135 | test_cleanup(tempdir, wal).await; 136 | } 137 | 138 | #[async_test] 139 | async fn reopen_with_offset1() { 140 | let (tempdir, params, wal) = test_init().await; 141 | 142 | let key1 = vec![1, 2]; 143 | let key2 = vec![1, 2, 3]; 144 | let value = vec![2, 3]; 145 | 146 | let op1 = WriteOp::Put(key1.clone(), value.clone()); 147 | let op2 = WriteOp::Put(key2.clone(), value.clone()); 148 | 149 | wal.store(&[LogEntry::Write(&op1)]).await.unwrap(); 150 | wal.store(&[LogEntry::Write(&op2)]).await.unwrap(); 151 | wal.sync().await.unwrap(); 152 | 153 | drop(wal); 154 | 155 | let (memtable, wal) = reopen_wal(params, 22).await; 156 | 157 | assert_eq!(wal.inner.status.read().sync_pos, 45); 158 | assert_eq!(wal.inner.status.read().write_pos, 45); 159 | 160 | assert!(memtable.get(&key1).is_none()); 161 | let entry = memtable.get(&key2).unwrap(); 162 | assert_eq!(entry.get_value(), Some(value).as_deref()); 163 | 164 | test_cleanup(tempdir, wal).await; 165 | } 166 | 167 | #[async_test] 168 | async fn reopen_with_offset_and_cleanup1() { 169 | let (tempdir, params, wal) = test_init().await; 170 | 171 | let key1 = vec![1, 2]; 172 | let key2 = vec![1, 2, 3]; 173 | let value = vec![2, 3]; 174 | 175 | let op1 = WriteOp::Put(key1.clone(), value.clone()); 176 | let op2 = WriteOp::Put(key2.clone(), value.clone()); 177 | 178 | wal.store(&[LogEntry::Write(&op1)]).await.unwrap(); 179 | wal.store(&[LogEntry::Write(&op2)]).await.unwrap(); 180 | wal.sync().await.unwrap(); 181 | 182 | let offset = 22; 183 | 
wal.set_offset(offset).await; 184 | drop(wal); 185 | 186 | let (memtable, wal) = reopen_wal(params, offset).await; 187 | 188 | assert_eq!(wal.inner.status.read().sync_pos, 45); 189 | assert_eq!(wal.inner.status.read().write_pos, 45); 190 | 191 | assert!(memtable.get(&key1).is_none()); 192 | let entry = memtable.get(&key2).unwrap(); 193 | assert_eq!(entry.get_value(), Some(value).as_deref()); 194 | 195 | test_cleanup(tempdir, wal).await; 196 | } 197 | 198 | #[async_test] 199 | async fn reopen_with_offset_and_cleanup2() { 200 | let (tempdir, params, wal) = test_init().await; 201 | 202 | let key1 = vec![1, 2]; 203 | let key2 = vec![1, 2, 3]; 204 | let value1 = vec![2; 2 * PAGE_SIZE]; 205 | let value2 = vec![2, 3]; 206 | 207 | let op1 = WriteOp::Put(key1.clone(), value1.clone()); 208 | let op2 = WriteOp::Put(key2.clone(), value2.clone()); 209 | 210 | wal.store(&[LogEntry::Write(&op1)]).await.unwrap(); 211 | wal.store(&[LogEntry::Write(&op2)]).await.unwrap(); 212 | wal.sync().await.unwrap(); 213 | 214 | let offset = 8212; 215 | wal.set_offset(offset).await; 216 | 217 | drop(wal); 218 | 219 | let (memtable, wal) = reopen_wal(params, offset).await; 220 | 221 | assert_eq!(wal.inner.status.read().sync_pos, 8235); 222 | assert_eq!(wal.inner.status.read().write_pos, 8235); 223 | 224 | assert!(memtable.get(&key1).is_none()); 225 | let entry = memtable.get(&key2).unwrap(); 226 | assert_eq!(entry.get_value(), Some(value2).as_deref()); 227 | 228 | test_cleanup(tempdir, wal).await; 229 | } 230 | 231 | #[async_test] 232 | async fn reopen_with_offset2() { 233 | let (tempdir, params, wal) = test_init().await; 234 | 235 | let key1 = vec![1, 2]; 236 | let key2 = vec![1, 2, 3]; 237 | let value1 = vec![2; 2 * PAGE_SIZE]; 238 | let value2 = vec![2, 3]; 239 | 240 | let op1 = WriteOp::Put(key1.clone(), value1.clone()); 241 | let op2 = WriteOp::Put(key2.clone(), value2.clone()); 242 | 243 | wal.store(&[LogEntry::Write(&op1)]).await.unwrap(); 244 | 
wal.store(&[LogEntry::Write(&op2)]).await.unwrap(); 245 | wal.sync().await.unwrap(); 246 | 247 | drop(wal); 248 | 249 | let (memtable, wal) = reopen_wal(params, 8212).await; 250 | 251 | assert_eq!(wal.inner.status.read().sync_pos, 8235); 252 | assert_eq!(wal.inner.status.read().write_pos, 8235); 253 | 254 | assert!(memtable.get(&key1).is_none()); 255 | let entry = memtable.get(&key2).unwrap(); 256 | assert_eq!(entry.get_value(), Some(value2).as_deref()); 257 | 258 | test_cleanup(tempdir, wal).await; 259 | } 260 | 261 | #[async_test] 262 | async fn reopen_large_file() { 263 | let (tempdir, params, wal) = test_init().await; 264 | 265 | let key = vec![1, 2]; 266 | let value = vec![2; 2 * PAGE_SIZE]; 267 | let op = WriteOp::Put(key.clone(), value.clone()); 268 | 269 | wal.store(&[LogEntry::Write(&op)]).await.unwrap(); 270 | wal.sync().await.unwrap(); 271 | 272 | drop(wal); 273 | 274 | let (memtable, wal) = reopen_wal(params, 0).await; 275 | 276 | assert_eq!(wal.inner.status.read().sync_pos, 8212); 277 | assert_eq!(wal.inner.status.read().write_pos, 8212); 278 | 279 | let entry = memtable.get(&key).unwrap(); 280 | assert_eq!(entry.get_value(), Some(value).as_deref()); 281 | 282 | test_cleanup(tempdir, wal).await; 283 | } 284 | -------------------------------------------------------------------------------- /src/wal/writer.rs: -------------------------------------------------------------------------------- 1 | use std::path::{Path, PathBuf}; 2 | use std::sync::Arc; 3 | 4 | #[cfg(not(feature = "_async-io"))] 5 | use std::io::Write; 6 | 7 | #[cfg(feature = "tokio-uring")] 8 | use tokio_uring::fs::{File, OpenOptions}; 9 | 10 | #[cfg(feature = "tokio-uring")] 11 | use tokio_uring::buf::BoundedBuf; 12 | 13 | #[cfg(feature = "monoio")] 14 | use monoio::fs::{File, OpenOptions}; 15 | 16 | #[cfg(not(feature = "_async-io"))] 17 | use std::fs::{File, OpenOptions}; 18 | 19 | #[cfg(feature = "monoio")] 20 | use monoio::buf::IoBuf; 21 | 22 | use cfg_if::cfg_if; 23 | 24 | use 
crate::wal::{LogInner, PAGE_SIZE}; 25 | use crate::{Error, Params, disk}; 26 | 27 | /// The task that actually writes the log to disk 28 | pub struct WalWriter { 29 | log_file: File, 30 | position: usize, 31 | params: Arc, 32 | } 33 | 34 | impl WalWriter { 35 | pub async fn new(params: Arc) -> Self { 36 | let log_file = Self::create_file(¶ms, 0).await.unwrap_or_else(|err| { 37 | panic!( 38 | "Failed to create WAL file in directory {:?}: {err}", 39 | params.db_path 40 | ) 41 | }); 42 | 43 | Self { 44 | log_file, 45 | params, 46 | position: 0, 47 | } 48 | } 49 | 50 | /// Start the writer at a specific position after opening a log 51 | pub async fn continue_from(position: usize, params: Arc) -> Self { 52 | let fpos = position / PAGE_SIZE; 53 | 54 | let log_file = if position % PAGE_SIZE == 0 { 55 | // At the beginning of a new file 56 | Self::create_file(¶ms, fpos) 57 | .await 58 | .unwrap_or_else(|err| { 59 | panic!( 60 | "Failed to create WAL file in directory {:?}: {err}", 61 | params.db_path 62 | ) 63 | }) 64 | } else { 65 | Self::open_file(¶ms, fpos).await.unwrap_or_else(|err| { 66 | panic!( 67 | "Failed to open WAL file in directory {:?}: {err}", 68 | params.db_path 69 | ) 70 | }) 71 | }; 72 | 73 | Self { 74 | log_file, 75 | params, 76 | position, 77 | } 78 | } 79 | 80 | pub fn get_file_path(params: &Params, fpos: usize) -> PathBuf { 81 | params 82 | .db_path 83 | .join(Path::new(&format!("log{:08}.data", fpos + 1))) 84 | } 85 | 86 | /// Open an existing log file (used during recovery/restart) 87 | pub async fn open_file(params: &Params, fpos: usize) -> Result { 88 | let fpath = Self::get_file_path(params, fpos); 89 | log::trace!("Opening file at {fpath:?}"); 90 | 91 | cfg_if! 
{ 92 | if #[cfg(feature="_async-io")] { 93 | let log_file = OpenOptions::new() 94 | .read(true).write(true).create(false).truncate(false) 95 | .open(fpath).await?; 96 | } else { 97 | let log_file = OpenOptions::new() 98 | .read(true).write(true).create(false).truncate(false) 99 | .open(fpath)?; 100 | } 101 | } 102 | 103 | Ok(log_file) 104 | } 105 | 106 | /// Returns true if the writer is done and the associated task should terminate 107 | pub async fn update_log(&mut self, inner: &LogInner) -> Result { 108 | let (to_write, sync_flag, sync_pos, new_offset, stop_flag) = loop { 109 | // This works around the following bug: 110 | // https://github.com/rust-lang/rust/issues/63768 111 | let fut = inner.queue_cond.notified(); 112 | tokio::pin!(fut); 113 | 114 | { 115 | let mut lock = inner.status.write(); 116 | let to_write = std::mem::take(&mut lock.queue); 117 | let sync_flag = lock.sync_flag; 118 | let sync_pos = lock.sync_pos; 119 | let stop_flag = lock.stop_flag; 120 | 121 | let new_offset = if lock.offset_pos > lock.flush_pos { 122 | Some((lock.offset_pos, lock.flush_pos)) 123 | } else { 124 | assert_eq!(lock.offset_pos, lock.flush_pos); 125 | None 126 | }; 127 | 128 | // Check whether there is something to do 129 | if !to_write.is_empty() || new_offset.is_some() || sync_flag || stop_flag { 130 | assert_eq!(self.position, lock.write_pos); 131 | 132 | lock.sync_flag = false; 133 | break (to_write, sync_flag, sync_pos, new_offset, stop_flag); 134 | } 135 | 136 | // wait for change to queue and retry 137 | assert_eq!(lock.write_pos, lock.queue_pos); 138 | fut.as_mut().enable(); 139 | } 140 | 141 | fut.await; 142 | }; 143 | 144 | // Don't hold lock while write 145 | for buf in to_write.into_iter() { 146 | self.write_all(buf) 147 | .await 148 | .map_err(|err| Error::from_io_error("Failed to writ write-ahead log", err))?; 149 | } 150 | 151 | // Only sync if necessary 152 | // We do not need to hold the lock while syncing 153 | // because there is only one write-ahead 
writer 154 | if sync_flag && sync_pos < self.position { 155 | self.sync().await; 156 | inner.status.write().sync_pos = self.position; 157 | } 158 | 159 | if let Some((new_offset, old_offset)) = new_offset { 160 | self.set_offset(new_offset, old_offset).await?; 161 | } 162 | 163 | // Notify about finished write(s) 164 | { 165 | let mut lock = inner.status.write(); 166 | assert!(lock.write_pos <= self.position); 167 | lock.write_pos = self.position; 168 | 169 | if let Some((new_offset, _)) = new_offset { 170 | lock.flush_pos = new_offset; 171 | } 172 | 173 | inner.write_cond.notify_waiters(); 174 | } 175 | 176 | if stop_flag { 177 | log::debug!("WAL writer finished"); 178 | } 179 | 180 | Ok(stop_flag) 181 | } 182 | 183 | async fn set_offset(&mut self, new_offset: usize, old_offset: usize) -> Result<(), Error> { 184 | let old_file_pos = old_offset / PAGE_SIZE; 185 | let new_file_pos = new_offset / PAGE_SIZE; 186 | 187 | for fpos in old_file_pos..new_file_pos { 188 | let fpath = self 189 | .params 190 | .db_path 191 | .join(Path::new(&format!("log{:08}.data", fpos + 1))); 192 | log::trace!("Removing file {fpath:?}"); 193 | 194 | disk::remove_file(&fpath).await.map_err(|err| { 195 | Error::from_io_error(format!("Failed to remove log file {fpath:?}"), err) 196 | })?; 197 | } 198 | 199 | Ok(()) 200 | } 201 | 202 | async fn sync(&mut self) { 203 | cfg_if! 
{ 204 | if #[cfg(feature="_async-io") ] { 205 | self.log_file.sync_data().await 206 | .expect("Data sync failed"); 207 | } else { 208 | self.log_file.sync_data() 209 | .expect("Data sync failed"); 210 | } 211 | } 212 | } 213 | 214 | #[allow(unused_mut)] 215 | async fn write_all(&mut self, mut data: Vec) -> Result<(), std::io::Error> { 216 | let mut buf_pos = 0; 217 | while buf_pos < data.len() { 218 | let mut file_offset = self.position % PAGE_SIZE; 219 | 220 | // Figure out how much we can fit into the current file 221 | assert!(file_offset < PAGE_SIZE); 222 | 223 | let page_remaining = PAGE_SIZE - file_offset; 224 | let buffer_remaining = data.len() - buf_pos; 225 | let write_len = (buffer_remaining).min(page_remaining); 226 | 227 | assert!(write_len > 0); 228 | cfg_if! { 229 | if #[cfg(feature="tokio-uring")] { 230 | let to_write = data.slice(buf_pos..buf_pos + write_len); 231 | let (res, buf) = self.log_file.write_all_at(to_write, file_offset as u64).await; 232 | res.expect("Failed to write to log file"); 233 | 234 | data = buf.into_inner(); 235 | } else if #[cfg(feature="monoio")] { 236 | let to_write = data.slice(buf_pos..buf_pos + write_len); 237 | let (res, buf) = self.log_file.write_all_at(to_write, file_offset as u64).await; 238 | res.expect("Failed to write to log file"); 239 | 240 | data = buf.into_inner(); 241 | 242 | 243 | }else { 244 | let to_write = &data[buf_pos..buf_pos + write_len]; 245 | self.log_file.write_all(to_write).expect("Failed to write log file"); 246 | } 247 | } 248 | 249 | buf_pos += write_len; 250 | self.position += write_len; 251 | file_offset += write_len; 252 | 253 | assert!(file_offset <= PAGE_SIZE); 254 | 255 | // Create a new file? 
256 | if file_offset == PAGE_SIZE { 257 | let file_pos = self.position / PAGE_SIZE; 258 | self.log_file = Self::create_file(&self.params, file_pos).await?; 259 | } 260 | } 261 | 262 | Ok(()) 263 | } 264 | 265 | /// Create a new file that is part of the log 266 | pub async fn create_file(params: &Params, file_pos: usize) -> Result { 267 | let fpath = Self::get_file_path(params, file_pos); 268 | log::trace!("Creating new log file at {fpath:?}"); 269 | 270 | cfg_if! { 271 | if #[cfg(feature="_async-io")] { 272 | File::create(fpath).await 273 | } else { 274 | File::create(fpath) 275 | } 276 | } 277 | } 278 | } 279 | -------------------------------------------------------------------------------- /src/write_batch.rs: -------------------------------------------------------------------------------- 1 | use crate::{Key, Value}; 2 | 3 | #[derive(Debug)] 4 | pub enum WriteOp { 5 | Put(Key, Value), 6 | Delete(Key), 7 | } 8 | 9 | /// A WriteBatch allows to bundle multiple updates together for higher throughput 10 | /// 11 | /// Note: The batch will not be applied to the database until it is passed to `Database::write` 12 | #[derive(Debug)] 13 | pub struct WriteBatch { 14 | pub(crate) writes: Vec, 15 | } 16 | 17 | impl WriteOp { 18 | pub(crate) const PUT_OP: u8 = 1; 19 | pub(crate) const DELETE_OP: u8 = 2; 20 | 21 | pub fn get_key(&self) -> &[u8] { 22 | match self { 23 | Self::Put(key, _) => key, 24 | Self::Delete(key) => key, 25 | } 26 | } 27 | 28 | pub fn get_type(&self) -> u8 { 29 | match self { 30 | Self::Put(_, _) => Self::PUT_OP, 31 | Self::Delete(_) => Self::DELETE_OP, 32 | } 33 | } 34 | 35 | pub(crate) fn get_key_length(&self) -> u64 { 36 | match self { 37 | Self::Put(key, _) | Self::Delete(key) => key.len() as u64, 38 | } 39 | } 40 | 41 | #[allow(dead_code)] 42 | pub(crate) fn get_value_length(&self) -> u64 { 43 | match self { 44 | Self::Put(_, value) => value.len() as u64, 45 | Self::Delete(_) => 0u64, 46 | } 47 | } 48 | } 49 | 50 | impl WriteBatch { 51 | pub fn new() 
/// Per-write settings that can be passed alongside a put, delete, or batch
#[derive(Debug, Clone)]
pub struct WriteOptions {
    /// Should the call block until it is guaranteed to be written to disk?
    pub sync: bool,
}

impl WriteOptions {
    /// Options with `sync` enabled; usable in `const` contexts
    pub const fn new() -> Self {
        let sync = true;
        Self { sync }
    }
}

impl Default for WriteOptions {
    /// Identical to [`WriteOptions::new`]
    fn default() -> Self {
        const DEFAULTS: WriteOptions = WriteOptions::new();
        DEFAULTS
    }
}
"tests/basic.rs" 34 | 35 | [[test]] 36 | name = "reopen" 37 | path = "tests/reopen.rs" 38 | -------------------------------------------------------------------------------- /sync/justfile: -------------------------------------------------------------------------------- 1 | LOG_LEVEL := "debug" 2 | 3 | tests: default-tests wisckey-tests 4 | 5 | default-tests: 6 | env RUST_BACKTRACE=1 RUST_LOG={{LOG_LEVEL}} cargo test 7 | 8 | lint: 9 | cargo clippy --no-default-features -- -D warnings 10 | 11 | check-formatting: 12 | cargo fmt --check 13 | 14 | fix-formatting: 15 | cargo fmt 16 | 17 | udeps: 18 | cargo udeps --all-targets --release 19 | 20 | wisckey-tests: 21 | env RUST_BACKTRACE=1 RUST_LOG={{LOG_LEVEL}} cargo test --no-default-features --features=snappy-compression,wisckey 22 | -------------------------------------------------------------------------------- /sync/src/database.rs: -------------------------------------------------------------------------------- 1 | use std::sync::Arc; 2 | 3 | use tokio::runtime::Runtime as TokioRuntime; 4 | 5 | use lsm::logic::DbLogic; 6 | use lsm::tasks::{TaskManager, TaskType}; 7 | use lsm::{EntryRef, Error, Key, Params, StartMode, Value, WriteBatch, WriteOptions}; 8 | 9 | use crate::iterate::DbIterator; 10 | 11 | pub struct Database { 12 | inner: Arc, 13 | tasks: Arc, 14 | tokio_rt: Arc, 15 | } 16 | 17 | impl Database { 18 | pub fn new(mode: StartMode) -> Result { 19 | let params = Params::default(); 20 | Self::new_with_params(mode, params) 21 | } 22 | 23 | pub fn new_with_params(mode: StartMode, params: Params) -> Result { 24 | let tokio_rt = Arc::new(TokioRuntime::new().expect("Failed to start tokio")); 25 | let (inner, tasks) = tokio_rt.block_on(async { 26 | let compaction_concurrency = params.compaction_concurrency; 27 | 28 | match DbLogic::new(mode, params).await { 29 | Ok(inner) => { 30 | let inner = Arc::new(inner); 31 | let tasks = 32 | Arc::new(TaskManager::new(inner.clone(), compaction_concurrency).await); 33 | 34 | 
Ok((inner, tasks)) 35 | } 36 | Err(err) => Err(err), 37 | } 38 | })?; 39 | 40 | Ok(Self { 41 | inner, 42 | tasks, 43 | tokio_rt, 44 | }) 45 | } 46 | 47 | /// Will deserialize V from the raw data (avoids an additional copy) 48 | #[inline] 49 | pub fn get(&self, key: &[u8]) -> Result, Error> { 50 | let inner = &*self.inner; 51 | 52 | self.tokio_rt.block_on(async { 53 | let result = inner.get(key).await; 54 | 55 | match result { 56 | Ok((needs_compaction, data)) => { 57 | if needs_compaction { 58 | self.tasks.wake_up(&TaskType::LevelCompaction); 59 | } 60 | 61 | Ok(data) 62 | } 63 | Err(err) => Err(err), 64 | } 65 | }) 66 | } 67 | 68 | /// Ensure all data is written to disk 69 | /// Only has an effect if there were previous writes with sync=false 70 | pub fn synchronize(&self) -> Result<(), Error> { 71 | let inner = &*self.inner; 72 | 73 | self.tokio_rt 74 | .block_on(async move { inner.synchronize().await }) 75 | } 76 | 77 | /// Store entry 78 | #[inline] 79 | pub fn put(&self, key: Key, value: Value) -> Result<(), Error> { 80 | const OPTS: WriteOptions = WriteOptions::new(); 81 | self.put_opts(key, value, &OPTS) 82 | } 83 | 84 | /// Store entry (with options) 85 | #[inline] 86 | pub fn put_opts(&self, key: Key, value: Value, opts: &WriteOptions) -> Result<(), Error> { 87 | let mut batch = WriteBatch::new(); 88 | batch.put(key, value); 89 | self.write_opts(batch, opts) 90 | } 91 | 92 | /// Delete an existing entry 93 | /// For efficiency, the datastore does not check whether the key actually existed 94 | /// Instead, it will just mark the most recent (which could be the first one) as deleted 95 | pub fn delete(&self, key: Key) -> Result<(), Error> { 96 | const OPTS: WriteOptions = WriteOptions::new(); 97 | 98 | let mut batch = WriteBatch::new(); 99 | batch.delete(key); 100 | 101 | self.write_opts(batch, &OPTS) 102 | } 103 | 104 | /// Delete an existing entry (with additional options) 105 | pub fn delete_opts(&self, key: Key, opts: &WriteOptions) -> Result<(), Error> 
{ 106 | let mut batch = WriteBatch::new(); 107 | batch.delete(key); 108 | 109 | self.write_opts(batch, opts) 110 | } 111 | 112 | /// Iterate over all entries in the database 113 | pub fn iter(&self) -> DbIterator { 114 | let tokio_rt = self.tokio_rt.clone(); 115 | 116 | self.tokio_rt.block_on(async { 117 | let (mem_iters, table_iters, min_key, max_key) = 118 | self.inner.prepare_iter(None, None).await; 119 | 120 | DbIterator::new( 121 | mem_iters, 122 | table_iters, 123 | min_key, 124 | max_key, 125 | false, 126 | #[cfg(feature = "wisckey")] 127 | self.inner.get_value_log(), 128 | tokio_rt, 129 | ) 130 | }) 131 | } 132 | 133 | /// Like iter(), but reverse 134 | pub fn reverse_iter(&self) -> DbIterator { 135 | let tokio_rt = self.tokio_rt.clone(); 136 | 137 | self.tokio_rt.block_on(async { 138 | let (mem_iters, table_iters, min_key, max_key) = 139 | self.inner.prepare_reverse_iter(None, None).await; 140 | 141 | DbIterator::new( 142 | mem_iters, 143 | table_iters, 144 | min_key, 145 | max_key, 146 | true, 147 | #[cfg(feature = "wisckey")] 148 | self.inner.get_value_log(), 149 | tokio_rt, 150 | ) 151 | }) 152 | } 153 | 154 | /// Like iter(), but will only include entries with keys in [min_key;max_key) 155 | pub fn range_iter(&self, min: &[u8], max: &[u8]) -> DbIterator { 156 | let tokio_rt = self.tokio_rt.clone(); 157 | 158 | self.tokio_rt.block_on(async { 159 | let (mem_iters, table_iters, min_key, max_key) = 160 | self.inner.prepare_iter(Some(min), Some(max)).await; 161 | 162 | DbIterator::new( 163 | mem_iters, 164 | table_iters, 165 | min_key, 166 | max_key, 167 | false, 168 | #[cfg(feature = "wisckey")] 169 | self.inner.get_value_log(), 170 | tokio_rt, 171 | ) 172 | }) 173 | } 174 | 175 | /// Like range_iter(), but in reverse. 
176 | /// It will only include entries with keys in (min_key;max_key] 177 | pub fn reverse_range_iter(&self, max_key: &[u8], min_key: &[u8]) -> DbIterator { 178 | let tokio_rt = self.tokio_rt.clone(); 179 | 180 | self.tokio_rt.block_on(async { 181 | let (mem_iters, table_iters, min_key, max_key) = self 182 | .inner 183 | .prepare_reverse_iter(Some(max_key), Some(min_key)) 184 | .await; 185 | 186 | DbIterator::new( 187 | mem_iters, 188 | table_iters, 189 | min_key, 190 | max_key, 191 | true, 192 | #[cfg(feature = "wisckey")] 193 | self.inner.get_value_log(), 194 | tokio_rt, 195 | ) 196 | }) 197 | } 198 | 199 | /// Write a batch of updates to the database 200 | /// 201 | /// If you only want to write to a single key, use `Database::put` instead 202 | pub fn write(&self, write_batch: WriteBatch) -> Result<(), Error> { 203 | self.write_opts(write_batch, &WriteOptions::default()) 204 | } 205 | 206 | pub fn write_opts(&self, write_batch: WriteBatch, opts: &WriteOptions) -> Result<(), Error> { 207 | let inner = &*self.inner; 208 | 209 | self.tokio_rt.block_on(async move { 210 | let needs_compaction = inner.write_opts(write_batch, opts).await?; 211 | if needs_compaction { 212 | self.tasks.wake_up(&TaskType::MemtableCompaction); 213 | } 214 | 215 | Ok(()) 216 | }) 217 | } 218 | 219 | /// Stop all background tasks gracefully 220 | pub fn stop(&self) -> Result<(), Error> { 221 | let tasks = self.tasks.clone(); 222 | 223 | self.tokio_rt 224 | .block_on(async move { tasks.stop_all().await }) 225 | } 226 | } 227 | 228 | impl Drop for Database { 229 | /// This might abort some tasks is stop() has not been called 230 | /// crash consistency should prevent this from being a problem 231 | fn drop(&mut self) { 232 | self.tasks.terminate(); 233 | } 234 | } 235 | -------------------------------------------------------------------------------- /sync/src/iterate.rs: -------------------------------------------------------------------------------- 1 | #[cfg(feature = "wisckey")] 2 | use 
lsm::values::ValueLog; 3 | 4 | use lsm::EntryRef; 5 | use lsm::memtable::MemtableIterator; 6 | use lsm::sorted_table::{InternalIterator, TableIterator}; 7 | 8 | use std::cmp::Ordering; 9 | use std::sync::Arc; 10 | 11 | use cfg_if::cfg_if; 12 | 13 | /// Allows iterating over a consistent snapshot of the database 14 | pub struct DbIterator { 15 | last_key: Option>, 16 | iterators: Vec>, 17 | 18 | min_key: Option>, 19 | max_key: Option>, 20 | 21 | tokio_rt: Arc, 22 | 23 | reverse: bool, 24 | 25 | #[cfg(feature = "wisckey")] 26 | value_log: Arc, 27 | } 28 | 29 | type NextKV = Option<(lsm::manifest::SeqNumber, usize)>; 30 | 31 | impl DbIterator { 32 | pub(crate) fn new( 33 | mem_iters: Vec, 34 | table_iters: Vec, 35 | min_key: Option>, 36 | max_key: Option>, 37 | reverse: bool, 38 | #[cfg(feature = "wisckey")] value_log: Arc, 39 | tokio_rt: Arc, 40 | ) -> Self { 41 | let mut iterators: Vec> = vec![]; 42 | 43 | for iter in mem_iters.into_iter() { 44 | iterators.push(Box::new(iter)); 45 | } 46 | 47 | for iter in table_iters.into_iter() { 48 | iterators.push(Box::new(iter)); 49 | } 50 | 51 | Self { 52 | last_key: None, 53 | iterators, 54 | tokio_rt, 55 | min_key, 56 | max_key, 57 | reverse, 58 | #[cfg(feature = "wisckey")] 59 | value_log, 60 | } 61 | } 62 | 63 | async fn parse_iter( 64 | &self, 65 | pos: usize, 66 | last_key: &Option>, 67 | next_iter: Option<&dyn InternalIterator>, 68 | iter: &mut dyn InternalIterator, 69 | next_kv: NextKV, 70 | ) -> (bool, NextKV) { 71 | if self.reverse { 72 | // This iterator might be "behind" other iterators 73 | if let Some(last_key) = last_key { 74 | while !iter.at_end() && iter.get_key() >= last_key.as_slice() { 75 | iter.step().await; 76 | } 77 | } 78 | 79 | // Don't pick a key that is greater than the maximum 80 | if let Some(max_key) = &self.max_key { 81 | while !iter.at_end() && iter.get_key() > max_key.as_slice() { 82 | iter.step().await; 83 | } 84 | 85 | // There might be no key in this iterator that is <=max_key 86 | if 
iter.at_end() || iter.get_key() > max_key.as_slice() { 87 | return (false, next_kv); 88 | } 89 | } 90 | 91 | if iter.at_end() { 92 | return (false, next_kv); 93 | } 94 | 95 | let key = iter.get_key(); 96 | 97 | // Don't pick a key that is less or equal to the minimum 98 | if let Some(min_key) = &self.min_key 99 | && iter.get_key() <= min_key.as_slice() 100 | { 101 | return (false, next_kv); 102 | } 103 | 104 | let seq_number = iter.get_seq_number(); 105 | 106 | if let Some((max_seq_number, _)) = next_kv { 107 | let max_key = next_iter.unwrap().get_key(); 108 | 109 | match key.cmp(max_key) { 110 | Ordering::Greater => (true, Some((seq_number, pos))), 111 | Ordering::Equal => { 112 | if seq_number > max_seq_number { 113 | (true, Some((seq_number, pos))) 114 | } else { 115 | (false, next_kv) 116 | } 117 | } 118 | Ordering::Less => (false, next_kv), 119 | } 120 | } else { 121 | (true, Some((seq_number, pos))) 122 | } 123 | } else { 124 | // This iterator might be "behind" other iterators 125 | if let Some(last_key) = last_key { 126 | while !iter.at_end() && iter.get_key() <= last_key.as_slice() { 127 | iter.step().await; 128 | } 129 | } 130 | 131 | // Don't pick a key that is smaller than the minimum 132 | if let Some(min_key) = &self.min_key { 133 | while !iter.at_end() && iter.get_key() < min_key.as_slice() { 134 | iter.step().await; 135 | } 136 | 137 | // There might be no key in this iterator that is >=min_key 138 | if iter.at_end() || iter.get_key() < min_key.as_slice() { 139 | return (false, next_kv); 140 | } 141 | } 142 | 143 | if iter.at_end() { 144 | return (false, next_kv); 145 | } 146 | 147 | let key = iter.get_key(); 148 | 149 | // Don't pick a key that is greater or equal to the maximum 150 | if let Some(max_key) = &self.max_key 151 | && iter.get_key() >= max_key.as_slice() 152 | { 153 | return (false, next_kv); 154 | } 155 | 156 | let seq_number = iter.get_seq_number(); 157 | 158 | if let Some((min_seq_number, _)) = next_kv { 159 | let min_key = 
next_iter.unwrap().get_key(); 160 | 161 | match key.cmp(min_key) { 162 | Ordering::Less => (true, Some((seq_number, pos))), 163 | Ordering::Equal => { 164 | if seq_number > min_seq_number { 165 | (true, Some((seq_number, pos))) 166 | } else { 167 | (false, next_kv) 168 | } 169 | } 170 | Ordering::Greater => (false, next_kv), 171 | } 172 | } else { 173 | (true, Some((seq_number, pos))) 174 | } 175 | } 176 | } 177 | } 178 | 179 | impl Iterator for DbIterator { 180 | type Item = (Vec, EntryRef); 181 | 182 | fn next(&mut self) -> Option { 183 | let mut iterators = std::mem::take(&mut self.iterators); 184 | let mut last_key = self.last_key.clone(); 185 | let mut result = None; 186 | 187 | while result.is_none() { 188 | let (out_result, out_last_key, out_iterators) = self.tokio_rt.block_on(async { 189 | let mut next_kv = None; 190 | let num_iterators = iterators.len(); 191 | 192 | for pos in 0..num_iterators { 193 | // Split slices to make the borrow checker happy 194 | let (prev, cur) = iterators[..].split_at_mut(pos); 195 | 196 | let next_iter = if let Some((_, pos)) = next_kv { 197 | // see https://github.com/rust-lang/rust-clippy/issues/9309 198 | #[allow(clippy::borrowed_box)] 199 | let iter: &Box = &prev[pos]; 200 | Some(&**iter) 201 | } else { 202 | None 203 | }; 204 | 205 | let current_iter = &mut *cur[0]; 206 | let (change, kv) = self 207 | .parse_iter(pos, &last_key, next_iter, current_iter, next_kv) 208 | .await; 209 | 210 | if change { 211 | next_kv = kv; 212 | } 213 | } 214 | 215 | let result = if let Some((_, pos)) = next_kv.take() { 216 | #[allow(clippy::explicit_auto_deref)] 217 | let iter: &dyn InternalIterator = &*iterators[pos]; 218 | 219 | let res_key = iter.get_key().to_vec(); 220 | last_key = Some(res_key.clone()); 221 | 222 | cfg_if! 
{ 223 | if #[ cfg(feature="wisckey") ] { 224 | iter.get_entry(&self.value_log).await 225 | .map(|entry| Some((res_key, entry))) 226 | } else { 227 | iter.get_entry().map(|entry|Some((res_key, entry))) 228 | } 229 | } 230 | } else { 231 | Some(None) 232 | }; 233 | 234 | (result, last_key, iterators) 235 | }); 236 | 237 | result = out_result; 238 | last_key = out_last_key; 239 | iterators = out_iterators; 240 | } 241 | 242 | self.last_key = last_key; 243 | self.iterators = iterators; 244 | 245 | result.unwrap() 246 | } 247 | } 248 | -------------------------------------------------------------------------------- /sync/src/lib.rs: -------------------------------------------------------------------------------- 1 | pub use lsm::{Params, StartMode, WriteBatch, WriteOptions}; 2 | 3 | pub mod iterate; 4 | 5 | mod database; 6 | pub use database::Database; 7 | -------------------------------------------------------------------------------- /sync/tests/basic.rs: -------------------------------------------------------------------------------- 1 | use lsm_sync::{Database, Params, StartMode, WriteBatch, WriteOptions}; 2 | use tempfile::{Builder, TempDir}; 3 | 4 | const SM: StartMode = StartMode::CreateOrOverride; 5 | 6 | fn test_init() -> (TempDir, Database) { 7 | let tmp_dir = Builder::new().prefix("lsm-sync-test-").tempdir().unwrap(); 8 | let _ = env_logger::builder().is_test(true).try_init(); 9 | 10 | let mut db_path = tmp_dir.path().to_path_buf(); 11 | db_path.push("storage.lsm"); 12 | 13 | let params = Params { 14 | db_path, 15 | ..Default::default() 16 | }; 17 | let database = 18 | Database::new_with_params(SM, params).expect("Failed to create database instance"); 19 | 20 | (tmp_dir, database) 21 | } 22 | 23 | #[test] 24 | fn get_put() { 25 | let (_tmpdir, database) = test_init(); 26 | 27 | let key1 = "Foo".to_string().into_bytes(); 28 | let key2 = "Foz".to_string().into_bytes(); 29 | let value1 = "Bar".to_string().into_bytes(); 30 | let value2 = 
/// Writes `COUNT` zero-padded keys and checks that a full scan returns all of them in order.
#[test]
fn iterate() {
    const COUNT: u64 = 5_000;

    let (_tmpdir, database) = test_init();

    // Write without fsync to speed up tests
    let options = WriteOptions { sync: false };

    (0..COUNT).for_each(|pos| {
        database
            .put_opts(
                format!("key_{pos:05}").into_bytes(),
                format!("some_string_{pos}").into_bytes(),
                &options,
            )
            .unwrap();
    });

    // Number of entries seen so far doubles as the expected position
    let mut count = 0;

    for (key, val) in database.iter() {
        let expected_key = format!("key_{count:05}").into_bytes();
        let expected_val = format!("some_string_{count}").into_bytes();

        assert_eq!(expected_key, key);
        assert_eq!(expected_val, val.get_value());

        count += 1;
    }

    assert_eq!(count, COUNT);
}
/// Scans backwards from `key_10150` and expects every key down to, but excluding, `key_00300`.
#[test]
fn range_iterate_reverse() {
    const COUNT: u64 = 25_000;

    let (_tmpdir, database) = test_init();

    // Write without fsync to speed up tests
    let options = WriteOptions { sync: false };

    for pos in 0..COUNT {
        database
            .put_opts(
                format!("key_{pos:05}").into_bytes(),
                format!("some_string_{pos}").into_bytes(),
                &options,
            )
            .unwrap();
    }

    let start = "key_10150".to_string().into_bytes();
    let end = "key_00300".to_string().into_bytes();

    let mut yielded = 0;
    for (step, (key, val)) in database.reverse_range_iter(&start, &end).enumerate() {
        let real_pos = 10150 - step;

        assert_eq!(format!("key_{real_pos:05}").into_bytes(), key);
        assert_eq!(
            format!("some_string_{real_pos}").into_bytes(),
            val.get_value()
        );

        yielded = step + 1;
    }

    assert_eq!(yielded, 9850);

    database.stop().unwrap();
}
/// Inserts `COUNT` entries, deletes each of them, and checks that none can be read back.
#[test]
fn get_put_delete_many() {
    const COUNT: u64 = 10_000;

    let (_tmpdir, database) = test_init();

    // Write without fsync to speed up tests
    let options = WriteOptions { sync: false };

    let key_for = |pos: u64| format!("key_{pos}").into_bytes();

    for pos in 0..COUNT {
        let value = format!("some_string_{pos}").into_bytes();
        database.put_opts(key_for(pos), value, &options).unwrap();
    }

    for pos in 0..COUNT {
        database.delete(key_for(pos)).unwrap();
    }

    for pos in 0..COUNT {
        let lookup = database.get(&key_for(pos)).unwrap();
        assert!(lookup.is_none());
    }
}
/// Overwrites the first `COUNT` of `NCOUNT` entries and checks that only those changed.
#[test]
fn override_subset() {
    const NCOUNT: u64 = 100_000;
    const COUNT: u64 = 25_000;

    let (_tmpdir, database) = test_init();

    // Write without fsync to speed up tests
    let options = WriteOptions { sync: false };

    let key_for = |pos: u64| format!("key_{pos}").into_bytes();
    let old_value = |pos: u64| format!("some_string_{pos}").into_bytes();
    let new_value = |pos: u64| format!("some_other_string_{pos}").into_bytes();

    for pos in 0..NCOUNT {
        database
            .put_opts(key_for(pos), old_value(pos), &options)
            .unwrap();
    }

    for pos in 0..COUNT {
        database
            .put_opts(key_for(pos), new_value(pos), &options)
            .unwrap();
    }

    // The overwritten prefix must show the new values
    for pos in 0..COUNT {
        let entry = database.get(&key_for(pos)).unwrap().unwrap();
        assert_eq!(entry.get_value(), new_value(pos),);
    }

    // Everything past the prefix must be untouched
    for pos in COUNT..NCOUNT {
        let entry = database.get(&key_for(pos)).unwrap().unwrap();
        assert_eq!(entry.get_value(), old_value(pos),);
    }

    database.stop().unwrap();
}
302 | let mut batch = WriteBatch::new(); 303 | 304 | for pos in 0..COUNT { 305 | let key = format!("key{pos}").into_bytes(); 306 | let value = format!("value{pos}").into_bytes(); 307 | batch.put(key, value); 308 | } 309 | 310 | database.write(batch).unwrap(); 311 | 312 | for pos in 0..COUNT { 313 | let key = format!("key{pos}").into_bytes(); 314 | let value = format!("value{pos}").into_bytes(); 315 | 316 | let entry = database.get(&key).unwrap(); 317 | 318 | assert!(entry.is_some()); 319 | assert_eq!(entry.unwrap().get_value(), value.as_slice()); 320 | } 321 | } 322 | -------------------------------------------------------------------------------- /sync/tests/reopen.rs: -------------------------------------------------------------------------------- 1 | use lsm_sync::{Database, Params, StartMode, WriteOptions}; 2 | use tempfile::{Builder, TempDir}; 3 | 4 | fn test_init() -> (TempDir, Params, Database) { 5 | let tmp_dir = Builder::new() 6 | .prefix("lsm-sync-test-reopen-") 7 | .tempdir() 8 | .unwrap(); 9 | let _ = env_logger::builder().is_test(true).try_init(); 10 | 11 | let mut db_path = tmp_dir.path().to_path_buf(); 12 | db_path.push("storage.lsm"); 13 | 14 | let params = Params { 15 | db_path, 16 | ..Default::default() 17 | }; 18 | let database = Database::new_with_params(StartMode::CreateOrOverride, params.clone()) 19 | .expect("Failed to create database instance"); 20 | 21 | (tmp_dir, params, database) 22 | } 23 | 24 | #[test] 25 | fn get_put() { 26 | let (_tmpdir, params, database) = test_init(); 27 | 28 | let key1 = String::from("Foo").into_bytes(); 29 | let value1 = String::from("Bar").into_bytes(); 30 | let value2 = String::from("Baz").into_bytes(); 31 | 32 | assert!(database.get(&key1).unwrap().is_none()); 33 | 34 | database.put(key1.clone(), value1.clone()).unwrap(); 35 | drop(database); 36 | 37 | // Reopen 38 | let database = Database::new_with_params(StartMode::Open, params.clone()) 39 | .expect("Failed to create database instance"); 40 | 41 | 
/// Writes many entries without fsync, reopens the database, and reads all of them back.
#[test]
fn get_put_many() {
    const COUNT: u64 = 100_000;

    let (_tmpdir, params, database) = test_init();

    // Write without fsync to speed up tests
    let options = WriteOptions { sync: false };

    for pos in 0..COUNT {
        let key = format!("key_{pos}").into_bytes();
        let value = format!("some_string_{pos}").into_bytes();
        database.put_opts(key, value, &options).unwrap();
    }

    // The writes above were not synced, so flush them before the handle goes away
    database.synchronize().unwrap();
    drop(database);

    // Reopen
    let database = Database::new_with_params(StartMode::Open, params)
        .expect("Failed to create database instance");

    for pos in 0..COUNT {
        let key = format!("key_{pos}").into_bytes();
        let value = format!("some_string_{pos}").into_bytes();

        assert_eq!(
            database.get(&key).unwrap().unwrap().get_value(),
            value.as_slice(),
        );
    }
}
instance"); 111 | 112 | let mut iterator = database.iter(); 113 | let mut pos = 0; 114 | 115 | while let Some((key, value)) = iterator.next() { 116 | let expected_key = format!("key_{pos:05}").into_bytes(); 117 | let expected_value = format!("value_{pos}").repeat(SIZE).into_bytes(); 118 | 119 | assert_eq!(expected_key, key); 120 | assert_eq!(expected_value, value.get_value()); 121 | 122 | pos += 1; 123 | } 124 | } 125 | -------------------------------------------------------------------------------- /tests/reopen.rs: -------------------------------------------------------------------------------- 1 | use lsm::{Database, Params, StartMode, WriteOptions}; 2 | use tempfile::{Builder, TempDir}; 3 | 4 | use futures::stream::StreamExt; 5 | 6 | #[cfg(feature = "tokio-uring")] 7 | use kioto_uring_executor::test as async_test; 8 | 9 | #[cfg(feature = "monoio")] 10 | use monoio::test as async_test; 11 | 12 | #[cfg(not(feature = "_async-io"))] 13 | use tokio::test as async_test; 14 | 15 | async fn test_init() -> (TempDir, Params, Database) { 16 | let tmp_dir = Builder::new() 17 | .prefix("lsm-async-test-reopen-") 18 | .tempdir() 19 | .unwrap(); 20 | let _ = env_logger::builder().is_test(true).try_init(); 21 | 22 | let mut db_path = tmp_dir.path().to_path_buf(); 23 | db_path.push("storage.lsm"); 24 | 25 | let params = Params { 26 | db_path, 27 | ..Default::default() 28 | }; 29 | let database = Database::new_with_params(StartMode::CreateOrOverride, params.clone()) 30 | .await 31 | .expect("Failed to create database instance"); 32 | 33 | (tmp_dir, params, database) 34 | } 35 | 36 | #[async_test] 37 | async fn get_put() { 38 | let (_tmpdir, params, database) = test_init().await; 39 | 40 | let key1 = String::from("Foo").into_bytes(); 41 | let value1 = String::from("Bar").into_bytes(); 42 | let value2 = String::from("Baz").into_bytes(); 43 | 44 | assert!(database.get(&key1).await.unwrap().is_none()); 45 | 46 | database.put(key1.clone(), value1.clone()).await.unwrap(); 47 | 
drop(database); 48 | 49 | // Reopen 50 | let database = Database::new_with_params(StartMode::Open, params.clone()) 51 | .await 52 | .expect("Failed to create database instance"); 53 | 54 | assert_eq!( 55 | database.get(&key1).await.unwrap().unwrap().get_value(), 56 | value1 57 | ); 58 | database.put(key1.clone(), value2.clone()).await.unwrap(); 59 | 60 | drop(database); 61 | 62 | // Reopen again 63 | let database = Database::new_with_params(StartMode::Open, params) 64 | .await 65 | .expect("Failed to create database instance"); 66 | 67 | assert_eq!( 68 | database.get(&key1).await.unwrap().unwrap().get_value(), 69 | value2 70 | ); 71 | } 72 | 73 | #[async_test] 74 | async fn get_put_many() { 75 | const COUNT: u64 = 100_000; 76 | 77 | let (_tmpdir, params, database) = test_init().await; 78 | 79 | // Write without fsync to speed up tests 80 | let options = WriteOptions { sync: false }; 81 | 82 | for pos in 0..COUNT { 83 | let key = format!("key_{pos:05}").into_bytes(); 84 | let value = format!("some_string_{pos}").into_bytes(); 85 | database.put_opts(key, value, &options).await.unwrap(); 86 | } 87 | 88 | database.synchronize().await.unwrap(); 89 | drop(database); 90 | 91 | // Reopen 92 | let database = Database::new_with_params(StartMode::Open, params.clone()) 93 | .await 94 | .expect("Failed to create database instance"); 95 | 96 | for pos in 0..COUNT { 97 | let key = format!("key_{pos:05}").into_bytes(); 98 | let value = format!("some_string_{pos}").into_bytes(); 99 | 100 | assert_eq!( 101 | database.get(&key).await.unwrap().unwrap().get_value(), 102 | value, 103 | ); 104 | } 105 | 106 | // Ensure iteration still works 107 | let mut iterator = database.iter().await; 108 | let mut pos = 0; 109 | while let Some((key, value)) = iterator.next().await { 110 | assert_eq!(format!("key_{pos:05}").into_bytes(), key); 111 | assert_eq!(format!("some_string_{pos}").into_bytes(), value.get_value()); 112 | pos += 1; 113 | } 114 | } 115 | 116 | #[async_test] 117 | async fn 
get_put_large() { 118 | const COUNT: usize = 100; 119 | const SIZE: usize = 100_000; 120 | 121 | let (_tmpdir, params, database) = test_init().await; 122 | 123 | // Write without fsync to speed up tests 124 | let options = WriteOptions { sync: false }; 125 | 126 | for pos in 0..COUNT { 127 | let key = format!("key_{pos:05}").into_bytes(); 128 | let value = format!("value_{pos}").repeat(SIZE).into_bytes(); 129 | 130 | database.put_opts(key, value, &options).await.unwrap(); 131 | } 132 | 133 | database.synchronize().await.unwrap(); 134 | drop(database); 135 | 136 | // Reopen 137 | let database = Database::new_with_params(StartMode::Open, params.clone()) 138 | .await 139 | .expect("Failed to create database instance"); 140 | 141 | let mut iterator = database.iter().await; 142 | let mut pos = 0; 143 | 144 | while let Some((key, value)) = iterator.next().await { 145 | let expected_key = format!("key_{pos:05}").into_bytes(); 146 | let expected_value = format!("value_{pos}").repeat(SIZE).into_bytes(); 147 | 148 | assert_eq!(expected_key, key); 149 | // Value is very long, so don't print 150 | assert!(expected_value == value.get_value()); 151 | 152 | pos += 1; 153 | } 154 | } 155 | --------------------------------------------------------------------------------