├── .github ├── FUNDING.yml ├── ISSUE_TEMPLATE │ ├── blank_issue.md │ ├── bugs.md │ ├── config.yml │ └── feature_request.md ├── dependabot.yml └── workflows │ └── test.yml ├── .gitignore ├── .rustfmt.toml ├── ARCHITECTURE.md ├── CHANGELOG.md ├── CONTRIBUTING.md ├── Cargo.toml ├── LICENSE-APACHE ├── LICENSE-MIT ├── README.md ├── RELEASE_CHECKLIST.md ├── SAFETY.md ├── SECURITY.md ├── art ├── CREDITS ├── tree_face.png └── tree_face_anti-transphobia.png ├── code-of-conduct.md ├── examples └── bench.rs ├── fuzz ├── .gitignore ├── Cargo.toml └── fuzz_targets │ └── fuzz_model.rs ├── scripts ├── cgtest.sh ├── cross_compile.sh ├── execution_explorer.py ├── instructions ├── sanitizers.sh ├── shufnice.sh └── ubuntu_bench ├── src ├── alloc.rs ├── block_checker.rs ├── config.rs ├── db.rs ├── event_verifier.rs ├── flush_epoch.rs ├── heap.rs ├── id_allocator.rs ├── leaf.rs ├── lib.rs ├── metadata_store.rs ├── object_cache.rs ├── object_location_mapper.rs └── tree.rs ├── tests ├── 00_regression.rs ├── common │ └── mod.rs ├── concurrent_batch_atomicity.rs ├── crash_tests │ ├── crash_batches.rs │ ├── crash_heap.rs │ ├── crash_iter.rs │ ├── crash_metadata_store.rs │ ├── crash_object_cache.rs │ ├── crash_sequential_writes.rs │ ├── crash_tx.rs │ └── mod.rs ├── test_crash_recovery.rs ├── test_quiescent.rs ├── test_space_leaks.rs ├── test_tree.rs ├── test_tree_failpoints.rs └── tree │ └── mod.rs └── tsan_suppressions.txt /.github/FUNDING.yml: -------------------------------------------------------------------------------- 1 | # These are supported funding model platforms 2 | 3 | github: spacejam # Replace with up to 4 GitHub Sponsors-enabled usernames e.g., [user1, user2] 4 | patreon: # Replace with a single Patreon username 5 | open_collective: # Replace with a single Open Collective username 6 | ko_fi: # Replace with a single Ko-fi username 7 | tidelift: # Replace with a single Tidelift platform-name/package-name e.g., npm/babel 8 | community_bridge: # Replace with a single Community Bridge project-name e.g., cloud-foundry 9 | liberapay: # Replace with a single Liberapay username 10 | issuehunt: # Replace with a single IssueHunt username 11 | otechie: # Replace with a single Otechie username 12 | custom: # Replace with up to 4 custom sponsorship URLs e.g., ['link1', 'link2'] 13 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/blank_issue.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Blank Issue (do not use this for bug reports or feature requests) 3 | about: Create an issue with a blank template. 4 | --- 5 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bugs.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug Report 3 | about: Report a correctness issue or violated expectation 4 | labels: bug 5 | --- 6 | 7 | Bug reports must include all following items: 8 | 9 | 1. expected result 10 | 1. actual result 11 | 1. sled version 12 | 1. rustc version 13 | 1. operating system 14 | 1. minimal code sample that helps to reproduce the issue 15 | 1. logs, panic messages, stack traces 16 | 17 | Incomplete bug reports will be closed. 18 | 19 | Do not open bug reports for documentation issues. Please just open a PR with the proposed documentation change. 
20 | 21 | Thank you for understanding :) 22 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/config.yml: -------------------------------------------------------------------------------- 1 | blank_issues_enabled: true 2 | contact_links: 3 | - name: sled discord 4 | url: https://discord.gg/Z6VsXds 5 | about: Please ask questions in the discord server here. 6 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature Request 3 | about: Request a feature for sled 4 | labels: feature 5 | --- 6 | 7 | #### Use Case: 8 | 9 | #### Proposed Change: 10 | 11 | #### Who Benefits From The Change(s)? 12 | 13 | #### Alternative Approaches 14 | -------------------------------------------------------------------------------- /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | updates: 3 | - package-ecosystem: cargo 4 | directory: "/" 5 | schedule: 6 | interval: daily 7 | time: "10:00" 8 | open-pull-requests-limit: 10 9 | ignore: 10 | - dependency-name: crdts 11 | versions: 12 | - ">= 2.a, < 3" 13 | - dependency-name: zerocopy 14 | versions: 15 | - 0.4.0 16 | -------------------------------------------------------------------------------- /.github/workflows/test.yml: -------------------------------------------------------------------------------- 1 | name: Rust 2 | 3 | on: 4 | pull_request: 5 | branches: 6 | - main 7 | 8 | jobs: 9 | clippy_check: 10 | runs-on: ubuntu-latest 11 | steps: 12 | - uses: actions/checkout@v1 13 | - uses: actions-rs/toolchain@v1 14 | with: 15 | toolchain: nightly 16 | components: clippy 17 | override: true 18 | - run: rustup component add clippy 19 | - uses: actions-rs/clippy-check@v1 20 | with: 21 | token: ${{ secrets.GITHUB_TOKEN }} 22 | args: --all-features 23 | default: 24 | name: Cargo Test on ${{ matrix.os }} 25 | env: 26 | RUST_BACKTRACE: 1 27 | runs-on: ${{ matrix.os }} 28 | strategy: 29 | fail-fast: false 30 | matrix: 31 | os: [ubuntu-latest, macos-latest, windows-latest] 32 | steps: 33 | - uses: actions/checkout@v1 34 | - name: Cache target 35 | uses: actions/cache@v2 36 | env: 37 | cache-name: cache-default-target-and-lockfile 38 | with: 39 | path: | 40 | target 41 | Cargo.lock 42 | ~/.rustup 43 | key: ${{ runner.os }}-${{ env.cache-name }}-${{ hashFiles('**/Cargo.toml') }} 44 | - name: linux coredump setup 45 | if: ${{ runner.os == 'linux' }} 46 | run: | 47 | ulimit -c unlimited 48 | echo "$PWD/core-dumps/corefile-%e-%p-%t" | sudo tee /proc/sys/kernel/core_pattern 49 | mkdir core-dumps 50 | - name: cargo test 51 | run: | 52 | rustup update --no-self-update 53 | cargo test --release --no-default-features --features=for-internal-testing-only -- --nocapture 54 | - uses: actions/upload-artifact@v4 55 | if: ${{ failure() && runner.os == 'linux' }} 56 | with: 57 | name: linux-core-dumps 58 | path: | 59 | ./core-dumps/* 60 | ./target/release/deps/test_* 61 | examples: 62 | name: Example Tests 63 | runs-on: ubuntu-latest 64 | steps: 65 | - uses: actions/checkout@v1 66 | - name: Cache target 67 | uses: actions/cache@v2 68 | env: 69 | cache-name: cache-examples-target-and-lockfile 70 | with: 71 | path: | 72 | target 73 | Cargo.lock 74 | ~/.rustup 75 | key: ${{ runner.os }}-${{ env.cache-name }}-${{ hashFiles('**/Cargo.toml') }} 76 | - name: example tests 77 | run: | 78 | rustup 
update --no-self-update 79 | cargo run --example playground 80 | cargo run --example structured 81 | cross-compile: 82 | name: Cross Compile 83 | runs-on: macos-latest 84 | steps: 85 | - uses: actions/checkout@v1 86 | - name: cross compile 87 | run: | 88 | set -eo pipefail 89 | echo "cross build" 90 | scripts/cross_compile.sh 91 | burn-in: 92 | name: Burn In 93 | env: 94 | RUST_BACKTRACE: 1 95 | runs-on: ubuntu-latest 96 | steps: 97 | - uses: actions/checkout@v1 98 | - name: Cache target 99 | uses: actions/cache@v2 100 | env: 101 | cache-name: cache-stress2-asan-target-and-lockfile 102 | with: 103 | path: | 104 | benchmarks/stress2/target 105 | benchmarks/stress2/Cargo.lock 106 | ~/.rustup 107 | key: ${{ runner.os }}-${{ env.cache-name }}-${{ hashFiles('**/Cargo.toml') }} 108 | - name: burn in 109 | run: | 110 | set -eo pipefail 111 | pushd benchmarks/stress2 112 | ulimit -c unlimited 113 | echo "$PWD/core-dumps/corefile-%e-%p-%t" | sudo tee /proc/sys/kernel/core_pattern 114 | mkdir core-dumps 115 | rustup toolchain install nightly 116 | rustup toolchain install nightly --component rust-src 117 | rustup update 118 | rm -rf default.sled || true 119 | export RUSTFLAGS="-Z sanitizer=address" 120 | export ASAN_OPTIONS="detect_odr_violation=0" 121 | cargo +nightly build --release --target x86_64-unknown-linux-gnu 122 | target/x86_64-unknown-linux-gnu/release/stress2 --duration=240 123 | rm -rf default.sled 124 | - name: print backtraces with gdb 125 | if: ${{ failure() }} 126 | run: | 127 | sudo apt-get update 128 | sudo apt-get install gdb 129 | pushd benchmarks/stress2 130 | echo "first backtrace:" 131 | gdb target/release/stress2 core-dumps/* -batch -ex 'bt -frame-info source-and-location' 132 | echo "" 133 | echo "" 134 | echo "" 135 | echo "all backtraces:" 136 | gdb target/release/stress2 core-dumps/* -batch -ex 't a a bt -frame-info source-and-location' 137 | - uses: actions/upload-artifact@v4 138 | if: ${{ failure() }} 139 | with: 140 | name: linux-core-dumps 141 | path: | 142 | ./benchmarks/stress2/core-dumps/* 143 | ./benchmarks/stress2/target/release/stress2 144 | sanitizers: 145 | name: Sanitizers 146 | env: 147 | RUST_BACKTRACE: 1 148 | runs-on: ubuntu-latest 149 | steps: 150 | - uses: actions/checkout@v1 151 | - name: Cache rustup 152 | uses: actions/cache@v2 153 | env: 154 | cache-name: cache-sanitizers-target-and-lockfile 155 | with: 156 | path: | 157 | ~/.rustup 158 | benchmarks/stress2/target 159 | benchmarks/stress2/Cargo.lock 160 | key: ${{ runner.os }}-${{ env.cache-name }}-${{ hashFiles('**/Cargo.toml') }} 161 | - name: sanitizers 162 | run: | 163 | set -eo pipefail 164 | scripts/sanitizers.sh 165 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | CLAUDE.md 2 | fuzz-*.log 3 | default.sled 4 | timing_test* 5 | *db 6 | crash_test_files 7 | *conf 8 | *snap.* 9 | *grind.out* 10 | vgcore* 11 | *.bk 12 | *orig 13 | tags 14 | perf* 15 | *folded 16 | *out 17 | *perf 18 | *svg 19 | *txt 20 | experiments 21 | target 22 | Cargo.lock 23 | *swp 24 | *swo 25 | *.proptest-regressions 26 | corpus 27 | artifacts 28 | .idea 29 | cargo-timing* 30 | -------------------------------------------------------------------------------- /.rustfmt.toml: -------------------------------------------------------------------------------- 1 | version = "Two" 2 | use_small_heuristics = "Max" 3 | reorder_imports = true 4 | max_width = 80 5 | wrap_comments = true 6 | 
combine_control_expr = true 7 | report_todo = "Always" 8 | -------------------------------------------------------------------------------- /ARCHITECTURE.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 23 | 28 | 29 |
[banner table: links for sponsorship ("buy a coffee for us to convert into databases"), documentation, and chat ("chat about databases with us"); HTML markup stripped]
30 | 31 | # sled 1.0 architecture 32 | 33 | ## in-memory 34 | 35 | * Lock-free B+ tree index, extracted into the [`concurrent-map`](https://github.com/komora-io/concurrent-map) crate. 36 | * The lowest key from each leaf is stored in this in-memory index. 37 | * To read any leaf that is not already cached in memory, at most one disk read will be required. 38 | * RwLock-backed leaves, using the ArcRwLock from the [`parking_lot`](https://github.com/Amanieu/parking_lot) crate. As a `Db` grows, leaf contention tends to go down in most use cases. But this may be revisited over time if many users have issues with RwLock-related contention. Avoiding full RCU for updates on the leaves results in many of the performance benefits over sled 0.34, with significantly lower memory pressure. 39 | * A simple but very high-performance epoch-based reclamation technique is used for safely deferring frees of in-memory index data and reuse of on-disk heap slots, extracted into the [`ebr`](https://github.com/komora-io/ebr) crate. 40 | * A scan-resistant LRU is used for handling eviction. By default, 20% of the cache is reserved for leaves that are accessed at most once. This is configurable via `Config.entry_cache_percent`. This is handled by the extracted [`cache-advisor`](https://github.com/komora-io/cache-advisor) crate. The overall cache size is set by the `Config.cache_size` configurable. 41 | 42 | ## write path 43 | 44 | * This is where things get interesting. There is no traditional WAL. There is no LSM. Only metadata is logged atomically after objects are written in parallel. 45 | * The important guarantees are: 46 | * all previous writes are durable after a call to `Db::flush` (This is also called periodically in the background by a flusher thread) 47 | * all write batches written using `Db::apply_batch` are either 100% visible or 0% visible after crash recovery. If a batch was followed by a flush that returned `Ok(())`, it is guaranteed to be present. 48 | * Atomic ([linearizable](https://jepsen.io/consistency/models/linearizable)) durability is provided by marking dirty leaves as participants in "flush epochs" and performing atomic batch writes of the full epoch at a time, in order. Each call to `Db::flush` advances the current flush epoch by 1. 49 | * The atomic write consists of the following steps: 50 | 1. User code or the background flusher thread calls `Db::flush`. 51 | 1. In parallel (via [rayon](https://docs.rs/rayon)) serialize and compress each dirty leaf with zstd (configurable via `Config.zstd_compression_level`). 52 | 1. Based on the size of the bytes for each object, choose the smallest heap file slot that can hold the full set of bytes. This is an on-disk slab allocator. 53 | 1. Slab slots are not power-of-two sized, but tend to increase in size by around 20% from one to the next, resulting in far lower fragmentation than typical page-oriented heaps with either constant-size or power-of-two sized leaves. 54 | 1. Write the object to the allocated slot from the rayon threadpool. 55 | 1. After all writes, fsync the heap files that were written to. 56 | 1. If any writes were written to the end of the heap file, causing it to grow, fsync the directory that stores all heap files. 57 | 1. After the writes are stable, it is now safe to write an atomic metadata batch that records the location of each written leaf in the heap. 
This is a simple framed batch of `(low_key, slab_slot)` tuples that are initially written to a log, but eventually merged into a simple snapshot file for the metadata store once the log becomes larger than the snapshot file. 58 | 1. Fsync of the metadata log file. 59 | 1. Fsync of the metadata log directory. 60 | 1. After the atomic metadata batch write, the previously occupied slab slots are marked for future reuse with the epoch-based reclamation system. After all threads that may have witnessed the previous location have finished their work, the slab slot is added to the free `BinaryHeap` of the slab it belongs to so that it may be reused in future atomic write batches. 61 | 1. Return `Ok(())` to the caller of `Db::flush`. 62 | * The object writes that precede the metadata write land at effectively random offsets, but modern SSDs handle this well. Even though the SSD's FTL will be working harder to defragment things periodically than if we wrote a few megabytes sequentially with each write, the data that the FTL will be copying will be mostly live due to the eager leaf write-backs. 63 | 64 | ## recovery 65 | 66 | * Recovery involves simply reading the atomic metadata store that records the low key for each written leaf as well as its location and mapping it into the in-memory index. Any gaps in the slabs are then used as free slots. 67 | * Any write that failed to complete its entire atomic writebatch is treated as if it never happened, because no user-visible flush ever returned successfully. 68 | * Rayon is also used here for parallelizing reads of this metadata. In general, this is extremely fast compared to the previous sled recovery process. 69 | 70 | ## tuning 71 | 72 | * The larger the `LEAF_FANOUT` const generic on the high-level `Db` struct (default `1024`), the smaller the in-memory leaf index and the better the compression ratio of the on-disk file, but the more expensive it will be to read the entire leaf off of disk and decompress it. 73 | * You can choose to set `LEAF_FANOUT` relatively low to make the system behave more like an Index+Log architecture, but overall disk size will grow and write performance will decrease. 74 | * NB: changing `LEAF_FANOUT` after writing data is not supported. 75 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Unreleased 2 | 3 | ## New Features 4 | 5 | * #1178 batches and transactions are now unified for subscribers. 6 | * #1231 `Tree::get_zero_copy` allows for reading a value directly 7 | in-place without making an `IVec` first. 8 | * #1250 the global `print_profile` function has been added 9 | which is enabled when compiling with the `metrics` feature. 10 | * #1254 `IVec` data will now always have an alignment of 8, 11 | which may enable interesting architecture-specific use cases. 12 | * #1307 & #1315 `Db::contains_tree` can be used to see if a 13 | `Tree` with a given name already exists. 14 | 15 | ## Improvements 16 | 17 | * #1214 a new slab-style storage engine has been added which 18 | replaces the previous file-per-blob technique for storing 19 | large pages. 20 | * #1231 tree nodes now get merged into a single-allocation 21 | representation that is able to dynamically avoid various 22 | overheads, resulting in significant efficiency improvements. 23 | 24 | ## Breaking Changes 25 | 26 | * #1400 Bump MSRV to 1.57. 27 | * #1399 Thread support is now required on all platforms. 
28 | * #1135 The "no_metrics" anti-feature has been replaced with 29 | the "metrics" positive feature. 30 | * #1178 the `Event` enum has become a unified struct that allows 31 | subscribers to iterate over each (Tree, key, optional value) 32 | involved in single key operations, batches, or transactions in 33 | a unified way. 34 | * #1178 the `Event::key` method has been removed in favor of the 35 | new more comprehensive `iter` method. 36 | * #1214 The deprecated `Config::build` method has been removed. 37 | * #1248 The deprecated `Tree::set` method has been removed. 38 | * #1248 The deprecated `Tree::del` method has been removed. 39 | * #1250 The `Config::print_profile_on_drop` method has been 40 | removed in favor of the global `print_profile` function. 41 | * #1252 The deprecated `Db::open` method has been removed. 42 | * #1252 The deprecated `Config::segment_cleanup_skew` method 43 | has been removed. 44 | * #1252 The deprecated `Config::segment_cleanup_threshold` 45 | method has been removed. 46 | * #1252 The deprecated `Config::snapshot_path` method has 47 | been removed. 48 | * #1253 The `IVec::subslice` method has been removed. 49 | * #1275 Keys and values are now limited to 128gb on 64-bit 50 | platforms and 512mb on 32-bit platforms. 51 | * #1281 `Config`'s `cache_capacity` is now a usize, as u64 52 | doesn't make sense for things that must fit in memory anyway. 53 | * #1314 `Subscriber::next_timeout` now requires a mutable self 54 | reference. 55 | * #1349 The "measure_allocs" feature has been removed. 56 | * #1354 `Error` has been modified to be Copy, removing all 57 | heap-allocated variants. 58 | 59 | ## Bug Fixes 60 | 61 | * #1202 Fix a space leak where blobs were not 62 | removed when replaced by another blob. 63 | * #1229 the powerful ALICE crash consistency tool has been 64 | used to discover several crash vulnerabilities, now fixed. 65 | 66 | # 0.34.7 67 | 68 | ## Bug Fixes 69 | 70 | * #1314 Fix a bug in Subscriber's Future impl. 71 | 72 | # 0.34.6 73 | 74 | ## Improvements 75 | 76 | * documentation improved 77 | 78 | # 0.34.5 79 | 80 | ## Improvements 81 | 82 | * #1164 widen some trait bounds on trees and batches 83 | 84 | # 0.34.4 85 | 86 | ## New Features 87 | 88 | * #1151 `Send` is implemented for `Iter` 89 | * #1167 added `Tree::first` and `Tree::last` functions 90 | to retrieve the first or last items in a `Tree`, unless 91 | the `Tree` is empty. 92 | 93 | ## Bug Fixes 94 | 95 | * #1159 dropping a `Db` instance will no-longer 96 | prematurely shut-down the background flusher 97 | thread. 98 | * #1168 fixed an issue that was causing panics during 99 | recovery in 32-bit code. 100 | * #1170 when encountering corrupted storage data, 101 | the recovery process will panic less often. 102 | 103 | # 0.34.3 104 | 105 | ## New Features 106 | 107 | * #1146 added `TransactionalTree::generate_id` 108 | 109 | # 0.34.2 110 | 111 | ## Improvements 112 | 113 | * #1133 transactions and writebatch performance has been 114 | significantly improved by removing a bottleneck in 115 | the atomic batch stability tracking code. 116 | 117 | # 0.34.1 118 | 119 | ## New Features 120 | 121 | * #1136 Added the `TransactionalTree::flush` method to 122 | flush the underlying database after the transaction 123 | commits and before the transaction returns. 124 | 125 | # 0.34 126 | 127 | ## Improvements 128 | 129 | * #1132 implemented From for io::Error to 130 | reduce friction in some situations. 
131 | 132 | ## Breaking Changes 133 | 134 | * #1131 transactions performed on `Tree`s from different 135 | `Db`s will now safely fail. 136 | * #1131 transactions may now only be performed on tuples 137 | of up to 14 elements. For higher numbers, please use 138 | slices. 139 | 140 | # 0.33 141 | 142 | ## Breaking Changes 143 | 144 | * #1125 the backtrace crate has been made optional, which 145 | cuts several seconds off compilation time, but may cause 146 | breakage if you interacted with the backtrace field 147 | of corruption-related errors. 148 | 149 | ## Bug Fixes 150 | 151 | * #1128 `Tree::pop_min` and `Tree::pop_max` had a bug where 152 | they were not atomic. 153 | 154 | # 0.32.1 155 | 156 | ## New Features 157 | 158 | * #1116 `IVec::subslice` has been added to facilitate 159 | creating zero-copy subsliced `IVec`s that are backed 160 | by the same data. 161 | 162 | ## Bug Fixes 163 | 164 | * #1120 Fixed a use-after-free caused by missing `ref` keyword 165 | on a `Copy` type in a pattern match in `IVec::as_mut`. 166 | * #1108 conversions from `Box<[u8]>` to `IVec` are fixed. 167 | 168 | # 0.32 169 | 170 | ## New Features 171 | 172 | * #1079 `Transactional` is now implemented for 173 | `[&Tree]` and `[Tree]` so you can avoid the 174 | previous friction of using tuples, as was 175 | necessary previously. 176 | * #1058 The minimum supported Rust version (MSRV) 177 | is now 1.39.0. 178 | * #1037 `Subscriber` now implements `Future` (non-fused) 179 | so prefix watching may now be iterated over via 180 | `while let Some(event) = (&mut subscriber).await {}` 181 | 182 | ## Improvements 183 | 184 | * #965 concurrency control is now dynamically enabled 185 | for atomic point operations, so that it may be 186 | avoided unless transactional functionality is 187 | being used in the system. This significantly 188 | increases performance for workloads that do not 189 | use transactions. 190 | * A number of memory optimizations have been implemented. 191 | * Disk usage has been significantly reduced for many 192 | workloads. 193 | * #1016 On 64-bit systems, we can now store 1-2 trillion items. 194 | * #993 Added DerefMut and AsMut<[u8]> for `IVec` where it 195 | works similarly to a `Cow`, making a private copy 196 | if the backing `Arc`'s strong count is not 1. 197 | * #1020 The sled wiki has been moved into the documentation 198 | itself, and is accessible through the `doc` module 199 | exported in lib. 200 | 201 | ## Breaking Changes 202 | 203 | * #975 Changed the default `segment_size` from 8m to 512k. 204 | This will result in far smaller database files due 205 | to better file garbage collection granularity. 206 | * #975 deprecated several `Config` options that will be 207 | removed over time. 208 | * #1000 rearranged some transaction-related imports, and 209 | moved them to the `transaction` module away from 210 | the library root to keep the top level docs clean. 211 | * #1015 `TransactionalTree::apply_batch` now accepts 212 | its argument by reference instead of by value. 213 | * `Event` has been changed to make the inner fields 214 | named instead of anonymous. 215 | * #1057 read-only mode has been removed due to not having 216 | the resources to properly keep it tested while 217 | making progress on high priority issues. This may 218 | be correctly implemented in the future if resources 219 | permit. 220 | * The conversion between `Box<[u8]>` and `IVec` has 221 | been temporarily removed. This is re-added in 0.32.1. 
222 | 223 | # 0.31 224 | 225 | ## Improvements 226 | 227 | * #947 dramatic read and recovery optimizations 228 | * #921 reduced the reliance on locks while 229 | performing multithreaded IO on windows. 230 | * #928 use `sync_file_range` on linux instead 231 | of a full fsync for most writes. 232 | * #946 io_uring support changed to the `rio` crate 233 | * #939 reduced memory consumption during 234 | zstd decompression 235 | 236 | ## Breaking Changes 237 | 238 | * #927 use SQLite-style varints for serializing 239 | `u64`. This dramatically reduces the written 240 | bytes for databases that store small keys and 241 | values. 242 | * #943 use varints for most of the fields in 243 | message headers, causing an additional large 244 | space reduction. combined with #927, these 245 | changes reduce bytes written by 68% for workloads 246 | writing small items. 247 | 248 | # 0.30.3 249 | 250 | * Documentation-only release 251 | 252 | # 0.30.2 253 | 254 | ## New Features 255 | 256 | * Added the `open` function for quickly 257 | opening a database at a path with default 258 | configuration. 259 | 260 | # 0.30.1 261 | 262 | ## Bugfixes 263 | 264 | * Fixed an issue where an idle threadpool worker 265 | would spin in a hot loop until work arrived 266 | 267 | # 0.30 268 | 269 | ## Breaking Changes 270 | 271 | * Migrated to a new storage format 272 | 273 | ## Bugfixes 274 | 275 | * Fixed a bug where cache was not being evicted. 276 | * Fixed a bug with using transactions with 277 | compression. 278 | 279 | # 0.29.2 280 | 281 | ## New Features 282 | 283 | * The `create_new` option has been added 284 | to `Config`, allowing the user to specify 285 | that a database should only be freshly 286 | created, rather than re-opened. 287 | 288 | # 0.29.1 289 | 290 | ## Bugfixes 291 | 292 | * Fixed a bug where prefix encoding could be 293 | incorrectly handled when merging nodes together. 294 | 295 | # 0.29 296 | 297 | ## New Features 298 | 299 | * The `Config::open` method has been added to give 300 | `Config` a similar feel to std's `fs::OpenOptions`. 301 | The `Config::build` and `Db::start` methods are 302 | now deprecated in favor of calling `Config::open` 303 | directly. 304 | * A `checksum` method has been added to Tree and Db 305 | for use in verifying backups and migrations. 306 | * Transactions may now involve up to 69 different 307 | tables. Nice. 308 | * The `TransactionError::Abort` variant has had 309 | a generic member added that can be returned 310 | as a way to return information from a 311 | manually-aborted transaction. An `abort` helper 312 | function has been added to reduce the boiler- 313 | plate required to return aborted results. 314 | 315 | ## Breaking Changes 316 | 317 | * The `ConfigBuilder` structure has been removed 318 | in favor of a simplified `Config` structure 319 | with the same functionality. 320 | * The way that sled versions are detected at 321 | initialization time is now independent of serde. 322 | * The `cas` method is deprecated in favor of the new 323 | `compare_and_swap` method which now returns the 324 | proposed value that failed to be applied. 325 | * Tree nodes now have constant prefix encoding 326 | lengths. 327 | * The `io_buf_size` configurable renamed to 328 | `segment_size`. 329 | * The `io_buf_size` configurable method has been 330 | removed from ConfigBuilder. This can be manually 331 | set by setting the attribute directly on the 332 | ConfigBuilder, but this is discouraged. 333 | Additionally, this must now be a power of 2. 
334 | * The `page_consolidation_threshold` method has been 335 | removed from ConfigBuilder, and this is now 336 | a constant of 10. 337 | 338 | # 0.28 339 | 340 | ## Breaking Changes 341 | 342 | * `Iter` no longer has a lifetime parameter. 343 | * `Db::open_tree` now returns a `Tree` instead of 344 | an `Arc`. `Tree` now has an inner type that 345 | uses an `Arc`, so you don't need to think about it. 346 | 347 | ## Bug Fixes 348 | 349 | * A bug with prefix encoding has been fixed that 350 | led to nodes with keys longer than 256 bytes 351 | being stored incorrectly, which led to them 352 | being inaccessible and also leading to infinite 353 | loops during iteration. 354 | * Several cases of incorrect unsafe code were removed 355 | from the sled crate. No bugs are known to have been 356 | encountered, but they may have resulted in 357 | incorrect optimizations in future refactors. 358 | 359 | # 0.27 360 | 361 | ## Breaking Changes 362 | 363 | * `Event::Set` has been renamed to `Event::Insert` and 364 | `Event::Del` has been renamed to `Event::Remove`. These 365 | names better align with the methods of BTreeMap from 366 | the standard library. 367 | 368 | ## Bug Fixes 369 | 370 | * A deadlock was possible in very high write volume 371 | situations when the segment accountant lock was 372 | taken by all IO threads while a task was blocked 373 | trying to submit a file truncation request to the 374 | threadpool while holding the segment accountant lock. 375 | 376 | ## New Features 377 | 378 | * `flush_async` has been added to perform time-intensive 379 | flushing in an asynchronous manner, returning a Future. 380 | 381 | # 0.26.1 382 | 383 | ## Improvements 384 | 385 | * std::thread is no longer used on platforms other than 386 | linux, macos, and windows, which increases portability. 387 | 388 | # 0.26 389 | 390 | ## New Features 391 | 392 | * Transactions! You may now call `Tree::transaction` and 393 | perform reads, writes, and deletes within a provided 394 | closure with a `TransactionalTree` argument. This 395 | closure may be called multiple times if the transaction 396 | encounters a concurrent update in the process of its 397 | execution. Transactions may also be used on tuples of 398 | `Tree` objects, where the closure will then be 399 | parameterized on `TransactionalTree` instances providing 400 | access to each of the provided `Tree` instances. This 401 | allows you to atomically read and modify multiple 402 | `Tree` instances in a single atomic operation. 403 | These transactions are serializable, fully ACID, 404 | and optimistic. 405 | * `Tree::apply_batch` allows you to apply a `Batch` 406 | * `TransactionalTree::apply_batch` allow you to 407 | apply a `Batch` from within a transaction. 408 | 409 | ## Breaking Changes 410 | 411 | * `Tree::batch` has been removed. Now you can directly 412 | create a `Batch` with `Batch::default()` and then apply 413 | it to a `Tree` with `Tree::apply_batch` or during a 414 | transaction using `TransactionalTree::apply_batch`. 415 | This facilitates multi-`Tree` batches via transactions. 416 | * `Event::Merge` has been removed, and `Tree::merge` will 417 | now send a complete `Event::Set` item to be distributed 418 | to all listening subscribers. 419 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Welcome to the Project :) 2 | 3 | * Don't be a jerk - here's our [code of conduct](./code-of-conduct.md). 
4 | We have a track record of defending our community from harm. 5 | 6 | There are at least three great ways to contribute to sled: 7 | 8 | * [financial contribution](https://github.com/sponsors/spacejam) 9 | * coding 10 | * conversation 11 | 12 | #### Coding Considerations: 13 | 14 | Please don't waste your time or ours by implementing things that 15 | we do not want to introduce and maintain. Please discuss in an 16 | issue or on chat before submitting a PR with: 17 | 18 | * public API changes 19 | * new functionality of any sort 20 | * additional unsafe code 21 | * significant refactoring 22 | 23 | The above changes are unlikely to be merged or receive 24 | timely attention without prior discussion. 25 | 26 | PRs that generally require less coordination beforehand: 27 | 28 | * Anything addressing a correctness issue. 29 | * Better docs: whatever you find confusing! 30 | * Small code changes with big performance implications, substantiated with [responsibly-gathered metrics](https://sled.rs/perf#experiment-checklist). 31 | * FFI submodule changes: these are generally less well maintained than the Rust core, and benefit more from public assistance. 32 | * Generally any new kind of test that avoids biases inherent in the others. 33 | 34 | #### All PRs block on failing tests! 35 | 36 | sled has intense testing, including crash tests, multi-threaded tests with 37 | delay injection, a variety of mechanically-generated tests that combine fault 38 | injection with concurrency in interesting ways, cross-compilation and minimum 39 | supported Rust version checks, LLVM sanitizers, and more. It can sometimes be 40 | challenging to understand why something is failing these intense tests. 41 | 42 | To better understand test failures, please: 43 | 44 | 1. read the failing test name and output log for clues 45 | 1. try to reproduce the failed test locally by running its associated command from the [test script](https://github.com/spacejam/sled/blob/main/.github/workflows/test.yml) 46 | 1. If it is still not clear why your test is failing, feel free to ask for help either on discord or on the PR, and we will do our best to help. 47 | 48 | Want to help sled but don't have time for individual contributions? Contribute via [GitHub Sponsors](https://github.com/sponsors/spacejam) to support the people pushing the project forward! 49 | -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "sled" 3 | version = "1.0.0-alpha.124" 4 | edition = "2024" 5 | authors = ["Tyler Neely "] 6 | documentation = "https://docs.rs/sled/" 7 | description = "Lightweight high-performance pure-rust transactional embedded database." 
8 | license = "MIT OR Apache-2.0" 9 | homepage = "https://github.com/spacejam/sled" 10 | repository = "https://github.com/spacejam/sled" 11 | keywords = ["redis", "mongo", "sqlite", "lmdb", "rocksdb"] 12 | categories = ["database-implementations", "concurrency", "data-structures", "algorithms", "caching"] 13 | readme = "README.md" 14 | exclude = ["benchmarks", "examples", "bindings", "scripts", "experiments"] 15 | 16 | [features] 17 | # initializes allocated memory to 0xa1, writes 0xde to deallocated memory before freeing it 18 | testing-shred-allocator = [] 19 | # use a counting global allocator that provides the sled::alloc::{allocated, freed, resident, reset} functions 20 | testing-count-allocator = [] 21 | for-internal-testing-only = [] 22 | # turn off re-use of object IDs and heap slots, disable tree leaf merges, disable heap file truncation. 23 | monotonic-behavior = [] 24 | 25 | [profile.release] 26 | debug = true 27 | opt-level = 3 28 | overflow-checks = true 29 | panic = "abort" 30 | 31 | [profile.test] 32 | debug = true 33 | overflow-checks = true 34 | panic = "abort" 35 | 36 | [dependencies] 37 | bincode = "1.3.3" 38 | cache-advisor = "1.0.16" 39 | concurrent-map = { version = "5.0.31", features = ["serde"] } 40 | crc32fast = "1.3.2" 41 | ebr = "0.2.13" 42 | inline-array = { version = "0.1.13", features = ["serde", "concurrent_map_minimum"] } 43 | fs2 = "0.4.3" 44 | log = "0.4.19" 45 | pagetable = "0.4.5" 46 | parking_lot = { version = "0.12.1", features = ["arc_lock"] } 47 | rayon = "1.7.0" 48 | serde = { version = "1.0", features = ["derive"] } 49 | stack-map = { version = "1.0.5", features = ["serde"] } 50 | zstd = "0.12.4" 51 | fnv = "1.0.7" 52 | fault-injection = "1.0.10" 53 | crossbeam-queue = "0.3.8" 54 | crossbeam-channel = "0.5.8" 55 | tempdir = "0.3.7" 56 | 57 | [dev-dependencies] 58 | env_logger = "0.10.0" 59 | num-format = "0.4.4" 60 | # heed = "0.11.0" 61 | # rocksdb = "0.21.0" 62 | # rusqlite = "0.29.0" 63 | # old_sled = { version = "0.34", package = "sled" } 64 | rand = "0.9" 65 | quickcheck = "1.0.3" 66 | rand_distr = "0.5" 67 | libc = "0.2.147" 68 | 69 | [[test]] 70 | name = "test_crash_recovery" 71 | path = "tests/test_crash_recovery.rs" 72 | harness = false 73 | 74 | -------------------------------------------------------------------------------- /LICENSE-APACHE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 
25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. 
If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. 
Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "{}" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright 2015 Tyler Neely 190 | Copyright 2016 Tyler Neely 191 | Copyright 2017 Tyler Neely 192 | Copyright 2018 Tyler Neely 193 | Copyright 2019 Tyler Neely 194 | Copyright 2020 Tyler Neely 195 | Copyright 2021 Tyler Neely 196 | Copyright 2022 Tyler Neely 197 | Copyright 2023 Tyler Neely 198 | 199 | Licensed under the Apache License, Version 2.0 (the "License"); 200 | you may not use this file except in compliance with the License. 201 | You may obtain a copy of the License at 202 | 203 | http://www.apache.org/licenses/LICENSE-2.0 204 | 205 | Unless required by applicable law or agreed to in writing, software 206 | distributed under the License is distributed on an "AS IS" BASIS, 207 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
208 | See the License for the specific language governing permissions and 209 | limitations under the License. 210 | -------------------------------------------------------------------------------- /LICENSE-MIT: -------------------------------------------------------------------------------- 1 | Copyright (c) 2015 Tyler Neely 2 | Copyright (c) 2016 Tyler Neely 3 | Copyright (c) 2017 Tyler Neely 4 | Copyright (c) 2018 Tyler Neely 5 | Copyright (c) 2019 Tyler Neely 6 | Copyright (c) 2020 Tyler Neely 7 | Copyright (c) 2021 Tyler Neely 8 | Copyright (c) 2022 Tyler Neely 9 | Copyright (c) 2023 Tyler Neely 10 | 11 | Permission is hereby granted, free of charge, to any 12 | person obtaining a copy of this software and associated 13 | documentation files (the "Software"), to deal in the 14 | Software without restriction, including without 15 | limitation the rights to use, copy, modify, merge, 16 | publish, distribute, sublicense, and/or sell copies of 17 | the Software, and to permit persons to whom the Software 18 | is furnished to do so, subject to the following 19 | conditions: 20 | 21 | The above copyright notice and this permission notice 22 | shall be included in all copies or substantial portions 23 | of the Software. 24 | 25 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF 26 | ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED 27 | TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A 28 | PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT 29 | SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 30 | CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 31 | OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR 32 | IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 33 | DEALINGS IN THE SOFTWARE. 34 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 24 | 29 | 30 |
[banner table: links for sponsorship ("buy a coffee for us to convert into databases"), documentation, and chat ("chat about databases with us"); HTML markup stripped]
31 | 32 | 33 | # sled - ~~it's all downhill from here!!!~~ 34 | 35 | An embedded database. 36 | 37 | ```rust 38 | let tree = sled::open("/tmp/welcome-to-sled")?; 39 | 40 | // insert and get, similar to std's BTreeMap 41 | let old_value = tree.insert("key", "value")?; 42 | 43 | assert_eq!( 44 | tree.get(&"key")?, 45 | Some(sled::IVec::from("value")), 46 | ); 47 | 48 | // range queries 49 | for kv_result in tree.range("key_1".."key_9") {} 50 | 51 | // deletion 52 | let old_value = tree.remove(&"key")?; 53 | 54 | // atomic compare and swap 55 | tree.compare_and_swap( 56 | "key", 57 | Some("current_value"), 58 | Some("new_value"), 59 | )?; 60 | 61 | // block until all operations are stable on disk 62 | // (flush_async also available to get a Future) 63 | tree.flush()?; 64 | ``` 65 | 66 | $${\color{red}This \space README \space is \space out \space of \space sync \space with \space the \space main \space branch \space which \space contains \space a \space large \space in-progress \space rewrite }$$ 67 | 68 | If you would like to work with structured data without paying expensive deserialization costs, check out the [structured](examples/structured.rs) example! 69 | 70 | # features 71 | 72 | * [API](https://docs.rs/sled) similar to a threadsafe `BTreeMap<[u8], [u8]>` 73 | * serializable (ACID) [transactions](https://docs.rs/sled/latest/sled/struct.Tree.html#method.transaction) 74 | for atomically reading and writing to multiple keys in multiple keyspaces. 75 | * fully atomic single-key operations, including [compare and swap](https://docs.rs/sled/latest/sled/struct.Tree.html#method.compare_and_swap) 76 | * zero-copy reads 77 | * [write batches](https://docs.rs/sled/latest/sled/struct.Tree.html#method.apply_batch) 78 | * [subscribe to changes on key 79 | prefixes](https://docs.rs/sled/latest/sled/struct.Tree.html#method.watch_prefix) 80 | * [multiple keyspaces](https://docs.rs/sled/latest/sled/struct.Db.html#method.open_tree) 81 | * [merge operators](https://docs.rs/sled/latest/sled/doc/merge_operators/index.html) 82 | * forward and reverse iterators over ranges of items 83 | * a crash-safe monotonic [ID generator](https://docs.rs/sled/latest/sled/struct.Db.html#method.generate_id) 84 | capable of generating 75-125 million unique ID's per second 85 | * [zstd](https://github.com/facebook/zstd) compression (use the 86 | `compression` build feature, disabled by default) 87 | * cpu-scalable lock-free implementation 88 | * flash-optimized log-structured storage 89 | * uses modern b-tree techniques such as prefix encoding and suffix 90 | truncation for reducing the storage costs of long keys with shared 91 | prefixes. If keys are the same length and sequential then the 92 | system can avoid storing 99%+ of the key data in most cases, 93 | essentially acting like a learned index 94 | 95 | # expectations, gotchas, advice 96 | 97 | * Maybe one of the first things that seems weird is the `IVec` type. 98 | This is an inlinable `Arc`ed slice that makes some things more efficient. 99 | * Durability: **sled automatically fsyncs every 500ms by default**, 100 | which can be configured with the `flush_every_ms` configurable, or you may 101 | call `flush` / `flush_async` manually after operations. 102 | * **Transactions are optimistic** - do not interact with external state 103 | or perform IO from within a transaction closure unless it is 104 | [idempotent](https://en.wikipedia.org/wiki/Idempotent). 
105 | * Internal tree node optimizations: sled performs prefix encoding 106 | on long keys with similar prefixes that are grouped together in a range, 107 | as well as suffix truncation to further reduce the indexing costs of 108 | long keys. Nodes will skip potentially expensive length and offset pointers 109 | if keys or values are all the same length (tracked separately, don't worry 110 | about making keys the same length as values), so it may improve space usage 111 | slightly if you use fixed-length keys or values. This also makes it easier 112 | to use [structured access](examples/structured.rs) as well. 113 | * sled does not support multiple open instances for the time being. Please 114 | keep sled open for the duration of your process's lifespan. It's totally 115 | safe and often quite convenient to use a global lazy_static sled instance, 116 | modulo the normal global variable trade-offs. Every operation is threadsafe, 117 | and most are implemented under the hood with lock-free algorithms that avoid 118 | blocking in hot paths. 119 | 120 | # performance 121 | 122 | * [LSM tree](https://en.wikipedia.org/wiki/Log-structured_merge-tree)-like write performance 123 | with [traditional B+ tree](https://en.wikipedia.org/wiki/B%2B_tree)-like read performance 124 | * over a billion operations in under a minute at 95% read 5% writes on 16 cores on a small dataset 125 | * measure your own workloads rather than relying on some marketing for contrived workloads 126 | 127 | # a note on lexicographic ordering and endianness 128 | 129 | If you want to store numerical keys in a way that will play nicely with sled's iterators and ordered operations, please remember to store your numerical items in big-endian form. Little endian (the default of many things) will often appear to be doing the right thing until you start working with more than 256 items (more than 1 byte), causing lexicographic ordering of the serialized bytes to diverge from the lexicographic ordering of their deserialized numerical form. 130 | 131 | * Rust integral types have built-in `to_be_bytes` and `from_be_bytes` [methods](https://doc.rust-lang.org/std/primitive.u64.html#method.from_be_bytes). 132 | * bincode [can be configured](https://docs.rs/bincode/1.2.0/bincode/struct.Config.html#method.big_endian) to store integral types in big-endian form. 133 | 134 | # interaction with async 135 | 136 | If your dataset resides entirely in cache (achievable at startup by setting the cache 137 | to a large enough value and performing a full iteration) then all reads and writes are 138 | non-blocking and async-friendly, without needing to use Futures or an async runtime. 139 | 140 | To asynchronously suspend your async task on the durability of writes, we support the 141 | [`flush_async` method](https://docs.rs/sled/latest/sled/struct.Tree.html#method.flush_async), 142 | which returns a Future that your async tasks can await the completion of if they require 143 | high durability guarantees and you are willing to pay the latency costs of fsync. 144 | Note that sled automatically tries to sync all data to disk several times per second 145 | in the background without blocking user threads. 
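As a rough sketch of what awaiting durability looks like in practice — assuming the 0.34-era API shown in this README, where `flush_async` resolves to `Result<usize>` (the number of bytes flushed); the database path and the choice of the tiny `extreme` executor (also used in the subscription example below) are purely illustrative:

```rust
// Minimal sketch: await durability of a write from an async task.
// Assumes the 0.34-era `flush_async` that resolves to sled::Result<usize>;
// check your sled version's docs for the exact signature.
fn main() -> sled::Result<()> {
    let tree = sled::open("/tmp/flush-async-example")?;

    tree.insert(b"key", b"value")?;

    // Any async runtime can await this future; `extreme` is just a
    // dependency-free executor that matches the example below.
    extreme::run(async move {
        match tree.flush_async().await {
            Ok(bytes_flushed) => {
                println!("durable: {} bytes flushed to disk", bytes_flushed)
            }
            Err(e) => eprintln!("flush failed: {}", e),
        }
    });

    Ok(())
}
```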
146 | 147 | We support async subscription to events that happen on key prefixes, because the 148 | `Subscriber` struct implements `Future<Output = Option<Event>>`: 149 | 150 | ```rust 151 | let sled = sled::open("my_db").unwrap(); 152 | 153 | let mut sub = sled.watch_prefix(""); 154 | 155 | sled.insert(b"a", b"a").unwrap(); 156 | 157 | extreme::run(async move { 158 | while let Some(event) = (&mut sub).await { 159 | println!("got event {:?}", event); 160 | } 161 | }); 162 | ``` 163 | 164 | # minimum supported Rust version (MSRV) 165 | 166 | We support Rust 1.62 and up. 167 | 168 | # architecture 169 | 170 | lock-free tree on a lock-free pagecache on a lock-free log. the pagecache scatters 171 | partial page fragments across the log, rather than rewriting entire pages at a time 172 | as B+ trees for spinning disks historically have. on page reads, we concurrently 173 | scatter-gather reads across the log to materialize the page from its fragments. 174 | check out the [architectural outlook](https://github.com/spacejam/sled/wiki/sled-architectural-outlook) 175 | for a more detailed overview of where we're at and where we see things going! 176 | 177 | # philosophy 178 | 179 | 1. don't make the user think. the interface should be obvious. 180 | 1. don't surprise users with performance traps. 181 | 1. don't wake up operators. bring reliability techniques from academia into real-world practice. 182 | 1. don't use so much electricity. our data structures should play to modern hardware's strengths. 183 | 184 | # known issues, warnings 185 | 186 | * if reliability is your primary constraint, use SQLite. sled is beta. 187 | * if storage price-performance is your primary constraint, use RocksDB. sled uses too much space sometimes. 188 | * if you have a multi-process workload that rarely writes, use LMDB. sled is architected for use with long-running, highly-concurrent workloads such as stateful services or higher-level databases. 189 | * sled is quite young and should be considered unstable for the time being. 190 | * the on-disk format is going to change in ways that require [manual migrations](https://docs.rs/sled/latest/sled/struct.Db.html#method.export) before the `1.0.0` release! 191 | 192 | # priorities 193 | 194 | 1. A full rewrite of sled's storage subsystem is happening on a modular basis as part of the [komora project](https://github.com/komora-io), in particular the marble storage engine. This will dramatically lower both the disk space usage (space amplification) and garbage collection overhead (write amplification) of sled. 195 | 2. The memory layout of tree nodes is being completely rewritten to reduce fragmentation and eliminate serialization costs. 196 | 3. The merge operator feature will change into a trigger feature that resembles traditional database triggers, allowing state to be modified as part of the same atomic writebatch that triggered it, retaining serializability while enabling reactive semantics. 197 | 198 | # fund feature development 199 | 200 | Like what we're doing? Help us out via [GitHub Sponsors](https://github.com/sponsors/spacejam)! 201 | -------------------------------------------------------------------------------- /RELEASE_CHECKLIST.md: -------------------------------------------------------------------------------- 1 | # Release Checklist 2 | 3 | This checklist must be completed before publishing a release of any kind. 4 | 5 | Over time, anything in this list that can be turned into an automated test should be, but 6 | there are still some big blind spots.
7 | 8 | ## API stability 9 | 10 | - [ ] rust-flavored semver respected 11 | 12 | ## Performance 13 | 14 | - [ ] micro-benchmark regressions should not happen unless newly discovered correctness criteria demands them 15 | - [ ] mixed point operation latency distribution should narrow over time 16 | - [ ] sequential operation average throughput should increase over time 17 | - [ ] workloads should pass TSAN and ASAN on macOS. Linux should additionally pass LSAN & MSAN. 18 | - [ ] workload write and space amplification thresholds should see no regressions 19 | 20 | ## Concurrency Audit 21 | 22 | - [ ] any new `Guard` objects are dropped inside the rayon threadpool 23 | - [ ] no new EBR `Collector`s, as they destroy causality. These will be optimized in-bulk in the future. 24 | - [ ] no code assumes a recently read page pointer will remain unchanged (transactions may change this if reads are inline) 25 | - [ ] no calls to `rand::thread_rng` from a droppable function (anything in the SegmentAccountant) 26 | 27 | ## Burn-In 28 | 29 | - [ ] fuzz tests should run at least 24 hours each with zero crashes 30 | - [ ] sequential and point workloads run at least 24 hours in constrained docker container without OOM / out of disk 31 | -------------------------------------------------------------------------------- /SAFETY.md: -------------------------------------------------------------------------------- 1 | # sled safety model 2 | 3 | This document applies 4 | [STPA](http://psas.scripts.mit.edu/home/get_file.php?name=STPA_handbook.pdf)-style 5 | hazard analysis to the sled embedded database for the purpose of guiding 6 | design and testing efforts to prevent unacceptable losses. 7 | 8 | Outline 9 | 10 | * [purpose of analysis](#purpose-of-analysis) 11 | * [losses](#losses) 12 | * [system boundary](#system-boundary) 13 | * [hazards](#hazards) 14 | * [leading indicators](#leading-indicators) 15 | * [constraints](#constraints) 16 | * [model of control structure](#model-of-control-structure) 17 | * [identify unsafe control actions](#identify-unsafe-control-actions) 18 | * [identify loss scenarios][#identify-loss-scenarios) 19 | * [resources for learning more about STAMP, STPA, and CAST](#resources) 20 | 21 | # Purpose of Analysis 22 | 23 | ## Losses 24 | 25 | We wish to prevent the following undesirable situations: 26 | 27 | * data loss 28 | * inconsistent (non-linearizable) data access 29 | * process crash 30 | * resource exhaustion 31 | 32 | ## System Boundary 33 | 34 | We draw the line between system and environment where we can reasonably 35 | invest our efforts to prevent losses. 
36 | 37 | Inside the boundary: 38 | 39 | * codebase 40 | * put safe control actions into place that prevent losses 41 | * documentation 42 | * show users how to use sled safely 43 | * recommend hardware, kernels, user code 44 | 45 | Outside the boundary: 46 | 47 | * Direct changes to hardware, kernels, user code 48 | 49 | ## Hazards 50 | 51 | These hazards can result in the above losses: 52 | 53 | * data may be lost if 54 | * bugs in the logging system 55 | * `Db::flush` fails to make previous writes durable 56 | * bugs in the GC system 57 | * the old location is overwritten before the defragmented location becomes durable 58 | * bugs in the recovery system 59 | * hardare failures 60 | * consistency violations may be caused by 61 | * transaction concurrency control failure to enforce linearizability (strict serializability) 62 | * non-linearizable lock-free single-key operations 63 | * panic 64 | * of user threads 65 | * IO threads 66 | * flusher & GC thread 67 | * indexing 68 | * unwraps/expects 69 | * failed TryInto/TryFrom + unwrap 70 | * persistent storage exceeding (2 + N concurrent writers) * logical data size 71 | * in-memory cache exceeding the configured cache size 72 | * caused by incorrect calculation of cache 73 | * use-after-free 74 | * data race 75 | * memory leak 76 | * integer overflow 77 | * buffer overrun 78 | * uninitialized memory access 79 | 80 | ## Constraints 81 | 82 | # Models of Control Structures 83 | 84 | for each control action we have, consider: 85 | 86 | 1. what hazards happen when we fail to apply it / it does not exist? 87 | 2. what hazards happen when we do apply it 88 | 3. what hazards happen when we apply it too early or too late? 89 | 4. what hazards happen if we apply it for too long or not long enough? 90 | 91 | durability model 92 | 93 | * recovery 94 | * LogIter::max_lsn 95 | * return None if last_lsn_in_batch >= self.max_lsn 96 | * batch requirement set to last reservation base + inline len - 1 97 | * reserve bumps 98 | * bump_atomic_lsn(&self.iobufs.max_reserved_lsn, reservation_lsn + inline_buf_len as Lsn - 1); 99 | 100 | lock-free linearizability model 101 | 102 | transactional linearizability (strict serializability) model 103 | 104 | panic model 105 | 106 | memory usage model 107 | 108 | storage usage model 109 | 110 | -------------------------------------------------------------------------------- /SECURITY.md: -------------------------------------------------------------------------------- 1 | # Security Policy 2 | 3 | ## Reporting a Vulnerability 4 | 5 | sled uses some unsafe functionality in the core lock-free algorithms, and in a few places to more efficiently copy data. 6 | 7 | Please contact [Tyler Neely](mailto:tylerneely@gmail.com?subject=sled%20security%20issue) immediately if you find any vulnerability, and I will work with you to fix the issue rapidly and coordinate public disclosure with an expedited release including the fix. 8 | 9 | If you are a bug hunter or a person with a security interest, here is my mental model of memory corruption risk in the sled codebase: 10 | 11 | 1. memory issues relating to the lock-free data structures in their colder failure paths. these have been tested a bit by injecting delays into random places, but this is still an area with elevated risk 12 | 1. 
anywhere the `unsafe` keyword is used 13 | -------------------------------------------------------------------------------- /art/CREDITS: -------------------------------------------------------------------------------- 1 | original tree logo with face: 2 | https://twitter.com/daiyitastic 3 | 4 | anti-transphobia additions: 5 | spacejam 6 | -------------------------------------------------------------------------------- /art/tree_face.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/spacejam/sled/e72f725be0f20e3e6e53a9d84cdbefa0dcbcde1c/art/tree_face.png -------------------------------------------------------------------------------- /art/tree_face_anti-transphobia.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/spacejam/sled/e72f725be0f20e3e6e53a9d84cdbefa0dcbcde1c/art/tree_face_anti-transphobia.png -------------------------------------------------------------------------------- /code-of-conduct.md: -------------------------------------------------------------------------------- 1 | # Contributor Covenant Code of Conduct 2 | 3 | ## Our Pledge 4 | 5 | In the interest of fostering an open and welcoming environment, we as 6 | contributors and maintainers pledge to making participation in our project and 7 | our community a harassment-free experience for everyone, regardless of age, body 8 | size, disability, ethnicity, sex characteristics, gender identity and expression, 9 | level of experience, education, socio-economic status, nationality, personal 10 | appearance, race, religion, or sexual identity and orientation. 11 | 12 | ## Our Standards 13 | 14 | Examples of behavior that contributes to creating a positive environment 15 | include: 16 | 17 | * Using welcoming and inclusive language 18 | * Being respectful of differing viewpoints and experiences 19 | * Gracefully accepting constructive criticism 20 | * Focusing on what is best for the community 21 | * Showing empathy towards other community members 22 | 23 | Examples of unacceptable behavior by participants include: 24 | 25 | * The use of sexualized language or imagery and unwelcome sexual attention or 26 | advances 27 | * Trolling, insulting/derogatory comments, and personal or political attacks 28 | * Public or private harassment 29 | * Publishing others' private information, such as a physical or electronic 30 | address, without explicit permission 31 | * Other conduct which could reasonably be considered inappropriate in a 32 | professional setting 33 | 34 | ## Our Responsibilities 35 | 36 | Project maintainers are responsible for clarifying the standards of acceptable 37 | behavior and are expected to take appropriate and fair corrective action in 38 | response to any instances of unacceptable behavior. 39 | 40 | Project maintainers have the right and responsibility to remove, edit, or 41 | reject comments, commits, code, wiki edits, issues, and other contributions 42 | that are not aligned to this Code of Conduct, or to ban temporarily or 43 | permanently any contributor for other behaviors that they deem inappropriate, 44 | threatening, offensive, or harmful. 45 | 46 | ## Scope 47 | 48 | This Code of Conduct applies both within project spaces and in public spaces 49 | when an individual is representing the project or its community. 
Examples of 50 | representing a project or community include using an official project e-mail 51 | address, posting via an official social media account, or acting as an appointed 52 | representative at an online or offline event. Representation of a project may be 53 | further defined and clarified by project maintainers. 54 | 55 | ## Enforcement 56 | 57 | Instances of abusive, harassing, or otherwise unacceptable behavior may be 58 | reported by contacting the project team at tylerneely@gmail.com. All 59 | complaints will be reviewed and investigated and will result in a response that 60 | is deemed necessary and appropriate to the circumstances. The project team is 61 | obligated to maintain confidentiality with regard to the reporter of an incident. 62 | Further details of specific enforcement policies may be posted separately. 63 | 64 | Project maintainers who do not follow or enforce the Code of Conduct in good 65 | faith may face temporary or permanent repercussions as determined by other 66 | members of the project's leadership. 67 | 68 | ## Attribution 69 | 70 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4, 71 | available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html 72 | 73 | [homepage]: https://www.contributor-covenant.org 74 | 75 | -------------------------------------------------------------------------------- /examples/bench.rs: -------------------------------------------------------------------------------- 1 | use std::path::Path; 2 | use std::sync::Barrier; 3 | use std::thread::scope; 4 | use std::time::{Duration, Instant}; 5 | use std::{fs, io}; 6 | 7 | use num_format::{Locale, ToFormattedString}; 8 | 9 | use sled::{Config, Db as SledDb}; 10 | 11 | type Db = SledDb<1024>; 12 | 13 | const N_WRITES_PER_THREAD: u32 = 4 * 1024 * 1024; 14 | const MAX_CONCURRENCY: u32 = 4; 15 | const CONCURRENCY: &[usize] = &[/*1, 2, 4,*/ MAX_CONCURRENCY as _]; 16 | const BYTES_PER_ITEM: u32 = 8; 17 | 18 | trait Databench: Clone + Send { 19 | type READ: AsRef<[u8]>; 20 | const NAME: &'static str; 21 | const PATH: &'static str; 22 | fn open() -> Self; 23 | fn remove_generic(&self, key: &[u8]); 24 | fn insert_generic(&self, key: &[u8], value: &[u8]); 25 | fn get_generic(&self, key: &[u8]) -> Option; 26 | fn flush_generic(&self); 27 | fn print_stats(&self); 28 | } 29 | 30 | impl Databench for Db { 31 | type READ = sled::InlineArray; 32 | 33 | const NAME: &'static str = "sled 1.0.0-alpha"; 34 | const PATH: &'static str = "timing_test.sled-new"; 35 | 36 | fn open() -> Self { 37 | sled::Config { 38 | path: Self::PATH.into(), 39 | zstd_compression_level: 3, 40 | cache_capacity_bytes: 1024 * 1024 * 1024, 41 | entry_cache_percent: 20, 42 | flush_every_ms: Some(200), 43 | ..Config::default() 44 | } 45 | .open() 46 | .unwrap() 47 | } 48 | 49 | fn insert_generic(&self, key: &[u8], value: &[u8]) { 50 | self.insert(key, value).unwrap(); 51 | } 52 | fn remove_generic(&self, key: &[u8]) { 53 | self.remove(key).unwrap(); 54 | } 55 | fn get_generic(&self, key: &[u8]) -> Option { 56 | self.get(key).unwrap() 57 | } 58 | fn flush_generic(&self) { 59 | self.flush().unwrap(); 60 | } 61 | fn print_stats(&self) { 62 | dbg!(self.stats()); 63 | } 64 | } 65 | 66 | /* 67 | impl Databench for old_sled::Db { 68 | type READ = old_sled::IVec; 69 | 70 | const NAME: &'static str = "sled 0.34.7"; 71 | const PATH: &'static str = "timing_test.sled-old"; 72 | 73 | fn open() -> Self { 74 | old_sled::open(Self::PATH).unwrap() 75 | } 76 | fn insert_generic(&self, key: 
&[u8], value: &[u8]) { 77 | self.insert(key, value).unwrap(); 78 | } 79 | fn get_generic(&self, key: &[u8]) -> Option { 80 | self.get(key).unwrap() 81 | } 82 | fn flush_generic(&self) { 83 | self.flush().unwrap(); 84 | } 85 | } 86 | */ 87 | 88 | /* 89 | impl Databench for Arc { 90 | type READ = Vec; 91 | 92 | const NAME: &'static str = "rocksdb 0.21.0"; 93 | const PATH: &'static str = "timing_test.rocksdb"; 94 | 95 | fn open() -> Self { 96 | Arc::new(rocksdb::DB::open_default(Self::PATH).unwrap()) 97 | } 98 | fn insert_generic(&self, key: &[u8], value: &[u8]) { 99 | self.put(key, value).unwrap(); 100 | } 101 | fn get_generic(&self, key: &[u8]) -> Option { 102 | self.get(key).unwrap() 103 | } 104 | fn flush_generic(&self) { 105 | self.flush().unwrap(); 106 | } 107 | } 108 | */ 109 | 110 | /* 111 | struct Lmdb { 112 | env: heed::Env, 113 | db: heed::Database< 114 | heed::types::UnalignedSlice, 115 | heed::types::UnalignedSlice, 116 | >, 117 | } 118 | 119 | impl Clone for Lmdb { 120 | fn clone(&self) -> Lmdb { 121 | Lmdb { env: self.env.clone(), db: self.db.clone() } 122 | } 123 | } 124 | 125 | impl Databench for Lmdb { 126 | type READ = Vec; 127 | 128 | const NAME: &'static str = "lmdb"; 129 | const PATH: &'static str = "timing_test.lmdb"; 130 | 131 | fn open() -> Self { 132 | let _ = std::fs::create_dir_all(Self::PATH); 133 | let env = heed::EnvOpenOptions::new() 134 | .map_size(1024 * 1024 * 1024) 135 | .open(Self::PATH) 136 | .unwrap(); 137 | let db = env.create_database(None).unwrap(); 138 | Lmdb { env, db } 139 | } 140 | fn insert_generic(&self, key: &[u8], value: &[u8]) { 141 | let mut wtxn = self.env.write_txn().unwrap(); 142 | self.db.put(&mut wtxn, key, value).unwrap(); 143 | wtxn.commit().unwrap(); 144 | } 145 | fn get_generic(&self, key: &[u8]) -> Option { 146 | let rtxn = self.env.read_txn().unwrap(); 147 | let ret = self.db.get(&rtxn, key).unwrap().map(Vec::from); 148 | rtxn.commit().unwrap(); 149 | ret 150 | } 151 | fn flush_generic(&self) { 152 | // NOOP 153 | } 154 | } 155 | */ 156 | 157 | /* 158 | struct Sqlite { 159 | connection: rusqlite::Connection, 160 | } 161 | 162 | impl Clone for Sqlite { 163 | fn clone(&self) -> Sqlite { 164 | Sqlite { connection: rusqlite::Connection::open(Self::PATH).unwrap() } 165 | } 166 | } 167 | 168 | impl Databench for Sqlite { 169 | type READ = Vec; 170 | 171 | const NAME: &'static str = "sqlite"; 172 | const PATH: &'static str = "timing_test.sqlite"; 173 | 174 | fn open() -> Self { 175 | let connection = rusqlite::Connection::open(Self::PATH).unwrap(); 176 | connection 177 | .execute( 178 | "create table if not exists bench ( 179 | key integer primary key, 180 | val integer not null 181 | )", 182 | [], 183 | ) 184 | .unwrap(); 185 | Sqlite { connection } 186 | } 187 | fn insert_generic(&self, key: &[u8], value: &[u8]) { 188 | loop { 189 | let res = self.connection.execute( 190 | "insert or ignore into bench (key, val) values (?1, ?2)", 191 | [ 192 | format!("{}", u32::from_be_bytes(key.try_into().unwrap())), 193 | format!( 194 | "{}", 195 | u32::from_be_bytes(value.try_into().unwrap()) 196 | ), 197 | ], 198 | ); 199 | if res.is_ok() { 200 | break; 201 | } 202 | } 203 | } 204 | fn get_generic(&self, key: &[u8]) -> Option { 205 | let mut stmt = self 206 | .connection 207 | .prepare("SELECT b.val from bench b WHERE key = ?1") 208 | .unwrap(); 209 | let mut rows = 210 | stmt.query([u32::from_be_bytes(key.try_into().unwrap())]).unwrap(); 211 | 212 | let value = rows.next().unwrap()?; 213 | value.get(0).ok() 214 | } 215 | fn flush_generic(&self) { 
216 | // NOOP 217 | } 218 | } 219 | */ 220 | 221 | fn allocated() -> usize { 222 | #[cfg(feature = "testing-count-allocator")] 223 | { 224 | return sled::alloc::allocated(); 225 | } 226 | 0 227 | } 228 | 229 | fn freed() -> usize { 230 | #[cfg(feature = "testing-count-allocator")] 231 | { 232 | return sled::alloc::freed(); 233 | } 234 | 0 235 | } 236 | 237 | fn resident() -> usize { 238 | #[cfg(feature = "testing-count-allocator")] 239 | { 240 | return sled::alloc::resident(); 241 | } 242 | 0 243 | } 244 | 245 | fn inserts(store: &D) -> Vec { 246 | println!("{} inserts", D::NAME); 247 | let mut i = 0_u32; 248 | 249 | let factory = move || { 250 | i += 1; 251 | (store.clone(), i - 1) 252 | }; 253 | 254 | let f = |state: (D, u32)| { 255 | let (store, offset) = state; 256 | let start = N_WRITES_PER_THREAD * offset; 257 | let end = N_WRITES_PER_THREAD * (offset + 1); 258 | for i in start..end { 259 | let k: &[u8] = &i.to_be_bytes(); 260 | store.insert_generic(k, k); 261 | } 262 | }; 263 | 264 | let mut ret = vec![]; 265 | 266 | for concurrency in CONCURRENCY { 267 | let insert_elapsed = 268 | execute_lockstep_concurrent(factory, f, *concurrency); 269 | 270 | let flush_timer = Instant::now(); 271 | store.flush_generic(); 272 | 273 | let wps = (N_WRITES_PER_THREAD * *concurrency as u32) as u64 274 | * 1_000_000_u64 275 | / u64::try_from(insert_elapsed.as_micros().max(1)) 276 | .unwrap_or(u64::MAX); 277 | 278 | ret.push(InsertStats { 279 | thread_count: *concurrency, 280 | inserts_per_second: wps, 281 | }); 282 | 283 | println!( 284 | "{} inserts/s with {concurrency} threads over {:?}, then {:?} to flush {}", 285 | wps.to_formatted_string(&Locale::en), 286 | insert_elapsed, 287 | flush_timer.elapsed(), 288 | D::NAME, 289 | ); 290 | } 291 | 292 | ret 293 | } 294 | 295 | fn removes(store: &D) -> Vec { 296 | println!("{} removals", D::NAME); 297 | let mut i = 0_u32; 298 | 299 | let factory = move || { 300 | i += 1; 301 | (store.clone(), i - 1) 302 | }; 303 | 304 | let f = |state: (D, u32)| { 305 | let (store, offset) = state; 306 | let start = N_WRITES_PER_THREAD * offset; 307 | let end = N_WRITES_PER_THREAD * (offset + 1); 308 | for i in start..end { 309 | let k: &[u8] = &i.to_be_bytes(); 310 | store.remove_generic(k); 311 | } 312 | }; 313 | 314 | let mut ret = vec![]; 315 | 316 | for concurrency in CONCURRENCY { 317 | let remove_elapsed = 318 | execute_lockstep_concurrent(factory, f, *concurrency); 319 | 320 | let flush_timer = Instant::now(); 321 | store.flush_generic(); 322 | 323 | let wps = (N_WRITES_PER_THREAD * *concurrency as u32) as u64 324 | * 1_000_000_u64 325 | / u64::try_from(remove_elapsed.as_micros().max(1)) 326 | .unwrap_or(u64::MAX); 327 | 328 | ret.push(RemoveStats { 329 | thread_count: *concurrency, 330 | removes_per_second: wps, 331 | }); 332 | 333 | println!( 334 | "{} removes/s with {concurrency} threads over {:?}, then {:?} to flush {}", 335 | wps.to_formatted_string(&Locale::en), 336 | remove_elapsed, 337 | flush_timer.elapsed(), 338 | D::NAME, 339 | ); 340 | } 341 | 342 | ret 343 | } 344 | 345 | fn gets(store: &D) -> Vec { 346 | println!("{} reads", D::NAME); 347 | 348 | let factory = || store.clone(); 349 | 350 | let f = |store: D| { 351 | let start = 0; 352 | let end = N_WRITES_PER_THREAD * MAX_CONCURRENCY; 353 | for i in start..end { 354 | let k: &[u8] = &i.to_be_bytes(); 355 | store.get_generic(k); 356 | } 357 | }; 358 | 359 | let mut ret = vec![]; 360 | 361 | for concurrency in CONCURRENCY { 362 | let get_stone_elapsed = 363 | execute_lockstep_concurrent(factory, f, 
*concurrency); 364 | 365 | let rps = (N_WRITES_PER_THREAD * MAX_CONCURRENCY * *concurrency as u32) 366 | as u64 367 | * 1_000_000_u64 368 | / u64::try_from(get_stone_elapsed.as_micros().max(1)) 369 | .unwrap_or(u64::MAX); 370 | 371 | ret.push(GetStats { thread_count: *concurrency, gets_per_second: rps }); 372 | 373 | println!( 374 | "{} gets/s with concurrency of {concurrency}, {:?} total reads {}", 375 | rps.to_formatted_string(&Locale::en), 376 | get_stone_elapsed, 377 | D::NAME 378 | ); 379 | } 380 | ret 381 | } 382 | 383 | fn execute_lockstep_concurrent< 384 | State: Send, 385 | Factory: FnMut() -> State, 386 | F: Sync + Fn(State), 387 | >( 388 | mut factory: Factory, 389 | f: F, 390 | concurrency: usize, 391 | ) -> Duration { 392 | let barrier = &Barrier::new(concurrency + 1); 393 | let f = &f; 394 | 395 | scope(|s| { 396 | let mut threads = vec![]; 397 | 398 | for _ in 0..concurrency { 399 | let state = factory(); 400 | 401 | let thread = s.spawn(move || { 402 | barrier.wait(); 403 | f(state); 404 | }); 405 | 406 | threads.push(thread); 407 | } 408 | 409 | barrier.wait(); 410 | let get_stone = Instant::now(); 411 | 412 | for thread in threads.into_iter() { 413 | thread.join().unwrap(); 414 | } 415 | 416 | get_stone.elapsed() 417 | }) 418 | } 419 | 420 | #[derive(Debug, Clone, Copy)] 421 | struct InsertStats { 422 | thread_count: usize, 423 | inserts_per_second: u64, 424 | } 425 | 426 | #[derive(Debug, Clone, Copy)] 427 | struct GetStats { 428 | thread_count: usize, 429 | gets_per_second: u64, 430 | } 431 | 432 | #[derive(Debug, Clone, Copy)] 433 | struct RemoveStats { 434 | thread_count: usize, 435 | removes_per_second: u64, 436 | } 437 | 438 | #[allow(unused)] 439 | #[derive(Debug, Clone)] 440 | struct Stats { 441 | post_insert_disk_space: u64, 442 | post_remove_disk_space: u64, 443 | allocated_memory: usize, 444 | freed_memory: usize, 445 | resident_memory: usize, 446 | insert_stats: Vec, 447 | get_stats: Vec, 448 | remove_stats: Vec, 449 | } 450 | 451 | impl Stats { 452 | fn print_report(&self) { 453 | println!( 454 | "bytes on disk after inserts: {}", 455 | self.post_insert_disk_space.to_formatted_string(&Locale::en) 456 | ); 457 | println!( 458 | "bytes on disk after removes: {}", 459 | self.post_remove_disk_space.to_formatted_string(&Locale::en) 460 | ); 461 | println!( 462 | "bytes in memory: {}", 463 | self.resident_memory.to_formatted_string(&Locale::en) 464 | ); 465 | for stats in &self.insert_stats { 466 | println!( 467 | "{} threads {} inserts per second", 468 | stats.thread_count, 469 | stats.inserts_per_second.to_formatted_string(&Locale::en) 470 | ); 471 | } 472 | for stats in &self.get_stats { 473 | println!( 474 | "{} threads {} gets per second", 475 | stats.thread_count, 476 | stats.gets_per_second.to_formatted_string(&Locale::en) 477 | ); 478 | } 479 | for stats in &self.remove_stats { 480 | println!( 481 | "{} threads {} removes per second", 482 | stats.thread_count, 483 | stats.removes_per_second.to_formatted_string(&Locale::en) 484 | ); 485 | } 486 | } 487 | } 488 | 489 | fn bench() -> Stats { 490 | let store = D::open(); 491 | 492 | let insert_stats = inserts(&store); 493 | 494 | let before_flush = Instant::now(); 495 | store.flush_generic(); 496 | println!("final flush took {:?} for {}", before_flush.elapsed(), D::NAME); 497 | 498 | let post_insert_disk_space = du(D::PATH.as_ref()).unwrap(); 499 | 500 | let get_stats = gets(&store); 501 | 502 | let remove_stats = removes(&store); 503 | 504 | store.print_stats(); 505 | 506 | Stats { 507 | 
post_insert_disk_space, 508 | post_remove_disk_space: du(D::PATH.as_ref()).unwrap(), 509 | allocated_memory: allocated(), 510 | freed_memory: freed(), 511 | resident_memory: resident(), 512 | insert_stats, 513 | get_stats, 514 | remove_stats, 515 | } 516 | } 517 | 518 | fn du(path: &Path) -> io::Result { 519 | fn recurse(mut dir: fs::ReadDir) -> io::Result { 520 | dir.try_fold(0, |acc, file| { 521 | let file = file?; 522 | let size = match file.metadata()? { 523 | data if data.is_dir() => recurse(fs::read_dir(file.path())?)?, 524 | data => data.len(), 525 | }; 526 | Ok(acc + size) 527 | }) 528 | } 529 | 530 | recurse(fs::read_dir(path)?) 531 | } 532 | 533 | fn main() { 534 | let _ = env_logger::try_init(); 535 | 536 | let new_stats = bench::(); 537 | 538 | println!( 539 | "raw data size: {}", 540 | (MAX_CONCURRENCY * N_WRITES_PER_THREAD * BYTES_PER_ITEM) 541 | .to_formatted_string(&Locale::en) 542 | ); 543 | println!("sled 1.0 space stats:"); 544 | new_stats.print_report(); 545 | 546 | /* 547 | let old_stats = bench::(); 548 | dbg!(old_stats); 549 | 550 | let new_sled_vs_old_sled_storage_ratio = 551 | new_stats.disk_space as f64 / old_stats.disk_space as f64; 552 | let new_sled_vs_old_sled_allocated_memory_ratio = 553 | new_stats.allocated_memory as f64 / old_stats.allocated_memory as f64; 554 | let new_sled_vs_old_sled_freed_memory_ratio = 555 | new_stats.freed_memory as f64 / old_stats.freed_memory as f64; 556 | let new_sled_vs_old_sled_resident_memory_ratio = 557 | new_stats.resident_memory as f64 / old_stats.resident_memory as f64; 558 | 559 | dbg!(new_sled_vs_old_sled_storage_ratio); 560 | dbg!(new_sled_vs_old_sled_allocated_memory_ratio); 561 | dbg!(new_sled_vs_old_sled_freed_memory_ratio); 562 | dbg!(new_sled_vs_old_sled_resident_memory_ratio); 563 | 564 | let rocksdb_stats = bench::>(); 565 | 566 | bench::(); 567 | 568 | bench::(); 569 | */ 570 | 571 | /* 572 | let new_sled_vs_rocksdb_storage_ratio = 573 | new_stats.disk_space as f64 / rocksdb_stats.disk_space as f64; 574 | let new_sled_vs_rocksdb_allocated_memory_ratio = 575 | new_stats.allocated_memory as f64 / rocksdb_stats.allocated_memory as f64; 576 | let new_sled_vs_rocksdb_freed_memory_ratio = 577 | new_stats.freed_memory as f64 / rocksdb_stats.freed_memory as f64; 578 | let new_sled_vs_rocksdb_resident_memory_ratio = 579 | new_stats.resident_memory as f64 / rocksdb_stats.resident_memory as f64; 580 | 581 | dbg!(new_sled_vs_rocksdb_storage_ratio); 582 | dbg!(new_sled_vs_rocksdb_allocated_memory_ratio); 583 | dbg!(new_sled_vs_rocksdb_freed_memory_ratio); 584 | dbg!(new_sled_vs_rocksdb_resident_memory_ratio); 585 | */ 586 | 587 | /* 588 | let scan = Instant::now(); 589 | let count = stone.iter().count(); 590 | assert_eq!(count as u64, N_WRITES_PER_THREAD); 591 | let scan_elapsed = scan.elapsed(); 592 | println!( 593 | "{} scanned items/s, total {:?}", 594 | (N_WRITES_PER_THREAD * 1_000_000) / u64::try_from(scan_elapsed.as_micros().max(1)).unwrap_or(u64::MAX), 595 | scan_elapsed 596 | ); 597 | */ 598 | 599 | /* 600 | let scan_rev = Instant::now(); 601 | let count = stone.range(..).rev().count(); 602 | assert_eq!(count as u64, N_WRITES_PER_THREAD); 603 | let scan_rev_elapsed = scan_rev.elapsed(); 604 | println!( 605 | "{} reverse-scanned items/s, total {:?}", 606 | (N_WRITES_PER_THREAD * 1_000_000) / u64::try_from(scan_rev_elapsed.as_micros().max(1)).unwrap_or(u64::MAX), 607 | scan_rev_elapsed 608 | ); 609 | */ 610 | } 611 | -------------------------------------------------------------------------------- /fuzz/.gitignore: 
-------------------------------------------------------------------------------- 1 | target 2 | corpus 3 | artifacts 4 | -------------------------------------------------------------------------------- /fuzz/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "bloodstone-fuzz" 3 | version = "0.0.0" 4 | authors = ["Automatically generated"] 5 | publish = false 6 | edition = "2018" 7 | 8 | [package.metadata] 9 | cargo-fuzz = true 10 | 11 | [dependencies.libfuzzer-sys] 12 | version = "0.4.0" 13 | features = ["arbitrary-derive"] 14 | 15 | [dependencies] 16 | arbitrary = { version = "1.0.3", features = ["derive"] } 17 | tempfile = "3.5.0" 18 | 19 | [dependencies.sled] 20 | path = ".." 21 | features = [] 22 | 23 | # Prevent this from interfering with workspaces 24 | [workspace] 25 | members = ["."] 26 | 27 | [[bin]] 28 | name = "fuzz_model" 29 | path = "fuzz_targets/fuzz_model.rs" 30 | test = false 31 | doc = false 32 | -------------------------------------------------------------------------------- /fuzz/fuzz_targets/fuzz_model.rs: -------------------------------------------------------------------------------- 1 | #![no_main] 2 | #[macro_use] 3 | extern crate libfuzzer_sys; 4 | extern crate arbitrary; 5 | extern crate sled; 6 | 7 | use arbitrary::Arbitrary; 8 | 9 | use sled::{Config, Db as SledDb, InlineArray}; 10 | 11 | type Db = SledDb<3>; 12 | 13 | const KEYSPACE: u64 = 128; 14 | 15 | #[derive(Debug)] 16 | enum Op { 17 | Get { key: InlineArray }, 18 | Insert { key: InlineArray, value: InlineArray }, 19 | Reboot, 20 | Remove { key: InlineArray }, 21 | Cas { key: InlineArray, old: Option, new: Option }, 22 | Range { start: InlineArray, end: InlineArray }, 23 | } 24 | 25 | fn keygen( 26 | u: &mut arbitrary::Unstructured<'_>, 27 | ) -> arbitrary::Result { 28 | let key_i: u64 = u.int_in_range(0..=KEYSPACE)?; 29 | Ok(key_i.to_be_bytes().as_ref().into()) 30 | } 31 | 32 | impl<'a> Arbitrary<'a> for Op { 33 | fn arbitrary( 34 | u: &mut arbitrary::Unstructured<'a>, 35 | ) -> arbitrary::Result { 36 | Ok(if u.ratio(1, 2)? { 37 | Op::Insert { key: keygen(u)?, value: keygen(u)? } 38 | } else if u.ratio(1, 2)? { 39 | Op::Get { key: keygen(u)? } 40 | } else if u.ratio(1, 2)? { 41 | Op::Reboot 42 | } else if u.ratio(1, 2)? { 43 | Op::Remove { key: keygen(u)? } 44 | } else if u.ratio(1, 2)? { 45 | Op::Cas { 46 | key: keygen(u)?, 47 | old: if u.ratio(1, 2)? { Some(keygen(u)?) } else { None }, 48 | new: if u.ratio(1, 2)? { Some(keygen(u)?) 
} else { None }, 49 | } 50 | } else { 51 | let start = u.int_in_range(0..=KEYSPACE)?; 52 | let end = (start + 1).max(u.int_in_range(0..=KEYSPACE)?); 53 | 54 | Op::Range { 55 | start: start.to_be_bytes().as_ref().into(), 56 | end: end.to_be_bytes().as_ref().into(), 57 | } 58 | }) 59 | } 60 | } 61 | 62 | fuzz_target!(|ops: Vec| { 63 | let tmp_dir = tempfile::TempDir::new().unwrap(); 64 | let tmp_path = tmp_dir.path().to_owned(); 65 | let config = Config::new().path(tmp_path); 66 | 67 | let mut tree: Db = config.open().unwrap(); 68 | let mut model = std::collections::BTreeMap::new(); 69 | 70 | for (_i, op) in ops.into_iter().enumerate() { 71 | match op { 72 | Op::Insert { key, value } => { 73 | assert_eq!( 74 | tree.insert(key.clone(), value.clone()).unwrap(), 75 | model.insert(key, value) 76 | ); 77 | } 78 | Op::Get { key } => { 79 | assert_eq!(tree.get(&key).unwrap(), model.get(&key).cloned()); 80 | } 81 | Op::Reboot => { 82 | drop(tree); 83 | tree = config.open().unwrap(); 84 | } 85 | Op::Remove { key } => { 86 | assert_eq!(tree.remove(&key).unwrap(), model.remove(&key)); 87 | } 88 | Op::Range { start, end } => { 89 | let mut model_iter = 90 | model.range::(&start..&end); 91 | let mut tree_iter = tree.range(start..end); 92 | 93 | for (k1, v1) in &mut model_iter { 94 | let (k2, v2) = tree_iter 95 | .next() 96 | .expect("None returned from iter when Some expected") 97 | .expect("IO issue encountered"); 98 | assert_eq!((k1, v1), (&k2, &v2)); 99 | } 100 | 101 | assert!(tree_iter.next().is_none()); 102 | } 103 | Op::Cas { key, old, new } => { 104 | let succ = if old == model.get(&key).cloned() { 105 | if let Some(n) = &new { 106 | model.insert(key.clone(), n.clone()); 107 | } else { 108 | model.remove(&key); 109 | } 110 | true 111 | } else { 112 | false 113 | }; 114 | 115 | let res = tree 116 | .compare_and_swap(key, old.as_ref(), new) 117 | .expect("hit IO error"); 118 | 119 | if succ { 120 | assert!(res.is_ok()); 121 | } else { 122 | assert!(res.is_err()); 123 | } 124 | } 125 | }; 126 | 127 | for (key, value) in &model { 128 | assert_eq!(tree.get(key).unwrap().unwrap(), value); 129 | } 130 | 131 | for kv_res in &tree { 132 | let (key, value) = kv_res.unwrap(); 133 | assert_eq!(model.get(&key), Some(&value)); 134 | } 135 | } 136 | 137 | let mut model_iter = model.iter(); 138 | let mut tree_iter = tree.iter(); 139 | 140 | for (k1, v1) in &mut model_iter { 141 | let (k2, v2) = tree_iter.next().unwrap().unwrap(); 142 | assert_eq!((k1, v1), (&k2, &v2)); 143 | } 144 | 145 | assert!(tree_iter.next().is_none()); 146 | }); 147 | -------------------------------------------------------------------------------- /scripts/cgtest.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | set -e 3 | 4 | cgdelete memory:sledTest || true 5 | cgcreate -g memory:sledTest 6 | echo 100M > /sys/fs/cgroup/memory/sledTest/memory.limit_in_bytes 7 | 8 | su $SUDO_USER -c 'cargo build --release --features=testing' 9 | 10 | for test in target/release/deps/test*; do 11 | if [[ -x $test ]] 12 | then 13 | echo running test: $test 14 | cgexec -g memory:sledTest $test --test-threads=1 15 | rm $test 16 | fi 17 | done 18 | -------------------------------------------------------------------------------- /scripts/cross_compile.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | set -e 3 | 4 | # checks sled's compatibility using several targets 5 | 6 | targets="wasm32-wasi wasm32-unknown-unknown aarch64-fuchsia aarch64-linux-android \ 7 | 
i686-linux-android i686-unknown-linux-gnu \ 8 | x86_64-linux-android x86_64-fuchsia \ 9 | mips-unknown-linux-musl aarch64-apple-ios" 10 | 11 | rustup update --no-self-update 12 | 13 | RUSTFLAGS="--cfg miri" cargo check 14 | 15 | rustup toolchain install 1.62 --no-self-update 16 | cargo clean 17 | rm Cargo.lock 18 | cargo +1.62 check 19 | 20 | for target in $targets; do 21 | echo "setting up $target..." 22 | rustup target add $target 23 | echo "checking $target..." 24 | cargo check --target $target 25 | done 26 | 27 | -------------------------------------------------------------------------------- /scripts/execution_explorer.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/gdb --command 2 | 3 | """ 4 | a simple python GDB script for running multithreaded 5 | programs in a way that is "deterministic enough" 6 | to tease out and replay interesting bugs. 7 | 8 | Tyler Neely 25 Sept 2017 9 | t@jujit.su 10 | 11 | references: 12 | https://sourceware.org/gdb/onlinedocs/gdb/All_002dStop-Mode.html 13 | https://sourceware.org/gdb/onlinedocs/gdb/Non_002dStop-Mode.html 14 | https://sourceware.org/gdb/onlinedocs/gdb/Threads-In-Python.html 15 | https://sourceware.org/gdb/onlinedocs/gdb/Events-In-Python.html 16 | https://blog.0x972.info/index.php?tag=gdb.py 17 | """ 18 | 19 | import gdb 20 | import random 21 | 22 | ############################################################################### 23 | # config # 24 | ############################################################################### 25 | # set this to a number for reproducing results or None to explore randomly 26 | seed = 156112673742 # None # 951931004895 27 | 28 | # set this to the number of valid threads in the program 29 | # {2, 3} assumes a main thread that waits on 2 workers. 30 | # {1, ... N} assumes all of the first N threads are to be explored 31 | threads_whitelist = {2, 3} 32 | 33 | # set this to the file of the binary to explore 34 | filename = "target/debug/binary" 35 | 36 | # set this to the place the threads should rendezvous before exploring 37 | entrypoint = "src/main.rs:8" 38 | 39 | # set this to after the threads are done 40 | exitpoint = "src/main.rs:12" 41 | 42 | # invariant unreachable points that should never be accessed 43 | unreachable = [ 44 | "panic_unwind::imp::panic" 45 | ] 46 | 47 | # set this to the locations you want to test interleavings for 48 | interesting = [ 49 | "src/main.rs:8", 50 | "src/main.rs:9" 51 | ] 52 | 53 | # uncomment this to output the specific commands issued to gdb 54 | gdb.execute("set trace-commands on") 55 | 56 | ############################################################################### 57 | ############################################################################### 58 | 59 | 60 | class UnreachableBreakpoint(gdb.Breakpoint): 61 | pass 62 | 63 | 64 | class DoneBreakpoint(gdb.Breakpoint): 65 | pass 66 | 67 | 68 | class InterestingBreakpoint(gdb.Breakpoint): 69 | pass 70 | 71 | 72 | class DeterministicExecutor: 73 | def __init__(self, seed=None): 74 | if seed: 75 | print("seeding with", seed) 76 | self.seed = seed 77 | random.seed(seed) 78 | else: 79 | # pick a random new seed if not provided with one 80 | self.reseed() 81 | 82 | gdb.execute("file " + filename) 83 | 84 | # non-stop is necessary to provide thread-specific 85 | # information when breakpoints are hit. 
86 | gdb.execute("set non-stop on") 87 | gdb.execute("set confirm off") 88 | 89 | self.ready = set() 90 | self.finished = set() 91 | 92 | def reseed(self): 93 | random.seed() 94 | self.seed = random.randrange(1e12) 95 | print("reseeding with", self.seed) 96 | random.seed(self.seed) 97 | 98 | def restart(self): 99 | # reset inner state 100 | self.ready = set() 101 | self.finished = set() 102 | 103 | # disconnect callbacks 104 | gdb.events.stop.disconnect(self.scheduler_callback) 105 | gdb.events.exited.disconnect(self.exit_callback) 106 | 107 | # nuke all breakpoints 108 | gdb.execute("d") 109 | 110 | # end execution 111 | gdb.execute("k") 112 | 113 | # pick new seed 114 | self.reseed() 115 | 116 | self.run() 117 | 118 | def rendezvous_callback(self, event): 119 | try: 120 | self.ready.add(event.inferior_thread.num) 121 | if len(self.ready) == len(threads_whitelist): 122 | self.run_schedule() 123 | except Exception as e: 124 | # this will be thrown if breakpoint is not a part of event, 125 | # like when the event was stopped for another reason. 126 | print(e) 127 | 128 | def run(self): 129 | gdb.execute("b " + entrypoint) 130 | 131 | gdb.events.stop.connect(self.rendezvous_callback) 132 | gdb.events.exited.connect(self.exit_callback) 133 | 134 | gdb.execute("r") 135 | 136 | def run_schedule(self): 137 | print("running schedule") 138 | gdb.execute("d") 139 | gdb.events.stop.disconnect(self.rendezvous_callback) 140 | gdb.events.stop.connect(self.scheduler_callback) 141 | 142 | for bp in interesting: 143 | InterestingBreakpoint(bp) 144 | 145 | for bp in unreachable: 146 | UnreachableBreakpoint(bp) 147 | 148 | DoneBreakpoint(exitpoint) 149 | 150 | self.pick() 151 | 152 | def pick(self): 153 | threads = self.runnable_threads() 154 | if not threads: 155 | print("restarting execution after running out of valid threads") 156 | self.restart() 157 | return 158 | 159 | thread = random.choice(threads) 160 | 161 | gdb.execute("t " + str(thread.num)) 162 | gdb.execute("c") 163 | 164 | def scheduler_callback(self, event): 165 | if not isinstance(event, gdb.BreakpointEvent): 166 | print("WTF sched callback got", event.__dict__) 167 | return 168 | 169 | if isinstance(event.breakpoint, DoneBreakpoint): 170 | self.finished.add(event.inferior_thread.num) 171 | elif isinstance(event.breakpoint, UnreachableBreakpoint): 172 | print("!" * 80) 173 | print("unreachable breakpoint triggered with seed", self.seed) 174 | print("!" * 80) 175 | gdb.events.exited.disconnect(self.exit_callback) 176 | gdb.execute("q") 177 | else: 178 | print("thread", event.inferior_thread.num, 179 | "hit breakpoint at", event.breakpoint.location) 180 | 181 | self.pick() 182 | 183 | def runnable_threads(self): 184 | threads = gdb.selected_inferior().threads() 185 | 186 | def f(it): 187 | return (it.is_valid() and not 188 | it.is_exited() and 189 | it.num in threads_whitelist and 190 | it.num not in self.finished) 191 | 192 | good_threads = [it for it in threads if f(it)] 193 | good_threads.sort(key=lambda it: it.num) 194 | 195 | return good_threads 196 | 197 | def exit_callback(self, event): 198 | try: 199 | if event.exit_code != 0: 200 | print("!" * 80) 201 | print("interesting exit with seed", self.seed) 202 | print("!" 
* 80) 203 | else: 204 | print("happy exit") 205 | self.restart() 206 | 207 | gdb.execute("q") 208 | except Exception as e: 209 | pass 210 | 211 | de = DeterministicExecutor(seed) 212 | de.run() 213 | -------------------------------------------------------------------------------- /scripts/instructions: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | # counts instructions for a standard workload 3 | set -e 4 | 5 | OUTFILE="cachegrind.stress2.`git describe --always --dirty`-`date +%s`" 6 | 7 | rm -rf default.sled || true 8 | 9 | cargo build \ 10 | --bin=stress2 \ 11 | --release 12 | 13 | 14 | # --tool=callgrind --dump-instr=yes --collect-jumps=yes --simulate-cache=yes \ 15 | # --callgrind-out-file="$OUTFILE" \ 16 | 17 | valgrind \ 18 | --tool=cachegrind \ 19 | --cachegrind-out-file="$OUTFILE" \ 20 | ./target/release/stress2 --total-ops=50000 --set-prop=1000000000000 --threads=1 21 | 22 | LAST=`ls -t cachegrind.stress2.* | sed -n 2p` 23 | 24 | echo "comparing $LAST with new $OUTFILE" 25 | 26 | echo "--------------------------------------------------------------------------------" 27 | echo "change since last run:" 28 | echo " Ir I1mr ILmr Dr D1mr DLmr Dw D1mw DLmw" 29 | echo "--------------------------------------------------------------------------------" 30 | cg_diff $LAST $OUTFILE | tail -1 31 | -------------------------------------------------------------------------------- /scripts/sanitizers.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -eo pipefail 3 | 4 | pushd benchmarks/stress2 5 | 6 | rustup toolchain install nightly 7 | rustup toolchain install nightly --component rust-src 8 | rustup update 9 | 10 | export SLED_LOCK_FREE_DELAY_INTENSITY=2000 11 | 12 | echo "msan" 13 | cargo clean 14 | export RUSTFLAGS="-Zsanitizer=memory -Zsanitizer-memory-track-origins" 15 | cargo +nightly build -Zbuild-std --target x86_64-unknown-linux-gnu 16 | sudo rm -rf default.sled 17 | sudo target/x86_64-unknown-linux-gnu/debug/stress2 --duration=30 --set-prop=100000000 --val-len=1000 --entries=100 --threads=100 18 | sudo target/x86_64-unknown-linux-gnu/debug/stress2 --duration=30 --entries=100 19 | sudo target/x86_64-unknown-linux-gnu/debug/stress2 --duration=30 20 | unset MSAN_OPTIONS 21 | 22 | echo "asan" 23 | cargo clean 24 | export RUSTFLAGS="-Z sanitizer=address" 25 | export ASAN_OPTIONS="detect_odr_violation=0" 26 | cargo +nightly build --features=lock_free_delays --target x86_64-unknown-linux-gnu 27 | sudo rm -rf default.sled 28 | sudo target/x86_64-unknown-linux-gnu/debug/stress2 --duration=60 29 | sudo target/x86_64-unknown-linux-gnu/debug/stress2 --duration=6 30 | unset ASAN_OPTIONS 31 | 32 | echo "lsan" 33 | cargo clean 34 | export RUSTFLAGS="-Z sanitizer=leak" 35 | cargo +nightly build --features=lock_free_delays --target x86_64-unknown-linux-gnu 36 | sudo rm -rf default.sled 37 | sudo target/x86_64-unknown-linux-gnu/debug/stress2 --duration=60 38 | sudo target/x86_64-unknown-linux-gnu/debug/stress2 --duration=6 39 | 40 | echo "tsan" 41 | cargo clean 42 | export RUSTFLAGS="-Z sanitizer=thread" 43 | export TSAN_OPTIONS=suppressions=../../tsan_suppressions.txt 44 | sudo rm -rf default.sled 45 | cargo +nightly run --features=lock_free_delays --target x86_64-unknown-linux-gnu -- --duration=60 46 | cargo +nightly run --features=lock_free_delays --target x86_64-unknown-linux-gnu -- --duration=6 47 | unset RUSTFLAGS 48 | unset TSAN_OPTIONS 49 | 
-------------------------------------------------------------------------------- /scripts/shufnice.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | while true; do 4 | PID=`pgrep $1` 5 | TIDS=`ls /proc/$PID/task` 6 | TID=`echo $TIDS | tr " " "\n" | shuf -n1` 7 | NICE=$((`shuf -i 0-39 -n 1` - 20)) 8 | echo "renicing $TID to $NICE" 9 | renice -n $NICE -p $TID 10 | done 11 | -------------------------------------------------------------------------------- /scripts/ubuntu_bench: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | sudo apt-get update 4 | sudo apt-get install htop dstat build-essential linux-tools-common linux-tools-generic linux-tools-`uname -r` 5 | curl https://sh.rustup.rs -sSf | sh 6 | source $HOME/.cargo/env 7 | 8 | cargo install flamegraph 9 | 10 | git clone https://github.com/spacejam/sled.git 11 | cd sled 12 | 13 | cores=$(grep -c ^processor /proc/cpuinfo) 14 | writers=(($cores / 5 + 1 )) 15 | readers=$(( ($cores / 5 + 1) * 4 )) 16 | 17 | cargo build --release --bin=stress2 --features=stress 18 | 19 | # we use sudo here to get access to symbols 20 | pushd benchmarks/stress2 21 | cargo flamegraph --release -- --get=$readers --set=$writers 22 | -------------------------------------------------------------------------------- /src/alloc.rs: -------------------------------------------------------------------------------- 1 | #[cfg(any( 2 | feature = "testing-shred-allocator", 3 | feature = "testing-count-allocator" 4 | ))] 5 | pub use alloc::*; 6 | 7 | // the memshred feature causes all allocated and deallocated 8 | // memory to be set to a specific non-zero value of 0xa1 for 9 | // uninitialized allocations and 0xde for deallocated memory, 10 | // in the hope that it will cause memory errors to surface 11 | // more quickly. 
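// As a rough illustration of why these fill patterns are useful: a
// use-after-free that reads recently freed memory will tend to observe 0xde
// bytes (a freed u64 typically reads back as 0xdededededededede), while a read
// of allocated-but-uninitialized memory will observe 0xa1 bytes, so both kinds
// of bug show up as immediately recognizable values in assertions, logs, and
// core dumps rather than as plausible-looking garbage.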
12 | 13 | #[cfg(feature = "testing-shred-allocator")] 14 | mod alloc { 15 | use std::alloc::{Layout, System}; 16 | 17 | #[global_allocator] 18 | static ALLOCATOR: ShredAllocator = ShredAllocator; 19 | 20 | #[derive(Default, Debug, Clone, Copy)] 21 | struct ShredAllocator; 22 | 23 | unsafe impl std::alloc::GlobalAlloc for ShredAllocator { 24 | unsafe fn alloc(&self, layout: Layout) -> *mut u8 { 25 | let ret = System.alloc(layout); 26 | assert_ne!(ret, std::ptr::null_mut()); 27 | std::ptr::write_bytes(ret, 0xa1, layout.size()); 28 | ret 29 | } 30 | 31 | unsafe fn dealloc(&self, ptr: *mut u8, layout: Layout) { 32 | std::ptr::write_bytes(ptr, 0xde, layout.size()); 33 | System.dealloc(ptr, layout) 34 | } 35 | } 36 | } 37 | 38 | #[cfg(feature = "testing-count-allocator")] 39 | mod alloc { 40 | use std::alloc::{Layout, System}; 41 | 42 | #[global_allocator] 43 | static ALLOCATOR: CountingAllocator = CountingAllocator; 44 | 45 | static ALLOCATED: AtomicUsize = AtomicUsize::new(0); 46 | static FREED: AtomicUsize = AtomicUsize::new(0); 47 | static RESIDENT: AtomicUsize = AtomicUsize::new(0); 48 | 49 | fn allocated() -> usize { 50 | ALLOCATED.swap(0, Ordering::Relaxed) 51 | } 52 | 53 | fn freed() -> usize { 54 | FREED.swap(0, Ordering::Relaxed) 55 | } 56 | 57 | fn resident() -> usize { 58 | RESIDENT.load(Ordering::Relaxed) 59 | } 60 | 61 | #[derive(Default, Debug, Clone, Copy)] 62 | struct CountingAllocator; 63 | 64 | unsafe impl std::alloc::GlobalAlloc for CountingAllocator { 65 | unsafe fn alloc(&self, layout: Layout) -> *mut u8 { 66 | let ret = System.alloc(layout); 67 | assert_ne!(ret, std::ptr::null_mut()); 68 | ALLOCATED.fetch_add(layout.size(), Ordering::Relaxed); 69 | RESIDENT.fetch_add(layout.size(), Ordering::Relaxed); 70 | std::ptr::write_bytes(ret, 0xa1, layout.size()); 71 | ret 72 | } 73 | 74 | unsafe fn dealloc(&self, ptr: *mut u8, layout: Layout) { 75 | std::ptr::write_bytes(ptr, 0xde, layout.size()); 76 | FREED.fetch_add(layout.size(), Ordering::Relaxed); 77 | RESIDENT.fetch_sub(layout.size(), Ordering::Relaxed); 78 | System.dealloc(ptr, layout) 79 | } 80 | } 81 | } 82 | -------------------------------------------------------------------------------- /src/block_checker.rs: -------------------------------------------------------------------------------- 1 | use std::collections::BTreeMap; 2 | use std::panic::Location; 3 | use std::sync::atomic::{AtomicU64, Ordering}; 4 | use std::sync::{LazyLock, Mutex}; 5 | 6 | static COUNTER: AtomicU64 = AtomicU64::new(0); 7 | static CHECK_INS: LazyLock = LazyLock::new(|| { 8 | std::thread::spawn(move || { 9 | let mut last_top_10 = Default::default(); 10 | loop { 11 | std::thread::sleep(std::time::Duration::from_secs(5)); 12 | last_top_10 = CHECK_INS.report(last_top_10); 13 | } 14 | }); 15 | 16 | BlockChecker::default() 17 | }); 18 | 19 | type LocationMap = BTreeMap>; 20 | 21 | #[derive(Default)] 22 | pub(crate) struct BlockChecker { 23 | state: Mutex, 24 | } 25 | 26 | impl BlockChecker { 27 | fn report(&self, last_top_10: LocationMap) -> LocationMap { 28 | let state = self.state.lock().unwrap(); 29 | println!("top 10 longest blocking sections:"); 30 | 31 | let top_10: LocationMap = 32 | state.iter().take(10).map(|(k, v)| (*k, *v)).collect(); 33 | 34 | for (id, location) in &top_10 { 35 | if last_top_10.contains_key(id) { 36 | println!("id: {}, location: {:?}", id, location); 37 | } 38 | } 39 | 40 | top_10 41 | } 42 | 43 | fn check_in(&self, location: &'static Location) -> BlockGuard { 44 | let next_id = COUNTER.fetch_add(1, Ordering::Relaxed); 45 | 
let mut state = self.state.lock().unwrap(); 46 | state.insert(next_id, location); 47 | BlockGuard { id: next_id } 48 | } 49 | 50 | fn check_out(&self, id: u64) { 51 | let mut state = self.state.lock().unwrap(); 52 | state.remove(&id); 53 | } 54 | } 55 | 56 | pub(crate) struct BlockGuard { 57 | id: u64, 58 | } 59 | 60 | impl Drop for BlockGuard { 61 | fn drop(&mut self) { 62 | CHECK_INS.check_out(self.id) 63 | } 64 | } 65 | 66 | #[track_caller] 67 | pub(crate) fn track_blocks() -> BlockGuard { 68 | let caller = Location::caller(); 69 | CHECK_INS.check_in(caller) 70 | } 71 | -------------------------------------------------------------------------------- /src/config.rs: -------------------------------------------------------------------------------- 1 | use std::io; 2 | use std::path::{Path, PathBuf}; 3 | use std::sync::Arc; 4 | 5 | use fault_injection::{annotate, fallible}; 6 | use tempdir::TempDir; 7 | 8 | use crate::Db; 9 | 10 | macro_rules! builder { 11 | ($(($name:ident, $t:ty, $desc:expr)),*) => { 12 | $( 13 | #[doc=$desc] 14 | pub fn $name(mut self, to: $t) -> Self { 15 | self.$name = to; 16 | self 17 | } 18 | )* 19 | } 20 | } 21 | 22 | #[derive(Debug, Clone)] 23 | pub struct Config { 24 | /// The base directory for storing the database. 25 | pub path: PathBuf, 26 | /// Cache size in **bytes**. Default is 512mb. 27 | pub cache_capacity_bytes: usize, 28 | /// The percentage of the cache that is dedicated to the 29 | /// scan-resistant entry cache. 30 | pub entry_cache_percent: u8, 31 | /// Start a background thread that flushes data to disk 32 | /// every few milliseconds. Defaults to every 200ms. 33 | pub flush_every_ms: Option, 34 | /// The zstd compression level to use when writing data to disk. Defaults to 3. 35 | pub zstd_compression_level: i32, 36 | /// This is only set to `Some` for objects created via 37 | /// `Config::tmp`, and will remove the storage directory 38 | /// when the final Arc drops. 39 | pub tempdir_deleter: Option>, 40 | /// A float between 0.0 and 1.0 that controls how much fragmentation can 41 | /// exist in a file before GC attempts to recompact it. 42 | pub target_heap_file_fill_ratio: f32, 43 | /// Values larger than this configurable will be stored as separate blob 44 | pub max_inline_value_threshold: usize, 45 | } 46 | 47 | impl Default for Config { 48 | fn default() -> Config { 49 | Config { 50 | path: "bloodstone.default".into(), 51 | flush_every_ms: Some(200), 52 | cache_capacity_bytes: 512 * 1024 * 1024, 53 | entry_cache_percent: 20, 54 | zstd_compression_level: 3, 55 | tempdir_deleter: None, 56 | target_heap_file_fill_ratio: 0.9, 57 | max_inline_value_threshold: 4096, 58 | } 59 | } 60 | } 61 | 62 | impl Config { 63 | /// Returns a default `Config` 64 | pub fn new() -> Config { 65 | Config::default() 66 | } 67 | 68 | /// Returns a config with the `path` initialized to a system 69 | /// temporary directory that will be deleted when this `Config` 70 | /// is dropped. 71 | pub fn tmp() -> io::Result { 72 | let tempdir = fallible!(tempdir::TempDir::new("sled_tmp")); 73 | 74 | Ok(Config { 75 | path: tempdir.path().into(), 76 | tempdir_deleter: Some(Arc::new(tempdir)), 77 | ..Config::default() 78 | }) 79 | } 80 | 81 | /// Set the path of the database (builder). 82 | pub fn path>(mut self, path: P) -> Config { 83 | self.path = path.as_ref().to_path_buf(); 84 | self 85 | } 86 | 87 | builder!( 88 | (flush_every_ms, Option, "Start a background thread that flushes data to disk every few milliseconds. 
Defaults to every 200ms."), 89 | (cache_capacity_bytes, usize, "Cache size in **bytes**. Default is 512mb."), 90 | (entry_cache_percent, u8, "The percentage of the cache that is dedicated to the scan-resistant entry cache."), 91 | (zstd_compression_level, i32, "The zstd compression level to use when writing data to disk. Defaults to 3."), 92 | (target_heap_file_fill_ratio, f32, "A float between 0.0 and 1.0 that controls how much fragmentation can exist in a file before GC attempts to recompact it."), 93 | (max_inline_value_threshold, usize, "Values larger than this configurable will be stored as separate blob") 94 | ); 95 | 96 | pub fn open( 97 | &self, 98 | ) -> io::Result> { 99 | if LEAF_FANOUT < 3 { 100 | return Err(annotate!(io::Error::new( 101 | io::ErrorKind::Unsupported, 102 | "Db's LEAF_FANOUT const generic must be 3 or greater." 103 | ))); 104 | } 105 | Db::open_with_config(self) 106 | } 107 | } 108 | -------------------------------------------------------------------------------- /src/event_verifier.rs: -------------------------------------------------------------------------------- 1 | use std::collections::BTreeMap; 2 | use std::sync::Mutex; 3 | 4 | use crate::{FlushEpoch, ObjectId}; 5 | 6 | #[derive(Debug, Clone, Copy, PartialEq, Eq)] 7 | pub(crate) enum State { 8 | Unallocated, 9 | Dirty, 10 | CooperativelySerialized, 11 | AddedToWriteBatch, 12 | Flushed, 13 | CleanPagedIn, 14 | PagedOut, 15 | } 16 | 17 | impl State { 18 | fn can_transition_within_epoch_to(&self, next: State) -> bool { 19 | match (self, next) { 20 | (State::Flushed, State::PagedOut) => true, 21 | (State::Flushed, _) => false, 22 | (State::AddedToWriteBatch, State::Flushed) => true, 23 | (State::AddedToWriteBatch, _) => false, 24 | (State::CleanPagedIn, State::AddedToWriteBatch) => false, 25 | (State::CleanPagedIn, State::Flushed) => false, 26 | (State::Dirty, State::AddedToWriteBatch) => true, 27 | (State::CooperativelySerialized, State::AddedToWriteBatch) => true, 28 | (State::CooperativelySerialized, _) => false, 29 | (State::Unallocated, State::AddedToWriteBatch) => true, 30 | (State::Unallocated, _) => false, 31 | (State::Dirty, State::Dirty) => true, 32 | (State::Dirty, State::CooperativelySerialized) => true, 33 | (State::Dirty, State::Unallocated) => true, 34 | (State::Dirty, _) => false, 35 | (State::CleanPagedIn, State::Dirty) => true, 36 | (State::CleanPagedIn, State::PagedOut) => true, 37 | (State::CleanPagedIn, State::CleanPagedIn) => true, 38 | (State::CleanPagedIn, State::Unallocated) => true, 39 | (State::CleanPagedIn, State::CooperativelySerialized) => true, 40 | (State::PagedOut, State::CleanPagedIn) => true, 41 | (State::PagedOut, _) => false, 42 | } 43 | } 44 | 45 | fn needs_flush(&self) -> bool { 46 | match self { 47 | State::CleanPagedIn => false, 48 | State::Flushed => false, 49 | State::PagedOut => false, 50 | _ => true, 51 | } 52 | } 53 | } 54 | 55 | #[derive(Debug, Default)] 56 | pub(crate) struct EventVerifier { 57 | flush_model: 58 | Mutex>>, 59 | } 60 | 61 | impl Drop for EventVerifier { 62 | fn drop(&mut self) { 63 | // assert that nothing is currently Dirty 64 | let flush_model = self.flush_model.lock().unwrap(); 65 | for ((oid, _epoch), history) in flush_model.iter() { 66 | if let Some((last_state, _at)) = history.last() { 67 | assert_ne!( 68 | *last_state, 69 | State::Dirty, 70 | "{oid:?} is Dirty when system shutting down" 71 | ); 72 | } 73 | } 74 | } 75 | } 76 | 77 | impl EventVerifier { 78 | pub(crate) fn mark( 79 | &self, 80 | object_id: ObjectId, 81 | epoch: FlushEpoch, 82 
| state: State, 83 | at: &'static str, 84 | ) { 85 | if matches!(state, State::PagedOut) { 86 | let dirty_epochs = self.dirty_epochs(object_id); 87 | if !dirty_epochs.is_empty() { 88 | println!("{object_id:?} was paged out while having dirty epochs {dirty_epochs:?}"); 89 | self.print_debug_history_for_object(object_id); 90 | println!("{state:?} {epoch:?} {at}"); 91 | println!("invalid object state transition"); 92 | std::process::abort(); 93 | } 94 | } 95 | 96 | let mut flush_model = self.flush_model.lock().unwrap(); 97 | let history = flush_model.entry((object_id, epoch)).or_default(); 98 | 99 | if let Some((last_state, _at)) = history.last() { 100 | if !last_state.can_transition_within_epoch_to(state) { 101 | println!( 102 | "object_id {object_id:?} performed \ 103 | illegal state transition from {last_state:?} \ 104 | to {state:?} at {at} in epoch {epoch:?}." 105 | ); 106 | 107 | println!("history:"); 108 | history.push((state, at)); 109 | 110 | let active_epochs = flush_model.range( 111 | (object_id, FlushEpoch::MIN)..=(object_id, FlushEpoch::MAX), 112 | ); 113 | for ((_oid, epoch), history) in active_epochs { 114 | for (last_state, at) in history { 115 | println!("{last_state:?} {epoch:?} {at}"); 116 | } 117 | } 118 | 119 | println!("invalid object state transition"); 120 | 121 | std::process::abort(); 122 | } 123 | } 124 | history.push((state, at)); 125 | } 126 | 127 | /// Returns the FlushEpochs for which this ObjectId has unflushed 128 | /// dirty data for. 129 | fn dirty_epochs(&self, object_id: ObjectId) -> Vec { 130 | let mut dirty_epochs = vec![]; 131 | let flush_model = self.flush_model.lock().unwrap(); 132 | 133 | let active_epochs = flush_model 134 | .range((object_id, FlushEpoch::MIN)..=(object_id, FlushEpoch::MAX)); 135 | 136 | for ((_oid, epoch), history) in active_epochs { 137 | let (last_state, _at) = history.last().unwrap(); 138 | if last_state.needs_flush() { 139 | dirty_epochs.push(*epoch); 140 | } 141 | } 142 | 143 | dirty_epochs 144 | } 145 | 146 | pub(crate) fn print_debug_history_for_object(&self, object_id: ObjectId) { 147 | let flush_model = self.flush_model.lock().unwrap(); 148 | println!("history for object {:?}:", object_id); 149 | let active_epochs = flush_model 150 | .range((object_id, FlushEpoch::MIN)..=(object_id, FlushEpoch::MAX)); 151 | for ((_oid, epoch), history) in active_epochs { 152 | for (last_state, at) in history { 153 | println!("{last_state:?} {epoch:?} {at}"); 154 | } 155 | } 156 | } 157 | } 158 | -------------------------------------------------------------------------------- /src/flush_epoch.rs: -------------------------------------------------------------------------------- 1 | use std::num::NonZeroU64; 2 | use std::sync::atomic::{AtomicPtr, AtomicU64, Ordering}; 3 | use std::sync::{Arc, Condvar, Mutex}; 4 | 5 | const SEAL_BIT: u64 = 1 << 63; 6 | const SEAL_MASK: u64 = u64::MAX - SEAL_BIT; 7 | const MIN_EPOCH: u64 = 2; 8 | 9 | #[derive( 10 | Debug, 11 | Clone, 12 | Copy, 13 | serde::Serialize, 14 | serde::Deserialize, 15 | PartialOrd, 16 | Ord, 17 | PartialEq, 18 | Eq, 19 | Hash, 20 | )] 21 | pub struct FlushEpoch(NonZeroU64); 22 | 23 | impl FlushEpoch { 24 | pub const MIN: FlushEpoch = FlushEpoch(NonZeroU64::MIN); 25 | #[allow(unused)] 26 | pub const MAX: FlushEpoch = FlushEpoch(NonZeroU64::MAX); 27 | 28 | pub fn increment(&self) -> FlushEpoch { 29 | FlushEpoch(NonZeroU64::new(self.0.get() + 1).unwrap()) 30 | } 31 | 32 | pub fn get(&self) -> u64 { 33 | self.0.get() 34 | } 35 | } 36 | 37 | impl concurrent_map::Minimum for FlushEpoch { 38 
const MIN: FlushEpoch = FlushEpoch::MIN; 39 | } 40 | 41 | #[derive(Debug)] 42 | pub(crate) struct FlushInvariants { 43 | max_flushed_epoch: AtomicU64, 44 | max_flushing_epoch: AtomicU64, 45 | } 46 | 47 | impl Default for FlushInvariants { 48 | fn default() -> FlushInvariants { 49 | FlushInvariants { 50 | max_flushed_epoch: (MIN_EPOCH - 1).into(), 51 | max_flushing_epoch: (MIN_EPOCH - 1).into(), 52 | } 53 | } 54 | } 55 | 56 | impl FlushInvariants { 57 | pub(crate) fn mark_flushed_epoch(&self, epoch: FlushEpoch) { 58 | let last = self.max_flushed_epoch.swap(epoch.get(), Ordering::SeqCst); 59 | 60 | assert_eq!(last + 1, epoch.get()); 61 | } 62 | 63 | pub(crate) fn mark_flushing_epoch(&self, epoch: FlushEpoch) { 64 | let last = self.max_flushing_epoch.swap(epoch.get(), Ordering::SeqCst); 65 | 66 | assert_eq!(last + 1, epoch.get()); 67 | } 68 | } 69 | 70 | #[derive(Clone, Debug)] 71 | pub(crate) struct Completion { 72 | mu: Arc<Mutex<bool>>, 73 | cv: Arc<Condvar>, 74 | epoch: FlushEpoch, 75 | } 76 | 77 | impl Completion { 78 | pub fn epoch(&self) -> FlushEpoch { 79 | self.epoch 80 | } 81 | 82 | pub fn new(epoch: FlushEpoch) -> Completion { 83 | Completion { mu: Default::default(), cv: Default::default(), epoch } 84 | } 85 | 86 | pub fn wait_for_complete(self) -> FlushEpoch { 87 | let mut mu = self.mu.lock().unwrap(); 88 | while !*mu { 89 | mu = self.cv.wait(mu).unwrap(); 90 | } 91 | 92 | self.epoch 93 | } 94 | 95 | pub fn mark_complete(self) { 96 | self.mark_complete_inner(false); 97 | } 98 | 99 | fn mark_complete_inner(&self, previously_sealed: bool) { 100 | let mut mu = self.mu.lock().unwrap(); 101 | if !previously_sealed { 102 | // TODO reevaluate - assert!(!*mu); 103 | } 104 | log::trace!("marking epoch {:?} as complete", self.epoch); 105 | // it's possible for *mu to already be true due to this being 106 | // immediately dropped in the check_in method when we see that 107 | // the checked-in epoch has already been marked as sealed.
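//
// An illustrative sketch (not part of this file) of the handshake that the
// Arc<Mutex<bool>> + Arc<Condvar> pair in Completion implements: one side
// clones the Completion and eventually calls mark_complete(), while the
// other side blocks in wait_for_complete() until the flag flips to true.
// `some_epoch` below is a placeholder for whatever FlushEpoch is involved.
//
//     let completion = Completion::new(some_epoch);
//     let for_flusher = completion.clone();
//     std::thread::spawn(move || {
//         // ... perform the flush work associated with `some_epoch` ...
//         for_flusher.mark_complete();
//     });
//     assert_eq!(completion.wait_for_complete(), some_epoch);
//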
108 | *mu = true; 109 | drop(mu); 110 | self.cv.notify_all(); 111 | } 112 | 113 | #[cfg(test)] 114 | pub fn is_complete(&self) -> bool { 115 | *self.mu.lock().unwrap() 116 | } 117 | } 118 | 119 | pub struct FlushEpochGuard<'a> { 120 | tracker: &'a EpochTracker, 121 | previously_sealed: bool, 122 | } 123 | 124 | impl Drop for FlushEpochGuard<'_> { 125 | fn drop(&mut self) { 126 | let rc = self.tracker.rc.fetch_sub(1, Ordering::SeqCst) - 1; 127 | if rc & SEAL_MASK == 0 && (rc & SEAL_BIT) == SEAL_BIT { 128 | crate::debug_delay(); 129 | self.tracker 130 | .vacancy_notifier 131 | .mark_complete_inner(self.previously_sealed); 132 | } 133 | } 134 | } 135 | 136 | impl FlushEpochGuard<'_> { 137 | pub fn epoch(&self) -> FlushEpoch { 138 | self.tracker.epoch 139 | } 140 | } 141 | 142 | #[derive(Debug)] 143 | pub(crate) struct EpochTracker { 144 | epoch: FlushEpoch, 145 | rc: AtomicU64, 146 | vacancy_notifier: Completion, 147 | previous_flush_complete: Completion, 148 | } 149 | 150 | #[derive(Clone, Debug)] 151 | pub(crate) struct FlushEpochTracker { 152 | active_ebr: ebr::Ebr, 16, 16>, 153 | inner: Arc, 154 | } 155 | 156 | #[derive(Debug)] 157 | pub(crate) struct FlushEpochInner { 158 | counter: AtomicU64, 159 | roll_mu: Mutex<()>, 160 | current_active: AtomicPtr, 161 | } 162 | 163 | impl Drop for FlushEpochInner { 164 | fn drop(&mut self) { 165 | let vacancy_mu = self.roll_mu.lock().unwrap(); 166 | let old_ptr = 167 | self.current_active.swap(std::ptr::null_mut(), Ordering::SeqCst); 168 | if !old_ptr.is_null() { 169 | //let old: &EpochTracker = &*old_ptr; 170 | unsafe { drop(Box::from_raw(old_ptr)) } 171 | } 172 | drop(vacancy_mu); 173 | } 174 | } 175 | 176 | impl Default for FlushEpochTracker { 177 | fn default() -> FlushEpochTracker { 178 | let last = Completion::new(FlushEpoch(NonZeroU64::new(1).unwrap())); 179 | let current_active_ptr = Box::into_raw(Box::new(EpochTracker { 180 | epoch: FlushEpoch(NonZeroU64::new(MIN_EPOCH).unwrap()), 181 | rc: AtomicU64::new(0), 182 | vacancy_notifier: Completion::new(FlushEpoch( 183 | NonZeroU64::new(MIN_EPOCH).unwrap(), 184 | )), 185 | previous_flush_complete: last.clone(), 186 | })); 187 | 188 | last.mark_complete(); 189 | 190 | let current_active = AtomicPtr::new(current_active_ptr); 191 | 192 | FlushEpochTracker { 193 | inner: Arc::new(FlushEpochInner { 194 | counter: AtomicU64::new(2), 195 | roll_mu: Mutex::new(()), 196 | current_active, 197 | }), 198 | active_ebr: ebr::Ebr::default(), 199 | } 200 | } 201 | } 202 | 203 | impl FlushEpochTracker { 204 | /// Returns the epoch notifier for the previous epoch. 205 | /// Intended to be passed to a flusher that can eventually 206 | /// notify the flush-requesting thread. 
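//
// A sketch of how the three returned Completions are consumed, modeled on
// the `concurrent_flush_epoch_burn_in_inner` test later in this file (the
// variable names are illustrative):
//
//     let (previous, this, next) = flush_epoch_tracker.roll_epoch_forward();
//     // 1. wait until the epoch before the one being flushed is fully durable
//     let _prior_epoch = previous.wait_for_complete();
//     // 2. wait until every guard checked in to the flush-through epoch has
//     //    been dropped, so its dirty objects can be serialized safely
//     let flush_through_epoch = this.wait_for_complete();
//     // ... perform the flush for `flush_through_epoch` ...
//     // 3. signal completion so the flusher of the next epoch can proceed
//     next.mark_complete();
//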
207 | pub fn roll_epoch_forward(&self) -> (Completion, Completion, Completion) { 208 | let mut tracker_guard = self.active_ebr.pin(); 209 | 210 | let vacancy_mu = self.inner.roll_mu.lock().unwrap(); 211 | 212 | let flush_through = self.inner.counter.fetch_add(1, Ordering::SeqCst); 213 | 214 | let flush_through_epoch = 215 | FlushEpoch(NonZeroU64::new(flush_through).unwrap()); 216 | 217 | let new_epoch = flush_through_epoch.increment(); 218 | 219 | let forward_flush_notifier = Completion::new(flush_through_epoch); 220 | 221 | let new_active = Box::into_raw(Box::new(EpochTracker { 222 | epoch: new_epoch, 223 | rc: AtomicU64::new(0), 224 | vacancy_notifier: Completion::new(new_epoch), 225 | previous_flush_complete: forward_flush_notifier.clone(), 226 | })); 227 | 228 | let old_ptr = 229 | self.inner.current_active.swap(new_active, Ordering::SeqCst); 230 | 231 | assert!(!old_ptr.is_null()); 232 | 233 | let (last_flush_complete_notifier, vacancy_notifier) = unsafe { 234 | let old: &EpochTracker = &*old_ptr; 235 | let last = old.rc.fetch_add(SEAL_BIT + 1, Ordering::SeqCst); 236 | 237 | assert_eq!( 238 | last & SEAL_BIT, 239 | 0, 240 | "epoch {} double-sealed", 241 | flush_through 242 | ); 243 | 244 | // mark_complete_inner called via drop in a uniform way 245 | //println!("dropping flush epoch guard for epoch {flush_through}"); 246 | drop(FlushEpochGuard { tracker: old, previously_sealed: true }); 247 | 248 | (old.previous_flush_complete.clone(), old.vacancy_notifier.clone()) 249 | }; 250 | tracker_guard.defer_drop(unsafe { Box::from_raw(old_ptr) }); 251 | drop(vacancy_mu); 252 | (last_flush_complete_notifier, vacancy_notifier, forward_flush_notifier) 253 | } 254 | 255 | pub fn check_in<'a>(&self) -> FlushEpochGuard<'a> { 256 | let _tracker_guard = self.active_ebr.pin(); 257 | loop { 258 | let tracker: &'a EpochTracker = 259 | unsafe { &*self.inner.current_active.load(Ordering::SeqCst) }; 260 | 261 | let rc = tracker.rc.fetch_add(1, Ordering::SeqCst); 262 | 263 | let previously_sealed = rc & SEAL_BIT == SEAL_BIT; 264 | 265 | let guard = FlushEpochGuard { tracker, previously_sealed }; 266 | 267 | if previously_sealed { 268 | // the epoch is already closed, so we must drop the rc 269 | // and possibly notify, which is handled in the guard's 270 | // Drop impl. 
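//
// A small illustration (not part of this file) of how the live-guard count
// and the seal flag share the single `rc` AtomicU64, using the SEAL_BIT and
// SEAL_MASK constants defined at the top of this file:
//
//     let rc: u64 = SEAL_BIT | 2;                // sealed, with 2 guards live
//     assert_eq!(rc & SEAL_MASK, 2);             // low 63 bits: live guard count
//     assert_eq!(rc & SEAL_BIT, SEAL_BIT);       // bit 63: the epoch is sealed
//     let after_one_drop = rc - 1;               // one guard drops
//     assert_eq!(after_one_drop & SEAL_MASK, 1); // still not vacant
//
// Only the guard whose drop brings the masked count to zero in an
// already-sealed epoch notifies the vacancy Completion, as implemented in
// FlushEpochGuard's Drop impl above.
//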
271 | drop(guard); 272 | } else { 273 | return guard; 274 | } 275 | } 276 | } 277 | 278 | pub fn manually_advance_epoch(&self) { 279 | self.active_ebr.manually_advance_epoch(); 280 | } 281 | 282 | pub fn current_flush_epoch(&self) -> FlushEpoch { 283 | let current = self.inner.counter.load(Ordering::SeqCst); 284 | 285 | FlushEpoch(NonZeroU64::new(current).unwrap()) 286 | } 287 | } 288 | 289 | #[test] 290 | fn flush_epoch_basic_functionality() { 291 | let epoch_tracker = FlushEpochTracker::default(); 292 | 293 | for expected in MIN_EPOCH..1_000_000 { 294 | let g1 = epoch_tracker.check_in(); 295 | let g2 = epoch_tracker.check_in(); 296 | 297 | assert_eq!(g1.tracker.epoch.0.get(), expected); 298 | assert_eq!(g2.tracker.epoch.0.get(), expected); 299 | 300 | let previous_notifier = epoch_tracker.roll_epoch_forward().1; 301 | assert!(!previous_notifier.is_complete()); 302 | 303 | drop(g1); 304 | assert!(!previous_notifier.is_complete()); 305 | drop(g2); 306 | assert_eq!(previous_notifier.wait_for_complete().0.get(), expected); 307 | } 308 | } 309 | 310 | #[cfg(test)] 311 | fn concurrent_flush_epoch_burn_in_inner() { 312 | const N_THREADS: usize = 10; 313 | const N_OPS_PER_THREAD: usize = 3000; 314 | 315 | let fa = FlushEpochTracker::default(); 316 | 317 | let barrier = std::sync::Arc::new(std::sync::Barrier::new(21)); 318 | 319 | let pt = pagetable::PageTable::::default(); 320 | 321 | let rolls = || { 322 | let fa = fa.clone(); 323 | let barrier = barrier.clone(); 324 | let pt = &pt; 325 | move || { 326 | barrier.wait(); 327 | for _ in 0..N_OPS_PER_THREAD { 328 | let (previous, this, next) = fa.roll_epoch_forward(); 329 | let last_epoch = previous.wait_for_complete().0.get(); 330 | assert_eq!(0, pt.get(last_epoch).load(Ordering::Acquire)); 331 | let flush_through_epoch = this.wait_for_complete().0.get(); 332 | assert_eq!( 333 | 0, 334 | pt.get(flush_through_epoch).load(Ordering::Acquire) 335 | ); 336 | 337 | next.mark_complete(); 338 | } 339 | } 340 | }; 341 | 342 | let check_ins = || { 343 | let fa = fa.clone(); 344 | let barrier = barrier.clone(); 345 | let pt = &pt; 346 | move || { 347 | barrier.wait(); 348 | for _ in 0..N_OPS_PER_THREAD { 349 | let guard = fa.check_in(); 350 | let epoch = guard.epoch().0.get(); 351 | pt.get(epoch).fetch_add(1, Ordering::SeqCst); 352 | std::thread::yield_now(); 353 | pt.get(epoch).fetch_sub(1, Ordering::SeqCst); 354 | drop(guard); 355 | } 356 | } 357 | }; 358 | 359 | std::thread::scope(|s| { 360 | let mut threads = vec![]; 361 | 362 | for _ in 0..N_THREADS { 363 | threads.push(s.spawn(rolls())); 364 | threads.push(s.spawn(check_ins())); 365 | } 366 | 367 | barrier.wait(); 368 | 369 | for thread in threads.into_iter() { 370 | thread.join().expect("a test thread crashed unexpectedly"); 371 | } 372 | }); 373 | 374 | for i in 0..N_OPS_PER_THREAD * N_THREADS { 375 | assert_eq!(0, pt.get(i as u64).load(Ordering::Acquire)); 376 | } 377 | } 378 | 379 | #[test] 380 | fn concurrent_flush_epoch_burn_in() { 381 | for _ in 0..128 { 382 | concurrent_flush_epoch_burn_in_inner(); 383 | } 384 | } 385 | -------------------------------------------------------------------------------- /src/id_allocator.rs: -------------------------------------------------------------------------------- 1 | use std::collections::BTreeSet; 2 | use std::sync::atomic::{AtomicU64, Ordering}; 3 | use std::sync::Arc; 4 | 5 | use crossbeam_queue::SegQueue; 6 | use fnv::FnvHashSet; 7 | use parking_lot::Mutex; 8 | 9 | #[derive(Default, Debug)] 10 | struct FreeSetAndTip { 11 | free_set: BTreeSet, 12 | 
next_to_allocate: u64, 13 | } 14 | 15 | #[derive(Default, Debug)] 16 | pub struct Allocator { 17 | free_and_pending: Mutex<FreeSetAndTip>, 18 | /// Flat combining. 19 | /// 20 | /// A lock-free queue of recently freed ids that is used when there is contention on `free_and_pending`. 21 | free_queue: SegQueue<u64>, 22 | allocation_counter: AtomicU64, 23 | free_counter: AtomicU64, 24 | } 25 | 26 | impl Allocator { 27 | /// Intended primarily for heap slab slot allocators when performing GC. 28 | /// 29 | /// If the slab is fragmented beyond the desired fill ratio, this returns 30 | /// the range of offsets (min inclusive, max exclusive) that may be copied 31 | /// into earlier free slots if they are currently occupied in order to 32 | /// achieve the desired fill ratio. 33 | pub fn fragmentation_cutoff( 34 | &self, 35 | desired_ratio: f32, 36 | ) -> Option<(u64, u64)> { 37 | let mut free_and_tip = self.free_and_pending.lock(); 38 | 39 | let next_to_allocate = free_and_tip.next_to_allocate; 40 | 41 | if next_to_allocate == 0 { 42 | return None; 43 | } 44 | 45 | while let Some(free_id) = self.free_queue.pop() { 46 | free_and_tip.free_set.insert(free_id); 47 | } 48 | 49 | let live_objects = 50 | next_to_allocate - free_and_tip.free_set.len() as u64; 51 | let actual_ratio = live_objects as f32 / next_to_allocate as f32; 52 | 53 | log::trace!( 54 | "fragmented_slots actual ratio: {actual_ratio}, free len: {}", 55 | free_and_tip.free_set.len() 56 | ); 57 | 58 | if desired_ratio <= actual_ratio { 59 | return None; 60 | } 61 | 62 | // calculate theoretical cut-off point, return everything past that 63 | let min = (live_objects as f32 / desired_ratio) as u64; 64 | let max = next_to_allocate; 65 | assert!(min < max); 66 | Some((min, max)) 67 | } 68 | 69 | pub fn from_allocated(allocated: &FnvHashSet<u64>) -> Allocator { 70 | let mut heap = BTreeSet::<u64>::default(); 71 | let max = allocated.iter().copied().max(); 72 | 73 | for i in 0..max.unwrap_or(0) { 74 | if !allocated.contains(&i) { 75 | heap.insert(i); 76 | } 77 | } 78 | 79 | let free_and_pending = Mutex::new(FreeSetAndTip { 80 | free_set: heap, 81 | next_to_allocate: max.map(|m| m + 1).unwrap_or(0), 82 | }); 83 | 84 | Allocator { 85 | free_and_pending, 86 | free_queue: SegQueue::default(), 87 | allocation_counter: 0.into(), 88 | free_counter: 0.into(), 89 | } 90 | } 91 | 92 | pub fn max_allocated(&self) -> Option<u64> { 93 | let next = self.free_and_pending.lock().next_to_allocate; 94 | 95 | if next == 0 { 96 | None 97 | } else { 98 | Some(next - 1) 99 | } 100 | } 101 | 102 | pub fn allocate(&self) -> u64 { 103 | self.allocation_counter.fetch_add(1, Ordering::Relaxed); 104 | let mut free_and_tip = self.free_and_pending.lock(); 105 | while let Some(free_id) = self.free_queue.pop() { 106 | free_and_tip.free_set.insert(free_id); 107 | } 108 | 109 | compact(&mut free_and_tip); 110 | 111 | let pop_attempt = free_and_tip.free_set.pop_first(); 112 | 113 | if let Some(id) = pop_attempt { 114 | id 115 | } else { 116 | let ret = free_and_tip.next_to_allocate; 117 | free_and_tip.next_to_allocate += 1; 118 | ret 119 | } 120 | } 121 | 122 | pub fn free(&self, id: u64) { 123 | if cfg!(not(feature = "monotonic-behavior")) { 124 | self.free_counter.fetch_add(1, Ordering::Relaxed); 125 | if let Some(mut free) = self.free_and_pending.try_lock() { 126 | while let Some(free_id) = self.free_queue.pop() { 127 | free.free_set.insert(free_id); 128 | } 129 | free.free_set.insert(id); 130 | 131 | compact(&mut free); 132 | } else { 133 | self.free_queue.push(id); 134 | } 135 | } 136 | } 137 | 138 |
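//
// A minimal usage sketch (not from this file) of the allocation behavior,
// assuming the "monotonic-behavior" feature is disabled so that freed ids
// are actually recycled: the lowest free id is reused first, and `compact`
// lowers `next_to_allocate` when the tail of the id space is freed.
//
//     let a = Allocator::default();
//     assert_eq!(a.allocate(), 0);
//     assert_eq!(a.allocate(), 1);
//     assert_eq!(a.allocate(), 2);
//     a.free(1);
//     assert_eq!(a.allocate(), 1); // lowest freed id is handed out again
//     a.free(2);                   // freeing the tail id...
//     assert_eq!(a.allocate(), 2); // ...compacts the tip, which re-issues it
//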
/// Returns the counters for allocated, free 139 | pub fn counters(&self) -> (u64, u64) { 140 | ( 141 | self.allocation_counter.load(Ordering::Acquire), 142 | self.free_counter.load(Ordering::Acquire), 143 | ) 144 | } 145 | } 146 | 147 | fn compact(free: &mut FreeSetAndTip) { 148 | let next = &mut free.next_to_allocate; 149 | 150 | while *next > 1 && free.free_set.contains(&(*next - 1)) { 151 | free.free_set.remove(&(*next - 1)); 152 | *next -= 1; 153 | } 154 | } 155 | 156 | pub struct DeferredFree { 157 | pub allocator: Arc, 158 | pub freed_slot: u64, 159 | } 160 | 161 | impl Drop for DeferredFree { 162 | fn drop(&mut self) { 163 | self.allocator.free(self.freed_slot) 164 | } 165 | } 166 | -------------------------------------------------------------------------------- /src/leaf.rs: -------------------------------------------------------------------------------- 1 | use crate::*; 2 | 3 | #[derive(Debug, Clone, serde::Serialize, serde::Deserialize)] 4 | pub(crate) struct Leaf { 5 | pub lo: InlineArray, 6 | pub hi: Option, 7 | pub prefix_length: usize, 8 | data: stack_map::StackMap, 9 | pub in_memory_size: usize, 10 | pub mutation_count: u64, 11 | #[serde(skip)] 12 | pub dirty_flush_epoch: Option, 13 | #[serde(skip)] 14 | pub page_out_on_flush: Option, 15 | #[serde(skip)] 16 | pub deleted: Option, 17 | #[serde(skip)] 18 | pub max_unflushed_epoch: Option, 19 | } 20 | 21 | impl Leaf { 22 | pub(crate) fn empty() -> Leaf { 23 | Leaf { 24 | lo: InlineArray::default(), 25 | hi: None, 26 | prefix_length: 0, 27 | data: stack_map::StackMap::default(), 28 | // this does not need to be marked as dirty until it actually 29 | // receives inserted data 30 | dirty_flush_epoch: None, 31 | in_memory_size: std::mem::size_of::>(), 32 | mutation_count: 0, 33 | page_out_on_flush: None, 34 | deleted: None, 35 | max_unflushed_epoch: None, 36 | } 37 | } 38 | 39 | pub(crate) const fn is_empty(&self) -> bool { 40 | self.data.is_empty() 41 | } 42 | 43 | pub(crate) fn set_dirty_epoch(&mut self, epoch: FlushEpoch) { 44 | assert!(self.deleted.is_none()); 45 | if let Some(current_epoch) = self.dirty_flush_epoch { 46 | assert!(current_epoch <= epoch); 47 | } 48 | if self.page_out_on_flush < Some(epoch) { 49 | self.page_out_on_flush = None; 50 | } 51 | self.dirty_flush_epoch = Some(epoch); 52 | } 53 | 54 | fn prefix(&self) -> &[u8] { 55 | assert!(self.deleted.is_none()); 56 | &self.lo[..self.prefix_length] 57 | } 58 | 59 | pub(crate) fn get(&self, key: &[u8]) -> Option<&InlineArray> { 60 | assert!(self.deleted.is_none()); 61 | assert!(key.starts_with(self.prefix())); 62 | let prefixed_key = &key[self.prefix_length..]; 63 | self.data.get(prefixed_key) 64 | } 65 | 66 | pub(crate) fn insert( 67 | &mut self, 68 | key: InlineArray, 69 | value: InlineArray, 70 | ) -> Option { 71 | assert!(self.deleted.is_none()); 72 | assert!(key.starts_with(self.prefix())); 73 | let prefixed_key = key[self.prefix_length..].into(); 74 | self.data.insert(prefixed_key, value) 75 | } 76 | 77 | pub(crate) fn remove(&mut self, key: &[u8]) -> Option { 78 | assert!(self.deleted.is_none()); 79 | let prefix = self.prefix(); 80 | assert!(key.starts_with(prefix)); 81 | let partial_key = &key[self.prefix_length..]; 82 | self.data.remove(partial_key) 83 | } 84 | 85 | pub(crate) fn merge_from(&mut self, other: &mut Self) { 86 | assert!(self.is_empty()); 87 | 88 | self.hi = other.hi.clone(); 89 | 90 | let new_prefix_len = if let Some(hi) = &self.hi { 91 | self.lo.iter().zip(hi.iter()).take_while(|(l, r)| l == r).count() 92 | } else { 93 | 0 94 | }; 95 | 96 | 
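//
// A small worked example (not from this file) of the re-prefixing performed
// by the rest of this function: suppose self.lo = "aa", other.lo = "ab",
// and other.hi = Some("abz"). After adopting other.hi, the new shared
// prefix of self is just "a" (length 1), while `other` had stripped the
// 2-byte prefix "ab" from its stored keys. Each of other's keys therefore
// has the missing byte "b" prepended on its way into self, so other's
// stored key "q" (full key "abq") becomes "bq" here, which still decodes
// to "abq" once self's 1-byte prefix "a" is re-applied.
//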
assert_eq!(self.lo[..new_prefix_len], other.lo[..new_prefix_len]); 97 | 98 | // self.prefix_length is not read because it's expected to be 99 | // initialized here. 100 | self.prefix_length = new_prefix_len; 101 | 102 | if self.prefix() == other.prefix() { 103 | self.data = std::mem::take(&mut other.data); 104 | return; 105 | } 106 | 107 | assert!( 108 | self.prefix_length < other.prefix_length, 109 | "self: {:?} other: {:?}", 110 | self, 111 | other 112 | ); 113 | 114 | let unshifted_key_amount = other.prefix_length - self.prefix_length; 115 | let unshifted_prefix = &other.lo 116 | [other.prefix_length - unshifted_key_amount..other.prefix_length]; 117 | 118 | for (k, v) in other.data.iter() { 119 | let mut unshifted_key = 120 | Vec::with_capacity(unshifted_prefix.len() + k.len()); 121 | unshifted_key.extend_from_slice(unshifted_prefix); 122 | unshifted_key.extend_from_slice(k); 123 | self.data.insert(unshifted_key.into(), v.clone()); 124 | } 125 | 126 | assert_eq!(other.data.len(), self.data.len()); 127 | 128 | #[cfg(feature = "for-internal-testing-only")] 129 | assert_eq!( 130 | self.iter().collect::>(), 131 | other.iter().collect::>(), 132 | "self: {:#?} \n other: {:#?}\n", 133 | self, 134 | other 135 | ); 136 | } 137 | 138 | pub(crate) fn iter( 139 | &self, 140 | ) -> impl Iterator { 141 | let prefix = self.prefix(); 142 | self.data.iter().map(|(k, v)| { 143 | let mut unshifted_key = Vec::with_capacity(prefix.len() + k.len()); 144 | unshifted_key.extend_from_slice(prefix); 145 | unshifted_key.extend_from_slice(k); 146 | (unshifted_key.into(), v.clone()) 147 | }) 148 | } 149 | 150 | pub(crate) fn serialize(&self, zstd_compression_level: i32) -> Vec { 151 | let mut ret = vec![]; 152 | 153 | let mut zstd_enc = 154 | zstd::stream::Encoder::new(&mut ret, zstd_compression_level) 155 | .unwrap(); 156 | 157 | bincode::serialize_into(&mut zstd_enc, self).unwrap(); 158 | 159 | zstd_enc.finish().unwrap(); 160 | 161 | ret 162 | } 163 | 164 | pub(crate) fn deserialize( 165 | buf: &[u8], 166 | ) -> std::io::Result>> { 167 | let zstd_decoded = zstd::stream::decode_all(buf).unwrap(); 168 | let mut leaf: Box> = 169 | bincode::deserialize(&zstd_decoded).unwrap(); 170 | 171 | // use decompressed buffer length as a cheap proxy for in-memory size for now 172 | leaf.in_memory_size = zstd_decoded.len(); 173 | 174 | Ok(leaf) 175 | } 176 | 177 | fn set_in_memory_size(&mut self) { 178 | self.in_memory_size = std::mem::size_of::>() 179 | + self.hi.as_ref().map(|h| h.len()).unwrap_or(0) 180 | + self.lo.len() 181 | + self.data.iter().map(|(k, v)| k.len() + v.len()).sum::(); 182 | } 183 | 184 | pub(crate) fn split_if_full( 185 | &mut self, 186 | new_epoch: FlushEpoch, 187 | allocator: &ObjectCache, 188 | collection_id: CollectionId, 189 | ) -> Option<(InlineArray, Object)> { 190 | if self.data.is_full() { 191 | let original_len = self.data.len(); 192 | 193 | let old_prefix_len = self.prefix_length; 194 | // split 195 | let split_offset = if self.lo.is_empty() { 196 | // split left-most shard almost at the beginning for 197 | // optimizing downward-growing workloads 198 | 1 199 | } else if self.hi.is_none() { 200 | // split right-most shard almost at the end for 201 | // optimizing upward-growing workloads 202 | self.data.len() - 2 203 | } else { 204 | self.data.len() / 2 205 | }; 206 | 207 | let data = self.data.split_off(split_offset); 208 | 209 | let left_max = &self.data.last().unwrap().0; 210 | let right_min = &data.first().unwrap().0; 211 | 212 | // suffix truncation attempts to shrink the split key 213 | // 
so that shorter keys bubble up into the index 214 | let splitpoint_length = right_min 215 | .iter() 216 | .zip(left_max.iter()) 217 | .take_while(|(a, b)| a == b) 218 | .count() 219 | + 1; 220 | 221 | let mut split_vec = 222 | Vec::with_capacity(self.prefix_length + splitpoint_length); 223 | split_vec.extend_from_slice(self.prefix()); 224 | split_vec.extend_from_slice(&right_min[..splitpoint_length]); 225 | let split_key = InlineArray::from(split_vec); 226 | 227 | let rhs_id = allocator.allocate_object_id(new_epoch); 228 | 229 | log::trace!( 230 | "split leaf {:?} at split key: {:?} into new {:?} at {:?}", 231 | self.lo, 232 | split_key, 233 | rhs_id, 234 | new_epoch, 235 | ); 236 | 237 | let mut rhs = Leaf { 238 | dirty_flush_epoch: Some(new_epoch), 239 | hi: self.hi.clone(), 240 | lo: split_key.clone(), 241 | prefix_length: 0, 242 | in_memory_size: 0, 243 | data, 244 | mutation_count: 0, 245 | page_out_on_flush: None, 246 | deleted: None, 247 | max_unflushed_epoch: None, 248 | }; 249 | 250 | rhs.shorten_keys_after_split(old_prefix_len); 251 | 252 | rhs.set_in_memory_size(); 253 | 254 | self.hi = Some(split_key.clone()); 255 | 256 | self.shorten_keys_after_split(old_prefix_len); 257 | 258 | self.set_in_memory_size(); 259 | 260 | assert_eq!(self.hi.as_ref().unwrap(), &split_key); 261 | assert_eq!(rhs.lo, &split_key); 262 | assert_eq!(rhs.data.len() + self.data.len(), original_len); 263 | 264 | let rhs_node = Object { 265 | object_id: rhs_id, 266 | collection_id, 267 | low_key: split_key.clone(), 268 | inner: Arc::new(RwLock::new(CacheBox { 269 | leaf: Some(Box::new(rhs)), 270 | logged_index: BTreeMap::default(), 271 | })), 272 | }; 273 | 274 | return Some((split_key, rhs_node)); 275 | } 276 | 277 | None 278 | } 279 | 280 | pub(crate) fn shorten_keys_after_split(&mut self, old_prefix_len: usize) { 281 | let Some(hi) = self.hi.as_ref() else { return }; 282 | 283 | let new_prefix_len = 284 | self.lo.iter().zip(hi.iter()).take_while(|(l, r)| l == r).count(); 285 | 286 | assert_eq!(self.lo[..new_prefix_len], hi[..new_prefix_len]); 287 | 288 | // self.prefix_length is not read because it's expected to be 289 | // initialized here. 
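//
// A small worked example (not from this file) of the prefix encoding being
// refreshed here: keys are stored with the bytes shared by `lo` and `hi`
// stripped off. If a split leaves this leaf with lo = "user:aa" and
// hi = "user:az", the shared prefix "user:a" (6 bytes) is elided, so the
// full key "user:abc" is stored as just "bc"; `get` strips the prefix from
// the probe key before lookup, and `iter` rebuilds full keys by
// re-prepending `&self.lo[..self.prefix_length]`.
//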
290 | self.prefix_length = new_prefix_len; 291 | 292 | if new_prefix_len == old_prefix_len { 293 | return; 294 | } 295 | 296 | assert!( 297 | new_prefix_len > old_prefix_len, 298 | "expected new prefix length of {} to be greater than the pre-split prefix length of {} for node {:?}", 299 | new_prefix_len, 300 | old_prefix_len, 301 | self 302 | ); 303 | 304 | let key_shift = new_prefix_len - old_prefix_len; 305 | 306 | for (k, v) in std::mem::take(&mut self.data).iter() { 307 | self.data.insert(k[key_shift..].into(), v.clone()); 308 | } 309 | } 310 | } 311 | -------------------------------------------------------------------------------- /src/lib.rs: -------------------------------------------------------------------------------- 1 | // 1.0 blockers 2 | // 3 | // bugs 4 | // * page-out needs to be deferred until after any flush of the dirty epoch 5 | // * need to remove max_unflushed_epoch after flushing it 6 | // * can't send reliable page-out request backwards from 7->6 7 | // * re-locking every mutex in a writebatch feels bad 8 | // * need to signal stability status forward 9 | // * maybe we already are 10 | // * can make dirty_flush_epoch atomic and CAS it to 0 after flush 11 | // * can change dirty_flush_epoch to unflushed_epoch 12 | // * can always set mutation_count to max dirty flush epoch 13 | // * this feels nice, we can lazily update a global stable flushed counter 14 | // * can get rid of dirty_flush_epoch and page_out_on_flush? 15 | // * or at least dirty_flush_epoch 16 | // * dirty_flush_epoch really means "hasn't yet been cooperatively serialized @ F.E." 17 | // * interesting metrics: 18 | // * whether dirty for some epoch 19 | // * whether cooperatively serialized for some epoch 20 | // * whether fully flushed for some epoch 21 | // * clean -> dirty -> {maybe coop} -> flushed 22 | // * for page-out, we only care if it's stable or if we need to add it to 23 | // a page-out priority queue 24 | // * page-out doesn't seem to happen as expected 25 | // 26 | // reliability 27 | // TODO make all writes wrapped in a Tearable wrapper that splits writes 28 | // and can possibly crash based on a counter. 29 | // TODO test concurrent drop_tree when other threads are still using it 30 | // TODO list trees test for recovering empty collections 31 | // TODO set explicit max key and value sizes w/ corresponding heap 32 | // TODO add failpoints to writepath 33 | // 34 | // performance 35 | // TODO handle prefix encoding 36 | // TODO (minor) remove cache access for removed node in merge function 37 | // TODO index+log hybrid - tinylsm key -> object location 38 | // 39 | // features 40 | // TODO multi-collection batch 41 | // 42 | // misc 43 | // TODO skim inlining output of RUSTFLAGS="-Cremark=all -Cdebuginfo=1" 44 | // 45 | // ~~~~~~~~~~~~~~~~~~~~~~~~~~~ 1.0 cutoff ~~~~~~~~~~~~~~~~~~~~~~~~~~~ 46 | // 47 | // post-1.0 improvements 48 | // 49 | // reliability 50 | // TODO bug hiding: if the crash_iter test panics, the test doesn't fail as expected 51 | // TODO event log assertion for testing heap location bidirectional referential integrity, 52 | // particularly in the object location mapper. 53 | // TODO ensure nothing "from the future" gets copied into earlier epochs during GC 54 | // TODO collection_id on page_in checks - it needs to be pinned w/ heap's EBR? 
55 | // TODO put aborts behind feature flags for hard crashes 56 | // TODO re-enable transaction tests in test_tree.rs 57 | // 58 | // performance 59 | // TODO force writers to flush when some number of dirty epochs have built up 60 | // TODO serialize flush batch in parallel 61 | // TODO concurrent serialization of NotYetSerialized dirty objects 62 | // TODO make the Arc`, 80 | //! but with several additional capabilities for 81 | //! assisting creators of stateful systems. 82 | //! 83 | //! It is fully thread-safe, and all operations are 84 | //! atomic. Multiple `Tree`s with isolated keyspaces 85 | //! are supported with the 86 | //! [`Db::open_tree`](struct.Db.html#method.open_tree) method. 87 | //! 88 | //! `sled` is built by experienced database engineers 89 | //! who think users should spend less time tuning and 90 | //! working against high-friction APIs. Expect 91 | //! significant ergonomic and performance improvements 92 | //! over time. Most surprises are bugs, so please 93 | //! [let us know](mailto:tylerneely@gmail.com?subject=sled%20sucks!!!) 94 | //! if something is high friction. 95 | //! 96 | //! # Examples 97 | //! 98 | //! ``` 99 | //! # let _ = std::fs::remove_dir_all("my_db"); 100 | //! let db: sled::Db = sled::open("my_db").unwrap(); 101 | //! 102 | //! // insert and get 103 | //! db.insert(b"yo!", b"v1"); 104 | //! assert_eq!(&db.get(b"yo!").unwrap().unwrap(), b"v1"); 105 | //! 106 | //! // Atomic compare-and-swap. 107 | //! db.compare_and_swap( 108 | //! b"yo!", // key 109 | //! Some(b"v1"), // old value, None for not present 110 | //! Some(b"v2"), // new value, None for delete 111 | //! ) 112 | //! .unwrap(); 113 | //! 114 | //! // Iterates over key-value pairs, starting at the given key. 115 | //! let scan_key: &[u8] = b"a non-present key before yo!"; 116 | //! let mut iter = db.range(scan_key..); 117 | //! assert_eq!(&iter.next().unwrap().unwrap().0, b"yo!"); 118 | //! assert!(iter.next().is_none()); 119 | //! 120 | //! db.remove(b"yo!"); 121 | //! assert!(db.get(b"yo!").unwrap().is_none()); 122 | //! 123 | //! let other_tree: sled::Tree = db.open_tree(b"cool db facts").unwrap(); 124 | //! other_tree.insert( 125 | //! b"k1", 126 | //! &b"a Db acts like a Tree due to implementing Deref"[..] 127 | //! ).unwrap(); 128 | //! # let _ = std::fs::remove_dir_all("my_db"); 129 | //! 
``` 130 | #[cfg(feature = "for-internal-testing-only")] 131 | mod block_checker; 132 | mod config; 133 | mod db; 134 | mod flush_epoch; 135 | mod heap; 136 | mod id_allocator; 137 | mod leaf; 138 | mod metadata_store; 139 | mod object_cache; 140 | mod object_location_mapper; 141 | mod tree; 142 | 143 | #[cfg(any( 144 | feature = "testing-shred-allocator", 145 | feature = "testing-count-allocator" 146 | ))] 147 | pub mod alloc; 148 | 149 | #[cfg(feature = "for-internal-testing-only")] 150 | mod event_verifier; 151 | 152 | #[inline] 153 | fn debug_delay() { 154 | #[cfg(debug_assertions)] 155 | { 156 | let rand = 157 | std::time::SystemTime::UNIX_EPOCH.elapsed().unwrap().as_nanos(); 158 | 159 | if rand % 128 > 100 { 160 | for _ in 0..rand % 16 { 161 | std::thread::yield_now(); 162 | } 163 | } 164 | } 165 | } 166 | 167 | pub use crate::config::Config; 168 | pub use crate::db::Db; 169 | pub use crate::tree::{Batch, Iter, Tree}; 170 | pub use inline_array::InlineArray; 171 | 172 | const NAME_MAPPING_COLLECTION_ID: CollectionId = CollectionId(0); 173 | const DEFAULT_COLLECTION_ID: CollectionId = CollectionId(1); 174 | const INDEX_FANOUT: usize = 64; 175 | const EBR_LOCAL_GC_BUFFER_SIZE: usize = 128; 176 | 177 | use std::collections::BTreeMap; 178 | use std::num::NonZeroU64; 179 | use std::ops::Bound; 180 | use std::sync::Arc; 181 | 182 | use parking_lot::RwLock; 183 | 184 | use crate::flush_epoch::{ 185 | FlushEpoch, FlushEpochGuard, FlushEpochTracker, FlushInvariants, 186 | }; 187 | use crate::heap::{ 188 | HeapStats, ObjectRecovery, SlabAddress, Update, WriteBatchStats, 189 | }; 190 | use crate::id_allocator::{Allocator, DeferredFree}; 191 | use crate::leaf::Leaf; 192 | 193 | // These are public so that they can be easily crash tested in external 194 | // binaries. They are hidden because there are zero guarantees around their 195 | // API stability or functionality. 196 | #[doc(hidden)] 197 | pub use crate::heap::{Heap, HeapRecovery}; 198 | #[doc(hidden)] 199 | pub use crate::metadata_store::MetadataStore; 200 | #[doc(hidden)] 201 | pub use crate::object_cache::{CacheStats, Dirty, FlushStats, ObjectCache}; 202 | 203 | /// Opens a `Db` with a default configuration at the 204 | /// specified path. This will create a new storage 205 | /// directory at the specified path if it does 206 | /// not already exist. You can use the `Db::was_recovered` 207 | /// method to determine if your database was recovered 208 | /// from a previous instance. 209 | pub fn open>(path: P) -> std::io::Result { 210 | Config::new().path(path).open() 211 | } 212 | 213 | #[derive(Debug, Copy, Clone)] 214 | pub struct Stats { 215 | pub cache: CacheStats, 216 | } 217 | 218 | /// Compare and swap result. 219 | /// 220 | /// It returns `Ok(Ok(()))` if operation finishes successfully and 221 | /// - `Ok(Err(CompareAndSwapError(current, proposed)))` if operation failed 222 | /// to setup a new value. `CompareAndSwapError` contains current and 223 | /// proposed values. 224 | /// - `Err(Error::Unsupported)` if the database is opened in read-only mode. 225 | /// otherwise. 226 | pub type CompareAndSwapResult = std::io::Result< 227 | std::result::Result, 228 | >; 229 | 230 | type Index = concurrent_map::ConcurrentMap< 231 | InlineArray, 232 | Object, 233 | INDEX_FANOUT, 234 | EBR_LOCAL_GC_BUFFER_SIZE, 235 | >; 236 | 237 | /// Compare and swap error. 238 | #[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)] 239 | pub struct CompareAndSwapError { 240 | /// The current value which caused your CAS to fail. 
241 | pub current: Option, 242 | /// Returned value that was proposed unsuccessfully. 243 | pub proposed: Option, 244 | } 245 | 246 | /// Compare and swap success. 247 | #[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)] 248 | pub struct CompareAndSwapSuccess { 249 | /// The current value which was successfully installed. 250 | pub new_value: Option, 251 | /// Returned value that was previously stored. 252 | pub previous_value: Option, 253 | } 254 | 255 | impl std::fmt::Display for CompareAndSwapError { 256 | fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { 257 | write!(f, "Compare and swap conflict") 258 | } 259 | } 260 | 261 | impl std::error::Error for CompareAndSwapError {} 262 | 263 | #[derive( 264 | Debug, 265 | Clone, 266 | Copy, 267 | serde::Serialize, 268 | serde::Deserialize, 269 | PartialOrd, 270 | Ord, 271 | PartialEq, 272 | Eq, 273 | Hash, 274 | )] 275 | pub struct ObjectId(NonZeroU64); 276 | 277 | impl ObjectId { 278 | fn new(from: u64) -> Option { 279 | NonZeroU64::new(from).map(ObjectId) 280 | } 281 | } 282 | 283 | impl std::ops::Deref for ObjectId { 284 | type Target = u64; 285 | 286 | fn deref(&self) -> &u64 { 287 | let self_ref: &NonZeroU64 = &self.0; 288 | 289 | // NonZeroU64 is repr(transparent) where it wraps a u64 290 | // so it is guaranteed to match the binary layout. This 291 | // makes it safe to cast a reference to one as a reference 292 | // to the other like this. 293 | let self_ptr: *const NonZeroU64 = self_ref as *const _; 294 | let reference: *const u64 = self_ptr as *const u64; 295 | 296 | unsafe { &*reference } 297 | } 298 | } 299 | 300 | impl concurrent_map::Minimum for ObjectId { 301 | const MIN: ObjectId = ObjectId(NonZeroU64::MIN); 302 | } 303 | 304 | #[derive( 305 | Debug, 306 | Clone, 307 | Copy, 308 | serde::Serialize, 309 | serde::Deserialize, 310 | PartialOrd, 311 | Ord, 312 | PartialEq, 313 | Eq, 314 | Hash, 315 | )] 316 | pub struct CollectionId(u64); 317 | 318 | impl concurrent_map::Minimum for CollectionId { 319 | const MIN: CollectionId = CollectionId(u64::MIN); 320 | } 321 | 322 | #[derive(Debug, Clone)] 323 | struct CacheBox { 324 | leaf: Option>>, 325 | #[allow(unused)] 326 | logged_index: BTreeMap, 327 | } 328 | 329 | #[allow(unused)] 330 | #[derive(Debug, Clone)] 331 | struct LogValue { 332 | location: SlabAddress, 333 | value: Option, 334 | } 335 | 336 | #[derive(Debug, Clone)] 337 | pub struct Object { 338 | object_id: ObjectId, 339 | collection_id: CollectionId, 340 | low_key: InlineArray, 341 | inner: Arc>>, 342 | } 343 | 344 | impl PartialEq for Object { 345 | fn eq(&self, other: &Self) -> bool { 346 | self.object_id == other.object_id 347 | } 348 | } 349 | 350 | /// Stored on `Db` and `Tree` in an Arc, so that when the 351 | /// last "high-level" struct is dropped, the flusher thread 352 | /// is cleaned up. 
353 | struct ShutdownDropper { 354 | shutdown_sender: parking_lot::Mutex< 355 | std::sync::mpsc::Sender>, 356 | >, 357 | cache: parking_lot::Mutex>, 358 | } 359 | 360 | impl Drop for ShutdownDropper { 361 | fn drop(&mut self) { 362 | let (tx, rx) = std::sync::mpsc::channel(); 363 | log::debug!("sending shutdown signal to flusher"); 364 | if self.shutdown_sender.lock().send(tx).is_ok() { 365 | if let Err(e) = rx.recv() { 366 | log::error!("failed to shut down flusher thread: {:?}", e); 367 | } else { 368 | log::debug!("flush thread successfully terminated"); 369 | } 370 | } else { 371 | log::debug!( 372 | "failed to shut down flusher, manually flushing ObjectCache" 373 | ); 374 | let cache = self.cache.lock(); 375 | if let Err(e) = cache.flush() { 376 | log::error!( 377 | "Db flusher encountered error while flushing: {:?}", 378 | e 379 | ); 380 | cache.set_error(&e); 381 | } 382 | } 383 | } 384 | } 385 | 386 | fn map_bound U>(bound: Bound, f: F) -> Bound { 387 | match bound { 388 | Bound::Unbounded => Bound::Unbounded, 389 | Bound::Included(x) => Bound::Included(f(x)), 390 | Bound::Excluded(x) => Bound::Excluded(f(x)), 391 | } 392 | } 393 | 394 | const fn _assert_public_types_send_sync() { 395 | use std::fmt::Debug; 396 | 397 | const fn _assert_send() {} 398 | 399 | const fn _assert_send_sync() {} 400 | 401 | /* 402 | _assert_send::(); 403 | _assert_send_sync::(); 404 | _assert_send_sync::(); 405 | _assert_send_sync::(); 406 | */ 407 | 408 | _assert_send::(); 409 | 410 | _assert_send_sync::(); 411 | _assert_send_sync::(); 412 | _assert_send_sync::(); 413 | _assert_send_sync::(); 414 | _assert_send_sync::(); 415 | } 416 | -------------------------------------------------------------------------------- /src/object_location_mapper.rs: -------------------------------------------------------------------------------- 1 | use std::num::NonZeroU64; 2 | use std::sync::Arc; 3 | use std::sync::atomic::{AtomicU64, Ordering}; 4 | 5 | use fnv::FnvHashSet; 6 | use pagetable::PageTable; 7 | 8 | use crate::{ 9 | Allocator, ObjectId, 10 | heap::{N_SLABS, SlabAddress, UpdateMetadata}, 11 | }; 12 | 13 | #[derive(Debug, Default, Copy, Clone)] 14 | pub struct AllocatorStats { 15 | pub objects_allocated: u64, 16 | pub objects_freed: u64, 17 | pub heap_slots_allocated: u64, 18 | pub heap_slots_freed: u64, 19 | } 20 | 21 | #[derive(Default)] 22 | struct SlabTenancy { 23 | slot_to_object_id: PageTable, 24 | slot_allocator: Arc, 25 | } 26 | 27 | impl SlabTenancy { 28 | // returns (ObjectId, slot index) pairs 29 | fn objects_to_defrag( 30 | &self, 31 | target_fill_ratio: f32, 32 | ) -> Vec<(ObjectId, u64)> { 33 | let (frag_min, frag_max) = if let Some(frag) = 34 | self.slot_allocator.fragmentation_cutoff(target_fill_ratio) 35 | { 36 | frag 37 | } else { 38 | return vec![]; 39 | }; 40 | 41 | let mut ret = vec![]; 42 | 43 | for fragmented_slot in frag_min..frag_max { 44 | let object_id_u64 = self 45 | .slot_to_object_id 46 | .get(fragmented_slot) 47 | .load(Ordering::Acquire); 48 | 49 | if let Some(object_id) = ObjectId::new(object_id_u64) { 50 | ret.push((object_id, fragmented_slot)); 51 | } 52 | } 53 | 54 | ret 55 | } 56 | } 57 | 58 | #[derive(Clone)] 59 | pub(crate) struct ObjectLocationMapper { 60 | object_id_to_location: PageTable, 61 | slab_tenancies: Arc<[SlabTenancy; N_SLABS]>, 62 | object_id_allocator: Arc, 63 | target_fill_ratio: f32, 64 | } 65 | 66 | impl ObjectLocationMapper { 67 | pub(crate) fn new( 68 | recovered_metadata: &[UpdateMetadata], 69 | target_fill_ratio: f32, 70 | ) -> ObjectLocationMapper { 71 
| let mut ret = ObjectLocationMapper { 72 | object_id_to_location: PageTable::default(), 73 | slab_tenancies: Arc::new(core::array::from_fn(|_| { 74 | SlabTenancy::default() 75 | })), 76 | object_id_allocator: Arc::default(), 77 | target_fill_ratio, 78 | }; 79 | 80 | let mut object_ids: FnvHashSet = Default::default(); 81 | let mut slots_per_slab: [FnvHashSet; N_SLABS] = 82 | core::array::from_fn(|_| Default::default()); 83 | 84 | for update_metadata in recovered_metadata { 85 | match update_metadata { 86 | UpdateMetadata::Store { 87 | object_id, 88 | collection_id: _, 89 | location, 90 | low_key: _, 91 | } => { 92 | object_ids.insert(**object_id); 93 | let slab_address = SlabAddress::from(*location); 94 | slots_per_slab[slab_address.slab() as usize] 95 | .insert(slab_address.slot()); 96 | ret.insert(*object_id, slab_address); 97 | } 98 | UpdateMetadata::Free { .. } => { 99 | unreachable!() 100 | } 101 | } 102 | } 103 | 104 | ret.object_id_allocator = 105 | Arc::new(Allocator::from_allocated(&object_ids)); 106 | 107 | let slabs = Arc::get_mut(&mut ret.slab_tenancies).unwrap(); 108 | 109 | for i in 0..N_SLABS { 110 | let slab = &mut slabs[i]; 111 | slab.slot_allocator = 112 | Arc::new(Allocator::from_allocated(&slots_per_slab[i])); 113 | } 114 | 115 | ret 116 | } 117 | 118 | pub(crate) fn get_max_allocated_per_slab(&self) -> Vec<(usize, u64)> { 119 | let mut ret = vec![]; 120 | 121 | for (i, slab) in self.slab_tenancies.iter().enumerate() { 122 | if let Some(max_allocated) = slab.slot_allocator.max_allocated() { 123 | ret.push((i, max_allocated)); 124 | } 125 | } 126 | 127 | ret 128 | } 129 | 130 | pub(crate) fn stats(&self) -> AllocatorStats { 131 | let (objects_allocated, objects_freed) = 132 | self.object_id_allocator.counters(); 133 | 134 | let mut heap_slots_allocated = 0; 135 | let mut heap_slots_freed = 0; 136 | 137 | for slab_id in 0..N_SLABS { 138 | let (allocated, freed) = 139 | self.slab_tenancies[slab_id].slot_allocator.counters(); 140 | heap_slots_allocated += allocated; 141 | heap_slots_freed += freed; 142 | } 143 | 144 | AllocatorStats { 145 | objects_allocated, 146 | objects_freed, 147 | heap_slots_allocated, 148 | heap_slots_freed, 149 | } 150 | } 151 | 152 | pub(crate) fn clone_object_id_allocator_arc(&self) -> Arc { 153 | self.object_id_allocator.clone() 154 | } 155 | 156 | pub(crate) fn allocate_object_id(&self) -> ObjectId { 157 | // object IDs wrap a NonZeroU64, so if we get 0, just re-allocate and leak the id 158 | 159 | let mut object_id = self.object_id_allocator.allocate(); 160 | if object_id == 0 { 161 | object_id = self.object_id_allocator.allocate(); 162 | assert_ne!(object_id, 0); 163 | } 164 | ObjectId::new(object_id).unwrap() 165 | } 166 | 167 | pub(crate) fn clone_slab_allocator_arc( 168 | &self, 169 | slab_id: u8, 170 | ) -> Arc { 171 | self.slab_tenancies[usize::from(slab_id)].slot_allocator.clone() 172 | } 173 | 174 | pub(crate) fn allocate_slab_slot(&self, slab_id: u8) -> SlabAddress { 175 | let slot = 176 | self.slab_tenancies[usize::from(slab_id)].slot_allocator.allocate(); 177 | SlabAddress::from_slab_slot(slab_id, slot) 178 | } 179 | 180 | pub(crate) fn free_slab_slot(&self, slab_address: SlabAddress) { 181 | self.slab_tenancies[usize::from(slab_address.slab())] 182 | .slot_allocator 183 | .free(slab_address.slot()) 184 | } 185 | 186 | pub(crate) fn get_location_for_object( 187 | &self, 188 | object_id: ObjectId, 189 | ) -> Option { 190 | let location_u64 = 191 | self.object_id_to_location.get(*object_id).load(Ordering::Acquire); 192 | 193 | let nzu 
= NonZeroU64::new(location_u64)?; 194 | 195 | Some(SlabAddress::from(nzu)) 196 | } 197 | 198 | /// Returns the previous address for this object, if it is vacating one. 199 | /// 200 | /// # Panics 201 | /// 202 | /// Asserts that the new location is actually unoccupied. This is a major 203 | /// correctness violation if that isn't true. 204 | pub(crate) fn insert( 205 | &self, 206 | object_id: ObjectId, 207 | new_location: SlabAddress, 208 | ) -> Option { 209 | // insert into object_id_to_location 210 | let location_nzu: NonZeroU64 = new_location.into(); 211 | let location_u64 = location_nzu.get(); 212 | 213 | let last_u64 = self 214 | .object_id_to_location 215 | .get(*object_id) 216 | .swap(location_u64, Ordering::Release); 217 | 218 | let last_address_opt = if let Some(nzu) = NonZeroU64::new(last_u64) { 219 | let last_address = SlabAddress::from(nzu); 220 | Some(last_address) 221 | } else { 222 | None 223 | }; 224 | 225 | // insert into slab_tenancies 226 | let slab = new_location.slab(); 227 | let slot = new_location.slot(); 228 | 229 | let _last_oid_at_location = self.slab_tenancies[usize::from(slab)] 230 | .slot_to_object_id 231 | .get(slot) 232 | .swap(*object_id, Ordering::Release); 233 | 234 | // TODO add debug event verifier here assert_eq!(0, last_oid_at_location); 235 | 236 | last_address_opt 237 | } 238 | 239 | /// Unmaps an object and returns its location. 240 | /// 241 | /// # Panics 242 | /// 243 | /// Asserts that the object was actually stored in a location. 244 | pub(crate) fn remove(&self, object_id: ObjectId) -> Option { 245 | let last_u64 = self 246 | .object_id_to_location 247 | .get(*object_id) 248 | .swap(0, Ordering::Release); 249 | 250 | if let Some(nzu) = NonZeroU64::new(last_u64) { 251 | let last_address = SlabAddress::from(nzu); 252 | 253 | let slab = last_address.slab(); 254 | let slot = last_address.slot(); 255 | 256 | let last_oid_at_location = self.slab_tenancies[usize::from(slab)] 257 | .slot_to_object_id 258 | .get(slot) 259 | .swap(0, Ordering::Release); 260 | 261 | assert_eq!(*object_id, last_oid_at_location); 262 | 263 | Some(last_address) 264 | } else { 265 | None 266 | } 267 | } 268 | 269 | pub(crate) fn objects_to_defrag(&self) -> FnvHashSet { 270 | let mut ret = FnvHashSet::default(); 271 | 272 | for slab_id in 0..N_SLABS { 273 | let slab = &self.slab_tenancies[slab_id]; 274 | 275 | for (object_id, slot) in 276 | slab.objects_to_defrag(self.target_fill_ratio) 277 | { 278 | let sa = SlabAddress::from_slab_slot( 279 | u8::try_from(slab_id).unwrap(), 280 | slot, 281 | ); 282 | 283 | let rt_sa = if let Some(rt_raw_sa) = NonZeroU64::new( 284 | self.object_id_to_location 285 | .get(*object_id) 286 | .load(Ordering::Acquire), 287 | ) { 288 | SlabAddress::from(rt_raw_sa) 289 | } else { 290 | // object has been removed but its slot has not yet been freed, 291 | // hopefully due to a deferred write 292 | // TODO test that with a testing event log 293 | continue; 294 | }; 295 | 296 | if sa == rt_sa { 297 | let newly_inserted = ret.insert(object_id); 298 | assert!( 299 | newly_inserted, 300 | "{object_id:?} present multiple times across slab objects_to_defrag" 301 | ); 302 | } 303 | } 304 | } 305 | 306 | ret 307 | } 308 | } 309 | -------------------------------------------------------------------------------- /tests/common/mod.rs: -------------------------------------------------------------------------------- 1 | // the memshred feature causes all allocated and deallocated 2 | // memory to be set to a specific non-zero value of 0xa1 for 3 | // uninitialized 
allocations and 0xde for deallocated memory, 4 | // in the hope that it will cause memory errors to surface 5 | // more quickly. 6 | #[cfg(feature = "testing-shred-allocator")] 7 | mod alloc { 8 | use std::alloc::{Layout, System}; 9 | 10 | #[global_allocator] 11 | static ALLOCATOR: Alloc = Alloc; 12 | 13 | #[derive(Default, Debug, Clone, Copy)] 14 | struct Alloc; 15 | 16 | unsafe impl std::alloc::GlobalAlloc for Alloc { 17 | unsafe fn alloc(&self, layout: Layout) -> *mut u8 { 18 | let ret = System.alloc(layout); 19 | assert_ne!(ret, std::ptr::null_mut()); 20 | std::ptr::write_bytes(ret, 0xa1, layout.size()); 21 | ret 22 | } 23 | 24 | unsafe fn dealloc(&self, ptr: *mut u8, layout: Layout) { 25 | std::ptr::write_bytes(ptr, 0xde, layout.size()); 26 | System.dealloc(ptr, layout) 27 | } 28 | } 29 | } 30 | 31 | pub fn setup_logger() { 32 | use std::io::Write; 33 | 34 | fn tn() -> String { 35 | std::thread::current().name().unwrap_or("unknown").to_owned() 36 | } 37 | 38 | let mut builder = env_logger::Builder::new(); 39 | builder 40 | .format(|buf, record| { 41 | writeln!( 42 | buf, 43 | "{:05} {:20} {:10} {}", 44 | record.level(), 45 | tn(), 46 | record.module_path().unwrap().split("::").last().unwrap(), 47 | record.args() 48 | ) 49 | }) 50 | .filter(None, log::LevelFilter::Info); 51 | 52 | if let Ok(env) = std::env::var("RUST_LOG") { 53 | builder.parse_filters(&env); 54 | } 55 | 56 | let _r = builder.try_init(); 57 | } 58 | 59 | #[allow(dead_code)] 60 | pub fn cleanup(dir: &str) { 61 | let dir = std::path::Path::new(dir); 62 | if dir.exists() { 63 | std::fs::remove_dir_all(dir).unwrap(); 64 | } 65 | } 66 | -------------------------------------------------------------------------------- /tests/concurrent_batch_atomicity.rs: -------------------------------------------------------------------------------- 1 | use std::sync::{Arc, Barrier}; 2 | use std::thread; 3 | 4 | use sled::{Config, Db as SledDb}; 5 | 6 | const CONCURRENCY: usize = 32; 7 | const N_KEYS: usize = 1024; 8 | 9 | type Db = SledDb<8>; 10 | 11 | fn batch_writer(db: Db, barrier: Arc, thread_number: usize) { 12 | barrier.wait(); 13 | let mut batch = sled::Batch::default(); 14 | for key_number in 0_u128..N_KEYS as _ { 15 | // LE is intentionally a little scrambled 16 | batch.insert(&key_number.to_le_bytes(), &thread_number.to_le_bytes()); 17 | } 18 | 19 | db.apply_batch(batch).unwrap(); 20 | } 21 | 22 | #[test] 23 | fn concurrent_batch_atomicity() { 24 | let db: Db = Config { 25 | path: "concurrent_batch_atomicity".into(), 26 | ..Default::default() 27 | } 28 | .open() 29 | .unwrap(); 30 | 31 | let mut threads = vec![]; 32 | 33 | let flusher_barrier = Arc::new(Barrier::new(CONCURRENCY)); 34 | for tn in 0..CONCURRENCY { 35 | let db = db.clone(); 36 | let barrier = flusher_barrier.clone(); 37 | let thread = thread::Builder::new() 38 | .name(format!("t(thread: {} flusher)", tn)) 39 | .spawn(move || { 40 | db.flush().unwrap(); 41 | barrier.wait(); 42 | }) 43 | .expect("should be able to spawn thread"); 44 | threads.push(thread); 45 | } 46 | 47 | let barrier = Arc::new(Barrier::new(CONCURRENCY + 1)); 48 | for thread_number in 0..CONCURRENCY { 49 | let db = db.clone(); 50 | let barrier = barrier.clone(); 51 | let jh = 52 | thread::spawn(move || batch_writer(db, barrier, thread_number)); 53 | threads.push(jh); 54 | } 55 | 56 | barrier.wait(); 57 | let before = std::time::Instant::now(); 58 | 59 | for thread in threads.into_iter() { 60 | thread.join().unwrap(); 61 | } 62 | 63 | println!("writers took {:?}", before.elapsed()); 64 | 65 | let mut 
expected_v = None; 66 | 67 | for key_number in 0_u128..N_KEYS as _ { 68 | let actual_v = db.get(&key_number.to_le_bytes()).unwrap().unwrap(); 69 | if expected_v.is_none() { 70 | expected_v = Some(actual_v.clone()); 71 | } 72 | assert_eq!(Some(actual_v), expected_v); 73 | } 74 | 75 | let _ = std::fs::remove_dir_all("concurrent_batch_atomicity"); 76 | } 77 | -------------------------------------------------------------------------------- /tests/crash_tests/crash_batches.rs: -------------------------------------------------------------------------------- 1 | use std::thread; 2 | 3 | use rand::Rng; 4 | 5 | use super::*; 6 | 7 | const CACHE_SIZE: usize = 1024 * 1024; 8 | const BATCH_SIZE: u32 = 8; 9 | const SEGMENT_SIZE: usize = 1024; 10 | 11 | /// Verifies that the keys in the tree are correctly recovered (i.e., equal). 12 | /// Panics if they are incorrect. 13 | fn verify_batches(tree: &Db) -> u32 { 14 | let mut iter = tree.iter(); 15 | let first_value = match iter.next() { 16 | Some(Ok((_k, v))) => slice_to_u32(&*v), 17 | Some(Err(e)) => panic!("{:?}", e), 18 | None => return 0, 19 | }; 20 | 21 | // we now expect all items in the batch to be present and to have the same value 22 | 23 | for key in 0..BATCH_SIZE { 24 | let res = tree.get(u32_to_vec(key)); 25 | let option = res.unwrap(); 26 | let v = match option { 27 | Some(v) => v, 28 | None => panic!( 29 | "expected key {} to have a value, instead it was missing in db with keys: {}", 30 | key, 31 | tree_to_string(&tree) 32 | ), 33 | }; 34 | let value = slice_to_u32(&*v); 35 | // FIXME BUG 1 count 2 36 | // assertion `left == right` failed: expected key 0 to have value 62003, instead it had value 62375 in db with keys: 37 | // {0:62003, 1:62003, 2:62003, 3:62003, 4:62003, 5:62003, 6:62003, 7:62003, 38 | // Human: iterating shows correct value, but first get did not 39 | // 40 | // expected key 1 to have value 1, instead it had value 29469 in db with keys: 41 | // {0:1, 1:29469, 2:29469, 3:29469, 4:29469, 5:29469, 6:29469, 7:29469, 42 | // Human: 0 didn't get included in later syncs 43 | // 44 | // expected key 0 to have value 59485, instead it had value 59484 in db with keys: 45 | // {0:59485, 1:59485, 2:59485, 3:59485, 4:59485, 5:59485, 6:59485, 7:59485, 46 | // Human: had key N during first check, then N + 1 in iteration 47 | assert_eq!( 48 | first_value, 49 | value, 50 | "expected key {} to have value {}, instead it had value {}. second get: {:?}. db iter: {}. 
third get: {:?}", 51 | key, 52 | first_value, 53 | value, 54 | slice_to_u32(&*tree.get(u32_to_vec(key)).unwrap().unwrap()), 55 | tree_to_string(&tree), 56 | slice_to_u32(&*tree.get(u32_to_vec(key)).unwrap().unwrap()), 57 | ); 58 | } 59 | 60 | first_value 61 | } 62 | 63 | fn run_batches_inner(db: Db) { 64 | fn do_batch(i: u32, db: &Db) { 65 | let mut rng = rand::rng(); 66 | let base_value = u32_to_vec(i); 67 | 68 | let mut batch = sled::Batch::default(); 69 | if rng.random_bool(0.1) { 70 | for key in 0..BATCH_SIZE { 71 | batch.remove(u32_to_vec(key)); 72 | } 73 | } else { 74 | for key in 0..BATCH_SIZE { 75 | let mut value = base_value.clone(); 76 | let additional_len = rng.random_range(0..SEGMENT_SIZE / 3); 77 | value.append(&mut vec![0u8; additional_len]); 78 | 79 | batch.insert(u32_to_vec(key), value); 80 | } 81 | } 82 | db.apply_batch(batch).unwrap(); 83 | } 84 | 85 | let mut i = verify_batches(&db); 86 | i += 1; 87 | do_batch(i, &db); 88 | 89 | loop { 90 | i += 1; 91 | do_batch(i, &db); 92 | } 93 | } 94 | 95 | pub fn run_crash_batches() { 96 | let crash_during_initialization = rand::rng().random_ratio(1, 10); 97 | 98 | if crash_during_initialization { 99 | spawn_killah(); 100 | } 101 | 102 | let path = std::path::Path::new(CRASH_DIR).join(BATCHES_DIR); 103 | let config = Config::new() 104 | .cache_capacity_bytes(CACHE_SIZE) 105 | .flush_every_ms(Some(1)) 106 | .path(path); 107 | 108 | let db = config.open().expect("couldn't open batch db"); 109 | let db2 = db.clone(); 110 | 111 | let t1 = thread::spawn(|| run_batches_inner(db)); 112 | let t2 = thread::spawn(move || { 113 | loop { 114 | db2.flush().unwrap(); 115 | } 116 | }); // run_batches_inner(db2)); 117 | 118 | if !crash_during_initialization { 119 | spawn_killah(); 120 | } 121 | 122 | let Err(e) = t1.join().and_then(|_| t2.join()); 123 | 124 | println!("worker thread failed: {:?}", e); 125 | std::process::exit(15); 126 | } 127 | -------------------------------------------------------------------------------- /tests/crash_tests/crash_heap.rs: -------------------------------------------------------------------------------- 1 | use super::*; 2 | 3 | const FANOUT: usize = 3; 4 | 5 | pub fn run_crash_heap() { 6 | let path = std::path::Path::new(CRASH_DIR).join(HEAP_DIR); 7 | let config = Config::new().path(path); 8 | 9 | let HeapRecovery { heap, recovered_nodes, was_recovered } = 10 | Heap::recover(FANOUT, &config).unwrap(); 11 | 12 | // validate 13 | 14 | spawn_killah(); 15 | 16 | loop {} 17 | } 18 | -------------------------------------------------------------------------------- /tests/crash_tests/crash_iter.rs: -------------------------------------------------------------------------------- 1 | use std::sync::{Arc, Barrier}; 2 | use std::thread; 3 | 4 | use super::*; 5 | 6 | const CACHE_SIZE: usize = 256; 7 | 8 | pub fn run_crash_iter() { 9 | const N_FORWARD: usize = 50; 10 | const N_REVERSE: usize = 50; 11 | 12 | let path = std::path::Path::new(CRASH_DIR).join(ITER_DIR); 13 | let config = Config::new() 14 | .cache_capacity_bytes(CACHE_SIZE) 15 | .path(path) 16 | .flush_every_ms(Some(1)); 17 | 18 | let db: Db = config.open().expect("couldn't open iter db"); 19 | let t = db.open_tree(b"crash_iter_test").unwrap(); 20 | 21 | thread::Builder::new() 22 | .name("crash_iter_flusher".to_string()) 23 | .spawn({ 24 | let t = t.clone(); 25 | move || loop { 26 | t.flush().unwrap(); 27 | } 28 | }) 29 | .unwrap(); 30 | 31 | const INDELIBLE: [&[u8]; 16] = [ 32 | &[0u8], 33 | &[1u8], 34 | &[2u8], 35 | &[3u8], 36 | &[4u8], 37 | &[5u8], 38 | &[6u8], 39 
| &[7u8], 40 | &[8u8], 41 | &[9u8], 42 | &[10u8], 43 | &[11u8], 44 | &[12u8], 45 | &[13u8], 46 | &[14u8], 47 | &[15u8], 48 | ]; 49 | 50 | for item in &INDELIBLE { 51 | t.insert(*item, *item).unwrap(); 52 | } 53 | t.flush().unwrap(); 54 | 55 | let barrier = Arc::new(Barrier::new(N_FORWARD + N_REVERSE + 2)); 56 | let mut threads = vec![]; 57 | 58 | for i in 0..N_FORWARD { 59 | let t = thread::Builder::new() 60 | .name(format!("forward({})", i)) 61 | .spawn({ 62 | let t = t.clone(); 63 | let barrier = barrier.clone(); 64 | move || { 65 | barrier.wait(); 66 | loop { 67 | let expected = INDELIBLE.iter(); 68 | let mut keys = t.iter().keys(); 69 | 70 | for expect in expected { 71 | loop { 72 | let k = keys.next().unwrap().unwrap(); 73 | assert!( 74 | &*k <= *expect, 75 | "witnessed key is {:?} but we expected \ 76 | one <= {:?}, so we overshot due to a \ 77 | concurrent modification", 78 | k, 79 | expect, 80 | ); 81 | if &*k == *expect { 82 | break; 83 | } 84 | } 85 | } 86 | } 87 | } 88 | }) 89 | .unwrap(); 90 | threads.push(t); 91 | } 92 | 93 | for i in 0..N_REVERSE { 94 | let t = thread::Builder::new() 95 | .name(format!("reverse({})", i)) 96 | .spawn({ 97 | let t = t.clone(); 98 | let barrier = barrier.clone(); 99 | move || { 100 | barrier.wait(); 101 | loop { 102 | let expected = INDELIBLE.iter().rev(); 103 | let mut keys = t.iter().keys().rev(); 104 | 105 | for expect in expected { 106 | loop { 107 | if let Some(Ok(k)) = keys.next() { 108 | assert!( 109 | &*k >= *expect, 110 | "witnessed key is {:?} but we expected \ 111 | one >= {:?}, so we overshot due to a \ 112 | concurrent modification\n{:?}", 113 | k, 114 | expect, 115 | t, 116 | ); 117 | if &*k == *expect { 118 | break; 119 | } 120 | } else { 121 | panic!("undershot key on tree: \n{:?}", t); 122 | } 123 | } 124 | } 125 | } 126 | } 127 | }) 128 | .unwrap(); 129 | 130 | threads.push(t); 131 | } 132 | 133 | let inserter = thread::Builder::new() 134 | .name("inserter".into()) 135 | .spawn({ 136 | let t = t.clone(); 137 | let barrier = barrier.clone(); 138 | move || { 139 | barrier.wait(); 140 | 141 | loop { 142 | for i in 0..(16 * 16 * 8) { 143 | let major = i / (16 * 8); 144 | let minor = i % 16; 145 | 146 | let mut base = INDELIBLE[major].to_vec(); 147 | base.push(minor as u8); 148 | t.insert(base.clone(), base.clone()).unwrap(); 149 | } 150 | } 151 | } 152 | }) 153 | .unwrap(); 154 | 155 | threads.push(inserter); 156 | 157 | let deleter = thread::Builder::new() 158 | .name("deleter".into()) 159 | .spawn({ 160 | move || { 161 | barrier.wait(); 162 | 163 | loop { 164 | for i in 0..(16 * 16 * 8) { 165 | let major = i / (16 * 8); 166 | let minor = i % 16; 167 | 168 | let mut base = INDELIBLE[major].to_vec(); 169 | base.push(minor as u8); 170 | t.remove(&base).unwrap(); 171 | } 172 | } 173 | } 174 | }) 175 | .unwrap(); 176 | 177 | spawn_killah(); 178 | 179 | threads.push(deleter); 180 | 181 | for thread in threads.into_iter() { 182 | thread.join().expect("thread should not have crashed"); 183 | } 184 | } 185 | -------------------------------------------------------------------------------- /tests/crash_tests/crash_metadata_store.rs: -------------------------------------------------------------------------------- 1 | use super::*; 2 | 3 | pub fn run_crash_metadata_store() { 4 | let (metadata_store, recovered) = 5 | MetadataStore::recover(&HEAP_DIR).unwrap(); 6 | 7 | // validate 8 | 9 | spawn_killah(); 10 | 11 | loop {} 12 | } 13 | -------------------------------------------------------------------------------- 
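The crash_iter test above relies on a key-space layout argument: every key the inserter and deleter churn through is a two-byte extension [major, minor] of one of the single-byte INDELIBLE keys, and under lexicographic byte ordering such an extension always sorts strictly between that INDELIBLE key and the next one, so an in-order scan must still witness every INDELIBLE key. A minimal standalone sketch of that ordering argument follows (not part of the test suite; the main function and loop bounds are illustrative only, and it only covers INDELIBLE keys that have a successor):

// Standalone sketch: the ordering invariant the forward and reverse iterator
// threads in crash_iter.rs depend on. Under lexicographic byte ordering,
// [major] < [major, minor] < [major + 1], so concurrently inserted or deleted
// two-byte keys can never hide an INDELIBLE key from an in-order scan.
fn main() {
    for major in 0u8..15 {
        for minor in 0u8..16 {
            let indelible: &[u8] = &[major];
            let inserted: &[u8] = &[major, minor];
            let next_indelible: &[u8] = &[major + 1];
            // a key is always less than any of its proper extensions
            assert!(indelible < inserted);
            // and the extension stays below the next single-byte key
            assert!(inserted < next_indelible);
        }
    }
    println!("ordering invariant holds for all generated keys");
}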
/tests/crash_tests/crash_object_cache.rs: -------------------------------------------------------------------------------- 1 | use super::*; 2 | 3 | const FANOUT: usize = 3; 4 | 5 | pub fn run_crash_object_cache() { 6 | let path = std::path::Path::new(CRASH_DIR).join(OBJECT_CACHE_DIR); 7 | let config = Config::new().flush_every_ms(Some(1)).path(path); 8 | 9 | let (oc, collections, was_recovered): (ObjectCache, _, bool) = 10 | ObjectCache::recover(&config).unwrap(); 11 | 12 | // validate 13 | 14 | spawn_killah(); 15 | 16 | loop {} 17 | } 18 | -------------------------------------------------------------------------------- /tests/crash_tests/crash_sequential_writes.rs: -------------------------------------------------------------------------------- 1 | use std::thread; 2 | 3 | use super::*; 4 | 5 | const CACHE_SIZE: usize = 1024 * 1024; 6 | const CYCLE: usize = 256; 7 | const SEGMENT_SIZE: usize = 1024; 8 | 9 | /// Verifies that the keys in the tree are correctly recovered. 10 | /// Panics if they are incorrect. 11 | /// Returns the key that should be resumed at, and the current cycle value. 12 | fn verify(tree: &Db) -> (u32, u32) { 13 | // key 0 should always be the highest value, as that's where we increment 14 | // at some point, it might go down by one 15 | // it should never return, or go down again after that 16 | let mut iter = tree.iter(); 17 | let highest = match iter.next() { 18 | Some(Ok((_k, v))) => slice_to_u32(&*v), 19 | Some(Err(e)) => panic!("{:?}", e), 20 | None => return (0, 0), 21 | }; 22 | 23 | let highest_vec = u32_to_vec(highest); 24 | 25 | // find how far we got 26 | let mut contiguous: u32 = 0; 27 | let mut lowest_with_high_value = 0; 28 | 29 | for res in iter { 30 | let (k, v) = res.unwrap(); 31 | if v[..4] == highest_vec[..4] { 32 | contiguous += 1; 33 | } else { 34 | let expected = if highest == 0 { 35 | CYCLE as u32 - 1 36 | } else { 37 | (highest - 1) % CYCLE as u32 38 | }; 39 | let actual = slice_to_u32(&*v); 40 | // FIXME BUG 2 41 | // thread '' panicked at tests/test_crash_recovery.rs:159:13: 42 | // assertion `left == right` failed 43 | // left: 139 44 | // right: 136 45 | assert_eq!( 46 | expected, 47 | actual, 48 | "tree failed assertion with iterated values: {}, k: {:?} v: {:?} expected: {} highest: {}", 49 | tree_to_string(&tree), 50 | k, 51 | v, 52 | expected, 53 | highest 54 | ); 55 | lowest_with_high_value = actual; 56 | break; 57 | } 58 | } 59 | 60 | // ensure nothing changes after this point 61 | let low_beginning = u32_to_vec(contiguous + 1); 62 | 63 | for res in tree.range(&*low_beginning..) 
{ 64 | let (k, v): (sled::InlineArray, _) = res.unwrap(); 65 | assert_eq!( 66 | slice_to_u32(&*v), 67 | lowest_with_high_value, 68 | "expected key {} to have value {}, instead it had value {} in db: {:?}", 69 | slice_to_u32(&*k), 70 | lowest_with_high_value, 71 | slice_to_u32(&*v), 72 | tree 73 | ); 74 | } 75 | 76 | (contiguous, highest) 77 | } 78 | 79 | fn run_inner(config: Config) { 80 | let crash_during_initialization = rand::rng().random_bool(0.1); 81 | 82 | if crash_during_initialization { 83 | spawn_killah(); 84 | } 85 | 86 | let tree = config.open().expect("couldn't open db"); 87 | 88 | if !crash_during_initialization { 89 | spawn_killah(); 90 | } 91 | 92 | let (key, highest) = verify(&tree); 93 | 94 | let mut hu = ((highest as usize) * CYCLE) + key as usize; 95 | assert_eq!(hu % CYCLE, key as usize); 96 | assert_eq!(hu / CYCLE, highest as usize); 97 | 98 | loop { 99 | let key = u32_to_vec((hu % CYCLE) as u32); 100 | 101 | //dbg!(hu, hu % CYCLE); 102 | 103 | let mut value = u32_to_vec((hu / CYCLE) as u32); 104 | let additional_len = rand::rng().random_range(0..SEGMENT_SIZE / 3); 105 | value.append(&mut vec![0u8; additional_len]); 106 | 107 | tree.insert(&key, value).unwrap(); 108 | 109 | hu += 1; 110 | 111 | if hu / CYCLE >= CYCLE { 112 | hu = 0; 113 | } 114 | } 115 | } 116 | 117 | pub fn run_crash_sequential_writes() { 118 | let path = std::path::Path::new(CRASH_DIR).join(SEQUENTIAL_WRITES_DIR); 119 | let config = Config::new() 120 | .cache_capacity_bytes(CACHE_SIZE) 121 | .flush_every_ms(Some(1)) 122 | .path(path); 123 | 124 | if let Err(e) = thread::spawn(|| run_inner(config)).join() { 125 | println!("worker thread failed: {:?}", e); 126 | std::process::exit(15); 127 | } 128 | } 129 | -------------------------------------------------------------------------------- /tests/crash_tests/crash_tx.rs: -------------------------------------------------------------------------------- 1 | use super::*; 2 | 3 | const CACHE_SIZE: usize = 1024 * 1024; 4 | 5 | pub fn run_crash_tx() { 6 | let config = Config::new() 7 | .cache_capacity_bytes(CACHE_SIZE) 8 | .flush_every_ms(Some(1)) 9 | .path(TX_DIR); 10 | 11 | let _db: Db = config.open().unwrap(); 12 | 13 | spawn_killah(); 14 | 15 | loop {} 16 | 17 | /* 18 | db.insert(b"k1", b"cats").unwrap(); 19 | db.insert(b"k2", b"dogs").unwrap(); 20 | db.insert(b"id", &0_u64.to_le_bytes()).unwrap(); 21 | 22 | let mut threads = vec![]; 23 | 24 | const N_WRITERS: usize = 50; 25 | const N_READERS: usize = 5; 26 | 27 | let barrier = Arc::new(Barrier::new(N_WRITERS + N_READERS)); 28 | 29 | for _ in 0..N_WRITERS { 30 | let db = db.clone(); 31 | let barrier = barrier.clone(); 32 | let thread = std::thread::spawn(move || { 33 | barrier.wait(); 34 | loop { 35 | db.transaction::<_, _, ()>(|db| { 36 | let v1 = db.remove(b"k1").unwrap().unwrap(); 37 | let v2 = db.remove(b"k2").unwrap().unwrap(); 38 | 39 | db.insert(b"id", &db.generate_id().unwrap().to_le_bytes()) 40 | .unwrap(); 41 | 42 | db.insert(b"k1", v2).unwrap(); 43 | db.insert(b"k2", v1).unwrap(); 44 | Ok(()) 45 | }) 46 | .unwrap(); 47 | } 48 | }); 49 | threads.push(thread); 50 | } 51 | 52 | for _ in 0..N_READERS { 53 | let db = db.clone(); 54 | let barrier = barrier.clone(); 55 | let thread = std::thread::spawn(move || { 56 | barrier.wait(); 57 | let mut last_id = 0; 58 | loop { 59 | let read_id = db 60 | .transaction::<_, _, ()>(|db| { 61 | let v1 = db.get(b"k1").unwrap().unwrap(); 62 | let v2 = db.get(b"k2").unwrap().unwrap(); 63 | let id = u64::from_le_bytes( 64 | TryFrom::try_from( 65 | 
&*db.get(b"id").unwrap().unwrap(), 66 | ) 67 | .unwrap(), 68 | ); 69 | 70 | let mut results = vec![v1, v2]; 71 | results.sort(); 72 | 73 | assert_eq!( 74 | [&results[0], &results[1]], 75 | [b"cats", b"dogs"] 76 | ); 77 | 78 | Ok(id) 79 | }) 80 | .unwrap(); 81 | assert!(read_id >= last_id); 82 | last_id = read_id; 83 | } 84 | }); 85 | threads.push(thread); 86 | } 87 | 88 | spawn_killah(); 89 | 90 | for thread in threads.into_iter() { 91 | thread.join().expect("threads should not crash"); 92 | } 93 | 94 | let v1 = db.get(b"k1").unwrap().unwrap(); 95 | let v2 = db.get(b"k2").unwrap().unwrap(); 96 | assert_eq!([v1, v2], [b"cats", b"dogs"]); 97 | */ 98 | } 99 | -------------------------------------------------------------------------------- /tests/crash_tests/mod.rs: -------------------------------------------------------------------------------- 1 | use std::mem::size_of; 2 | use std::process::exit; 3 | use std::thread; 4 | use std::time::Duration; 5 | 6 | use rand::Rng; 7 | 8 | use sled::{ 9 | Config, Db as SledDb, Heap, HeapRecovery, MetadataStore, ObjectCache, 10 | }; 11 | 12 | mod crash_batches; 13 | mod crash_heap; 14 | mod crash_iter; 15 | mod crash_metadata_store; 16 | mod crash_object_cache; 17 | mod crash_sequential_writes; 18 | mod crash_tx; 19 | 20 | pub use crash_batches::run_crash_batches; 21 | pub use crash_heap::run_crash_heap; 22 | pub use crash_iter::run_crash_iter; 23 | pub use crash_metadata_store::run_crash_metadata_store; 24 | pub use crash_object_cache::run_crash_object_cache; 25 | pub use crash_sequential_writes::run_crash_sequential_writes; 26 | pub use crash_tx::run_crash_tx; 27 | 28 | type Db = SledDb<8>; 29 | 30 | // test names, also used as dir names 31 | pub const SEQUENTIAL_WRITES_DIR: &str = "crash_sequential_writes"; 32 | pub const BATCHES_DIR: &str = "crash_batches"; 33 | pub const ITER_DIR: &str = "crash_iter"; 34 | pub const TX_DIR: &str = "crash_tx"; 35 | pub const METADATA_STORE_DIR: &str = "crash_metadata_store"; 36 | pub const HEAP_DIR: &str = "crash_heap"; 37 | pub const OBJECT_CACHE_DIR: &str = "crash_object_cache"; 38 | 39 | const CRASH_DIR: &str = "crash_test_files"; 40 | 41 | fn spawn_killah() { 42 | thread::spawn(|| { 43 | let runtime = rand::rng().random_range(0..60_000); 44 | thread::sleep(Duration::from_micros(runtime)); 45 | exit(9); 46 | }); 47 | } 48 | 49 | fn u32_to_vec(u: u32) -> Vec { 50 | let buf: [u8; size_of::()] = u.to_be_bytes(); 51 | buf.to_vec() 52 | } 53 | 54 | fn slice_to_u32(b: &[u8]) -> u32 { 55 | let mut buf = [0u8; size_of::()]; 56 | buf.copy_from_slice(&b[..size_of::()]); 57 | 58 | u32::from_be_bytes(buf) 59 | } 60 | 61 | fn tree_to_string(tree: &Db) -> String { 62 | let mut ret = String::from("{"); 63 | for kv_res in tree.iter() { 64 | let (k, v) = kv_res.unwrap(); 65 | let k_s = slice_to_u32(&k); 66 | let v_s = slice_to_u32(&v); 67 | ret.push_str(&format!("{}:{}, ", k_s, v_s)); 68 | } 69 | ret.push_str("}"); 70 | ret 71 | } 72 | -------------------------------------------------------------------------------- /tests/test_crash_recovery.rs: -------------------------------------------------------------------------------- 1 | mod common; 2 | mod crash_tests; 3 | 4 | use std::alloc::{Layout, System}; 5 | use std::env::{self, VarError}; 6 | use std::process::Command; 7 | use std::thread; 8 | 9 | use common::cleanup; 10 | 11 | const TEST_ENV_VAR: &str = "SLED_CRASH_TEST"; 12 | const N_TESTS: usize = 100; 13 | 14 | const TESTS: [&str; 7] = [ 15 | crash_tests::SEQUENTIAL_WRITES_DIR, 16 | crash_tests::BATCHES_DIR, 17 | 
crash_tests::ITER_DIR, 18 | crash_tests::TX_DIR, 19 | crash_tests::METADATA_STORE_DIR, 20 | crash_tests::HEAP_DIR, 21 | crash_tests::OBJECT_CACHE_DIR, 22 | ]; 23 | 24 | const CRASH_CHANCE: u32 = 250; 25 | 26 | #[global_allocator] 27 | static ALLOCATOR: ShredAllocator = ShredAllocator; 28 | 29 | #[derive(Default, Debug, Clone, Copy)] 30 | struct ShredAllocator; 31 | 32 | unsafe impl std::alloc::GlobalAlloc for ShredAllocator { 33 | unsafe fn alloc(&self, layout: Layout) -> *mut u8 { 34 | assert!(layout.size() < 1_000_000_000); 35 | let ret = unsafe { System.alloc(layout) }; 36 | assert_ne!(ret, std::ptr::null_mut()); 37 | unsafe { 38 | std::ptr::write_bytes(ret, 0xa1, layout.size()); 39 | } 40 | ret 41 | } 42 | 43 | unsafe fn dealloc(&self, ptr: *mut u8, layout: Layout) { 44 | unsafe { 45 | std::ptr::write_bytes(ptr, 0xde, layout.size()); 46 | } 47 | unsafe { System.dealloc(ptr, layout) } 48 | } 49 | } 50 | 51 | fn main() { 52 | // Don't actually run this harness=false test under miri, as it requires 53 | // spawning and killing child processes. 54 | if cfg!(miri) { 55 | return; 56 | } 57 | 58 | common::setup_logger(); 59 | 60 | match env::var(TEST_ENV_VAR) { 61 | Err(VarError::NotPresent) => { 62 | let filtered: Vec<&'static str> = 63 | if let Some(filter) = std::env::args().nth(1) { 64 | TESTS 65 | .iter() 66 | .filter(|name| name.contains(&filter)) 67 | .cloned() 68 | .collect() 69 | } else { 70 | TESTS.to_vec() 71 | }; 72 | 73 | let filtered_len = filtered.len(); 74 | 75 | println!(); 76 | println!( 77 | "running {} test{}", 78 | filtered.len(), 79 | if filtered.len() == 1 { "" } else { "s" }, 80 | ); 81 | 82 | let mut tests = vec![]; 83 | for test_name in filtered.into_iter() { 84 | let test = thread::spawn(move || { 85 | let res = 86 | std::panic::catch_unwind(|| supervisor(test_name)); 87 | println!( 88 | "test {} ... {}", 89 | test_name, 90 | if res.is_ok() { "ok" } else { "panicked" } 91 | ); 92 | res.unwrap(); 93 | }); 94 | tests.push((test_name, test)); 95 | } 96 | 97 | for (test_name, test) in tests.into_iter() { 98 | test.join().expect(test_name); 99 | } 100 | 101 | println!(); 102 | println!( 103 | "test result: ok. 
{} passed; {} filtered out", 104 | filtered_len, 105 | TESTS.len() - filtered_len, 106 | ); 107 | println!(); 108 | } 109 | 110 | Ok(ref s) if s == crash_tests::SEQUENTIAL_WRITES_DIR => { 111 | crash_tests::run_crash_sequential_writes() 112 | } 113 | Ok(ref s) if s == crash_tests::BATCHES_DIR => { 114 | crash_tests::run_crash_batches() 115 | } 116 | Ok(ref s) if s == crash_tests::ITER_DIR => { 117 | crash_tests::run_crash_iter() 118 | } 119 | Ok(ref s) if s == crash_tests::TX_DIR => crash_tests::run_crash_tx(), 120 | Ok(ref s) if s == crash_tests::METADATA_STORE_DIR => { 121 | crash_tests::run_crash_metadata_store() 122 | } 123 | Ok(ref s) if s == crash_tests::HEAP_DIR => { 124 | crash_tests::run_crash_heap() 125 | } 126 | Ok(ref s) if s == crash_tests::OBJECT_CACHE_DIR => { 127 | crash_tests::run_crash_object_cache() 128 | } 129 | Ok(other) => panic!("invalid crash test case: {other}"), 130 | Err(e) => panic!("env var {TEST_ENV_VAR} unable to be read: {e:?}"), 131 | } 132 | } 133 | 134 | fn run_child_process(dir: &str) { 135 | let bin = env::current_exe().expect("could not get test binary path"); 136 | 137 | unsafe { 138 | env::set_var(TEST_ENV_VAR, dir); 139 | } 140 | 141 | let status_res = Command::new(bin) 142 | .env(TEST_ENV_VAR, dir) 143 | .env("SLED_CRASH_CHANCE", CRASH_CHANCE.to_string()) 144 | .spawn() 145 | .unwrap_or_else(|_| { 146 | panic!("could not spawn child process for {} test", dir) 147 | }) 148 | .wait(); 149 | 150 | match status_res { 151 | Ok(status) => { 152 | let code = status.code(); 153 | 154 | if code.is_none() || code.unwrap() != 9 { 155 | cleanup(dir); 156 | panic!("{} test child exited abnormally", dir); 157 | } 158 | } 159 | Err(e) => { 160 | cleanup(dir); 161 | panic!("error waiting for {} test child: {}", dir, e); 162 | } 163 | } 164 | } 165 | 166 | fn supervisor(dir: &str) { 167 | cleanup(dir); 168 | 169 | for _ in 0..N_TESTS { 170 | run_child_process(dir); 171 | } 172 | 173 | cleanup(dir); 174 | } 175 | -------------------------------------------------------------------------------- /tests/test_quiescent.rs: -------------------------------------------------------------------------------- 1 | #![cfg(all(target_os = "linux", not(miri)))] 2 | 3 | mod common; 4 | 5 | use std::time::{Duration, Instant}; 6 | 7 | use common::cleanup; 8 | 9 | #[test] 10 | fn quiescent_cpu_time() { 11 | const DB_DIR: &str = "sleeper"; 12 | cleanup(DB_DIR); 13 | 14 | fn run() { 15 | let start = Instant::now(); 16 | let db = sled::open(DB_DIR).unwrap(); 17 | std::thread::sleep(Duration::from_secs(10)); 18 | drop(db); 19 | let end = Instant::now(); 20 | 21 | let (user_cpu_time, system_cpu_time) = unsafe { 22 | let mut resource_usage: libc::rusage = std::mem::zeroed(); 23 | let return_value = libc::getrusage( 24 | libc::RUSAGE_SELF, 25 | (&mut resource_usage) as *mut libc::rusage, 26 | ); 27 | if return_value != 0 { 28 | panic!("error {} from getrusage()", *libc::__errno_location()); 29 | } 30 | (resource_usage.ru_utime, resource_usage.ru_stime) 31 | }; 32 | 33 | let user_cpu_seconds = 34 | user_cpu_time.tv_sec as f64 + user_cpu_time.tv_usec as f64 * 1e-6; 35 | let system_cpu_seconds = system_cpu_time.tv_sec as f64 36 | + system_cpu_time.tv_usec as f64 * 1e-6; 37 | let real_time_elapsed = end.duration_since(start); 38 | 39 | if user_cpu_seconds + system_cpu_seconds > 1.0 { 40 | panic!( 41 | "Database used too much CPU during a quiescent workload. 
User: {}s, system: {}s (wall clock: {}s)", 42 | user_cpu_seconds, 43 | system_cpu_seconds, 44 | real_time_elapsed.as_secs_f64(), 45 | ); 46 | } 47 | } 48 | 49 | let child = unsafe { libc::fork() }; 50 | if child == 0 { 51 | common::setup_logger(); 52 | if let Err(e) = std::thread::spawn(run).join() { 53 | println!("test failed: {:?}", e); 54 | std::process::exit(15); 55 | } else { 56 | std::process::exit(0); 57 | } 58 | } else { 59 | let mut status = 0; 60 | unsafe { 61 | libc::waitpid(child, &mut status as *mut libc::c_int, 0); 62 | } 63 | if status != 0 { 64 | cleanup(DB_DIR); 65 | panic!("child exited abnormally"); 66 | } 67 | } 68 | 69 | cleanup(DB_DIR); 70 | } 71 | -------------------------------------------------------------------------------- /tests/test_space_leaks.rs: -------------------------------------------------------------------------------- 1 | use std::io; 2 | 3 | mod common; 4 | 5 | #[test] 6 | #[cfg_attr(miri, ignore)] 7 | fn size_leak() -> io::Result<()> { 8 | common::setup_logger(); 9 | 10 | let tree: sled::Db<1024> = 11 | sled::Config::tmp()?.flush_every_ms(None).open()?; 12 | 13 | for _ in 0..10_000 { 14 | tree.insert(b"", b"")?; 15 | } 16 | 17 | tree.flush()?; 18 | 19 | let sz = tree.size_on_disk()?; 20 | assert!( 21 | sz <= 16384, 22 | "expected system to use less than or equal to \ 23 | 16384 bytes, but actually used {}", 24 | sz 25 | ); 26 | 27 | Ok(()) 28 | } 29 | -------------------------------------------------------------------------------- /tests/tree/mod.rs: -------------------------------------------------------------------------------- 1 | use std::{collections::BTreeMap, convert::TryInto, fmt, panic}; 2 | 3 | use quickcheck::{Arbitrary, Gen}; 4 | use rand_distr::{Distribution, Gamma}; 5 | 6 | use sled::{Config, Db as SledDb, InlineArray}; 7 | 8 | type Db = SledDb<3>; 9 | 10 | #[derive(Clone, Ord, PartialOrd, Eq, PartialEq)] 11 | pub struct Key(pub Vec<u8>); 12 | 13 | impl fmt::Debug for Key { 14 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { 15 | if !self.0.is_empty() { 16 | write!( 17 | f, 18 | "Key(vec![{}; {}])", 19 | self.0.first().copied().unwrap_or(0), 20 | self.0.len() 21 | ) 22 | } else { 23 | write!(f, "Key(vec!{:?})", self.0) 24 | } 25 | } 26 | } 27 | 28 | fn range(g: &mut Gen, min_inclusive: usize, max_exclusive: usize) -> usize { 29 | assert!(max_exclusive > min_inclusive); 30 | let range = max_exclusive - min_inclusive; 31 | let generated = usize::arbitrary(g) % range; 32 | min_inclusive + generated 33 | } 34 | 35 | impl Arbitrary for Key { 36 | #![allow(clippy::cast_possible_truncation)] 37 | #![allow(clippy::cast_precision_loss)] 38 | #![allow(clippy::cast_sign_loss)] 39 | 40 | fn arbitrary(g: &mut Gen) -> Self { 41 | if bool::arbitrary(g) { 42 | let gs = g.size(); 43 | let gamma = Gamma::new(0.3, gs as f64).unwrap(); 44 | let v = gamma.sample(&mut rand::rng()); 45 | let len = if v > 3000.0 { 10000 } else { (v % 300.)
as usize }; 46 | 47 | let space = range(g, 0, gs) + 1; 48 | 49 | let inner = (0..len).map(|_| range(g, 0, space) as u8).collect(); 50 | 51 | Self(inner) 52 | } else { 53 | let len = range(g, 0, 2); 54 | let mut inner = vec![]; 55 | 56 | for _ in 0..len { 57 | inner.push(u8::arbitrary(g)); 58 | } 59 | 60 | Self(inner) 61 | } 62 | } 63 | 64 | fn shrink(&self) -> Box<dyn Iterator<Item = Key>> { 65 | // we only want to shrink on length, not byte values 66 | Box::new( 67 | self.0 68 | .len() 69 | .shrink() 70 | .zip(std::iter::repeat(self.0.clone())) 71 | .map(|(len, underlying)| Self(underlying[..len].to_vec())), 72 | ) 73 | } 74 | } 75 | 76 | #[derive(Debug, Clone)] 77 | pub enum Op { 78 | Set(Key, u8), 79 | // Merge(Key, u8), 80 | Get(Key), 81 | GetLt(Key), 82 | GetGt(Key), 83 | Del(Key), 84 | Cas(Key, u8, u8), 85 | Scan(Key, isize), 86 | Restart, 87 | } 88 | 89 | use self::Op::*; 90 | 91 | impl Arbitrary for Op { 92 | fn arbitrary(g: &mut Gen) -> Self { 93 | if range(g, 0, 10) == 0 { 94 | return Restart; 95 | } 96 | 97 | let choice = range(g, 0, 7); 98 | 99 | match choice { 100 | 0 => Set(Key::arbitrary(g), u8::arbitrary(g)), 101 | 1 => Get(Key::arbitrary(g)), 102 | 2 => GetLt(Key::arbitrary(g)), 103 | 3 => GetGt(Key::arbitrary(g)), 104 | 4 => Del(Key::arbitrary(g)), 105 | 5 => Cas(Key::arbitrary(g), u8::arbitrary(g), u8::arbitrary(g)), 106 | 6 => Scan(Key::arbitrary(g), range(g, 0, 80) as isize - 40), 107 | //7 => Merge(Key::arbitrary(g), u8::arbitrary(g)), 108 | _ => panic!("impossible choice"), 109 | } 110 | } 111 | 112 | fn shrink(&self) -> Box<dyn Iterator<Item = Op>> { 113 | match *self { 114 | Set(ref k, v) => Box::new(k.shrink().map(move |sk| Set(sk, v))), 115 | /* 116 | Merge(ref k, v) => Box::new( 117 | k.shrink() 118 | .flat_map(move |k| vec![Set(k.clone(), v), Merge(k, v)]), 119 | ), 120 | */ 121 | Get(ref k) => Box::new(k.shrink().map(Get)), 122 | GetLt(ref k) => Box::new(k.shrink().map(GetLt)), 123 | GetGt(ref k) => Box::new(k.shrink().map(GetGt)), 124 | Cas(ref k, old, new) => { 125 | Box::new(k.shrink().map(move |k| Cas(k, old, new))) 126 | } 127 | Scan(ref k, len) => Box::new(k.shrink().map(move |k| Scan(k, len))), 128 | Del(ref k) => Box::new(k.shrink().map(Del)), 129 | Restart => Box::new(vec![].into_iter()), 130 | } 131 | } 132 | } 133 | 134 | fn bytes_to_u16(v: &[u8]) -> u16 { 135 | assert_eq!(v.len(), 2); 136 | (u16::from(v[0]) << 8) + u16::from(v[1]) 137 | } 138 | 139 | fn u16_to_bytes(u: u16) -> Vec<u8> { 140 | u.to_be_bytes().to_vec() 141 | } 142 | 143 | /* 144 | // just adds up values as if they were u16's 145 | fn merge_operator( 146 | _k: &[u8], 147 | old: Option<&[u8]>, 148 | to_merge: &[u8], 149 | ) -> Option<Vec<u8>> { 150 | let base = old.unwrap_or(&[0, 0]); 151 | let base_n = bytes_to_u16(base); 152 | let new_n = base_n + u16::from(to_merge[0]); 153 | let ret = u16_to_bytes(new_n); 154 | Some(ret) 155 | } 156 | */ 157 | 158 | pub fn prop_tree_matches_btreemap( 159 | ops: Vec<Op>, 160 | flusher: bool, 161 | compression_level: i32, 162 | cache_size: usize, 163 | ) -> bool { 164 | if let Err(e) = prop_tree_matches_btreemap_inner( 165 | ops, 166 | flusher, 167 | compression_level, 168 | cache_size, 169 | ) { 170 | eprintln!("hit error while running quickcheck on tree: {:?}", e); 171 | false 172 | } else { 173 | true 174 | } 175 | } 176 | 177 | fn prop_tree_matches_btreemap_inner( 178 | ops: Vec<Op>, 179 | flusher: bool, 180 | compression: i32, 181 | cache_size: usize, 182 | ) -> std::io::Result<()> { 183 | use self::*; 184 | 185 | super::common::setup_logger(); 186 | 187 | let config = Config::tmp()?
188 | .zstd_compression_level(compression) 189 | .flush_every_ms(if flusher { Some(1) } else { None }) 190 | .cache_capacity_bytes(cache_size); 191 | 192 | let mut tree: Db = config.open().unwrap(); 193 | //tree.set_merge_operator(merge_operator); 194 | 195 | let mut reference: BTreeMap<Key, u16> = BTreeMap::new(); 196 | 197 | for op in ops { 198 | match op { 199 | Set(k, v) => { 200 | let old_actual = tree.insert(&k.0, vec![0, v]).unwrap(); 201 | let old_reference = reference.insert(k.clone(), u16::from(v)); 202 | assert_eq!( 203 | old_actual.map(|v| bytes_to_u16(&*v)), 204 | old_reference, 205 | "when setting key {:?}, expected old returned value to be {:?}\n{:?}", 206 | k, 207 | old_reference, 208 | tree 209 | ); 210 | } 211 | /* 212 | Merge(k, v) => { 213 | tree.merge(&k.0, vec![v]).unwrap(); 214 | let entry = reference.entry(k).or_insert(0_u16); 215 | *entry += u16::from(v); 216 | } 217 | */ 218 | Get(k) => { 219 | let res1 = tree.get(&*k.0).unwrap().map(|v| bytes_to_u16(&*v)); 220 | let res2 = reference.get(&k).cloned(); 221 | assert_eq!(res1, res2); 222 | } 223 | GetLt(k) => { 224 | let res1 = tree.get_lt(&*k.0).unwrap().map(|v| v.0); 225 | let res2 = reference 226 | .iter() 227 | .rev() 228 | .find(|(key, _)| **key < k) 229 | .map(|(k, _v)| InlineArray::from(&*k.0)); 230 | assert_eq!( 231 | res1, res2, 232 | "get_lt({:?}) should have returned {:?} \ 233 | but it returned {:?} instead. \ 234 | \n Db: {:?}", 235 | k, res2, res1, tree 236 | ); 237 | } 238 | GetGt(k) => { 239 | let res1 = tree.get_gt(&*k.0).unwrap().map(|v| v.0); 240 | let res2 = reference 241 | .iter() 242 | .find(|(key, _)| **key > k) 243 | .map(|(k, _v)| InlineArray::from(&*k.0)); 244 | assert_eq!( 245 | res1, res2, 246 | "get_gt({:?}) expected {:?} in tree {:?}", 247 | k, res2, tree 248 | ); 249 | } 250 | Del(k) => { 251 | tree.remove(&*k.0).unwrap(); 252 | reference.remove(&k); 253 | } 254 | Cas(k, old, new) => { 255 | let tree_old = tree.get(&*k.0).unwrap(); 256 | if let Some(old_tree) = tree_old { 257 | if old_tree == *vec![0, old] { 258 | tree.insert(&k.0, vec![0, new]).unwrap(); 259 | } 260 | } 261 | 262 | let ref_old = reference.get(&k).cloned(); 263 | if ref_old == Some(u16::from(old)) { 264 | reference.insert(k, u16::from(new)); 265 | } 266 | } 267 | Scan(k, len) => { 268 | if len > 0 { 269 | let mut tree_iter = tree 270 | .range(&*k.0..) 271 | .take(len.abs().try_into().unwrap()) 272 | .map(Result::unwrap); 273 | let ref_iter = reference 274 | .iter() 275 | .filter(|&(rk, _rv)| *rk >= k) 276 | .take(len.abs().try_into().unwrap()) 277 | .map(|(rk, rv)| (rk.0.clone(), *rv)); 278 | 279 | for r in ref_iter { 280 | let tree_next = tree_iter 281 | .next() 282 | .expect("iterator incorrectly stopped early"); 283 | let lhs = (tree_next.0, &*tree_next.1); 284 | let rhs = (r.0.clone(), &*u16_to_bytes(r.1)); 285 | assert_eq!( 286 | (lhs.0.as_ref(), lhs.1), 287 | (rhs.0.as_ref(), rhs.1), 288 | "expected {:?} while iterating from {:?} on tree: {:?}", 289 | rhs, 290 | k, 291 | tree 292 | ); 293 | } 294 | 295 | assert!(tree_iter.next().is_none()); 296 | } else { 297 | let mut tree_iter = tree 298 | .range(&*k.0..)
299 | .rev() 300 | .take(len.abs().try_into().unwrap()) 301 | .map(Result::unwrap); 302 | let ref_iter = reference 303 | .iter() 304 | .rev() 305 | .filter(|&(rk, _rv)| *rk >= k) 306 | .take(len.abs().try_into().unwrap()) 307 | .map(|(rk, rv)| (rk.0.clone(), *rv)); 308 | 309 | for r in ref_iter { 310 | let tree_next = tree_iter.next().unwrap(); 311 | let lhs = (tree_next.0, &*tree_next.1); 312 | let rhs = (r.0.clone(), &*u16_to_bytes(r.1)); 313 | assert_eq!( 314 | (lhs.0.as_ref(), lhs.1), 315 | (rhs.0.as_ref(), rhs.1), 316 | "expected {:?} while reverse iterating from {:?} on tree: {:?}", 317 | rhs, 318 | k, 319 | tree 320 | ); 321 | } 322 | 323 | assert!(tree_iter.next().is_none()); 324 | } 325 | } 326 | Restart => { 327 | drop(tree); 328 | tree = config.open().unwrap(); 329 | //tree.set_merge_operator(merge_operator); 330 | } 331 | } 332 | if let Err(e) = tree.check_error() { 333 | eprintln!("quickcheck test encountered error: {:?}", e); 334 | return Err(e); 335 | } 336 | } 337 | 338 | let _ = std::fs::remove_dir_all(config.path); 339 | 340 | tree.check_error() 341 | } 342 | -------------------------------------------------------------------------------- /tsan_suppressions.txt: -------------------------------------------------------------------------------- 1 | # This suppressions file should really only be used for things 2 | # that TSAN can not correctly reason about, like raw memory 3 | # fences or implicit equivalents created by performing atomic 4 | # operations on variables. 5 | 6 | # Read more about how to use this file at: 7 | # https://github.com/google/sanitizers/wiki/ThreadSanitizerSuppressions 8 | 9 | # Arc::drop is not properly detected by TSAN due to the use 10 | # of a raw atomic Acquire fence after the strong-count 11 | # atomic subtraction with a Release fence in the Drop impl. 12 | race:Arc*drop 13 | 14 | # lazy_static and thread_local rely on implicit barriers not 15 | # picked-up by TSAN 16 | race:lazy_static 17 | race:std::thread::local 18 | --------------------------------------------------------------------------------
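The Arc suppression above refers to the standard library's strong-count teardown pattern: each owner decrements the count with Release ordering, and only the thread that drops the last reference issues a bare Acquire fence before freeing, which ThreadSanitizer cannot pair with the earlier decrements. A minimal standalone sketch of that shape for reference (illustrative only, not sled or std code; the Shared type and release_reference function are invented for the example):

// Sketch of the fence pattern behind the race:Arc*drop suppression:
// decrement the strong count with Release, and only the thread that observes
// the count hit zero issues an Acquire fence before tearing the payload down.
use std::sync::atomic::{fence, AtomicUsize, Ordering};

struct Shared {
    strong: AtomicUsize,
    // payload would live here in a real refcounted type
}

// Returns true when the caller dropped the last reference and may free the
// allocation.
fn release_reference(shared: &Shared) -> bool {
    if shared.strong.fetch_sub(1, Ordering::Release) == 1 {
        // This raw fence is what TSAN fails to pair with the Release
        // decrements performed by other threads.
        fence(Ordering::Acquire);
        true
    } else {
        false
    }
}

fn main() {
    let shared = Shared { strong: AtomicUsize::new(2) };
    assert!(!release_reference(&shared));
    assert!(release_reference(&shared));
}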