├── benchmarks ├── criterion │ ├── src │ │ └── lib.rs │ ├── Cargo.toml │ └── benches │ │ └── sled.rs └── stress2 │ ├── lsan.sh │ ├── tsan.sh │ └── Cargo.toml ├── bindings ├── neon-sled │ ├── .npmignore │ ├── native │ │ ├── index.node │ │ ├── build.rs │ │ ├── artifacts.json │ │ ├── Cargo.toml │ │ └── src │ │ │ └── lib.rs │ ├── README.md │ ├── package.json │ └── lib │ │ └── index.js ├── python │ ├── alice_populate.py │ ├── alice_check.py │ └── rsdb.py └── sled-native │ ├── README.md │ ├── cbindgen.toml │ ├── Cargo.toml │ └── src │ └── lib.rs ├── art ├── tree_face.png ├── tree_face_anti-transphobia.png └── CREDITS ├── .github ├── ISSUE_TEMPLATE │ ├── blank_issue.md │ ├── config.yml │ ├── feature_request.md │ └── bugs.md ├── FUNDING.yml └── workflows │ └── test.yml ├── .rustfmt.toml ├── experiments ├── new_segment_ownership │ ├── Cargo.lock │ ├── Cargo.toml │ └── src │ │ └── main.rs └── epoch │ ├── Cargo.toml │ ├── sanitizers.sh │ ├── Cargo.lock │ └── src │ └── main.rs ├── scripts ├── shufnice.sh ├── ubuntu_bench ├── cross_compile.sh ├── instructions ├── sanitizers.sh └── execution_explorer.py ├── .gitignore ├── tests ├── test_space_leaks.rs ├── common │ └── mod.rs └── test_quiescent.rs ├── src ├── measure_allocs.rs ├── fail.rs ├── pagecache │ ├── parallel_io_unix.rs │ ├── constants.rs │ ├── header.rs │ ├── disk_pointer.rs │ ├── parallel_io_windows.rs │ ├── parallel_io_polyfill.rs │ ├── reservation.rs │ └── pagetable.rs ├── doc │ ├── performance_guide │ │ └── mod.rs │ ├── testing_strategies │ │ └── mod.rs │ ├── reactive_semantics │ │ └── mod.rs │ ├── limits │ │ └── mod.rs │ ├── engineering_practices │ │ └── mod.rs │ ├── mod.rs │ ├── merge_operators │ │ └── mod.rs │ └── motivating_experiences │ │ └── mod.rs ├── fastcmp.rs ├── fastlock.rs ├── batch.rs ├── lazy.rs ├── concurrency_control.rs ├── varint.rs ├── context.rs ├── debug_delay.rs ├── meta.rs ├── oneshot.rs ├── sys_limits.rs ├── dll.rs ├── flusher.rs ├── threadpool.rs ├── result.rs ├── event_log.rs ├── atomic_shim.rs ├── stack.rs └── arc.rs ├── SECURITY.md ├── tsan_suppressions.txt ├── LICENSE-MIT ├── RELEASE_CHECKLIST.md ├── CONTRIBUTING.md ├── examples └── playground.rs ├── Cargo.toml ├── code-of-conduct.md └── SAFETY.md /benchmarks/criterion/src/lib.rs: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /bindings/neon-sled/.npmignore: -------------------------------------------------------------------------------- 1 | native/target 2 | native/index.node -------------------------------------------------------------------------------- /art/tree_face.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/timokoesters/sled/master/art/tree_face.png -------------------------------------------------------------------------------- /art/tree_face_anti-transphobia.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/timokoesters/sled/master/art/tree_face_anti-transphobia.png -------------------------------------------------------------------------------- /bindings/neon-sled/native/index.node: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/timokoesters/sled/master/bindings/neon-sled/native/index.node -------------------------------------------------------------------------------- /art/CREDITS: 
-------------------------------------------------------------------------------- 1 | original tree logo with face: 2 | https://twitter.com/daiyitastic 3 | 4 | anti-transphobia additions: 5 | spacejam 6 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/blank_issue.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Blank Issue (do not use this for bug reports or feature requests) 3 | about: Create an issue with a blank template. 4 | --- 5 | -------------------------------------------------------------------------------- /bindings/python/alice_populate.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | from sled import Conf 3 | 4 | c = Conf() 5 | c.path(b"ALICE.data") 6 | t = c.tree() 7 | t.set(b"k1", b"v1") 8 | t.close() 9 | -------------------------------------------------------------------------------- /.rustfmt.toml: -------------------------------------------------------------------------------- 1 | version = "Two" 2 | use_small_heuristics = "Max" 3 | reorder_imports = true 4 | max_width = 80 5 | wrap_comments = true 6 | combine_control_expr = true 7 | report_todo = "Always" 8 | -------------------------------------------------------------------------------- /bindings/neon-sled/native/build.rs: -------------------------------------------------------------------------------- 1 | extern crate neon_build; 2 | 3 | fn main() { 4 | neon_build::setup(); // must be called in build.rs 5 | 6 | // add project-specific build logic here... 7 | } 8 | -------------------------------------------------------------------------------- /bindings/neon-sled/README.md: -------------------------------------------------------------------------------- 1 | # neon-sled 2 | 3 | $ cargo check 4 | $ neon build 5 | $ 6 | 7 | ## in node 8 | 9 | $ node 10 | > let addon = require('.') 11 | 12 | 13 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/config.yml: -------------------------------------------------------------------------------- 1 | blank_issues_enabled: true 2 | contact_links: 3 | - name: sled discord 4 | url: https://discord.gg/Z6VsXds 5 | about: Please ask questions in the discord server here. 6 | -------------------------------------------------------------------------------- /experiments/new_segment_ownership/Cargo.lock: -------------------------------------------------------------------------------- 1 | # This file is automatically @generated by Cargo. 2 | # It is not intended for manual editing. 
3 | [[package]] 4 | name = "new_segment_ownership" 5 | version = "0.1.0" 6 | 7 | -------------------------------------------------------------------------------- /bindings/sled-native/README.md: -------------------------------------------------------------------------------- 1 | # Native C-API for sled 2 | 3 | ## Building 4 | 5 | ``` 6 | $ cargo install cargo-c 7 | $ cargo cinstall --prefix=/usr --destdir=/tmp/staging 8 | $ sudo cp -a /tmp/staging/* / 9 | ``` 10 | 11 | 12 | -------------------------------------------------------------------------------- /experiments/epoch/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "epoch" 3 | version = "0.1.0" 4 | authors = ["Tyler Neely "] 5 | edition = "2018" 6 | 7 | [profile.release] 8 | debug = true 9 | 10 | [dependencies] 11 | crossbeam-epoch = "0.8.0" 12 | -------------------------------------------------------------------------------- /scripts/shufnice.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | while true; do 4 | PID=`pgrep $1` 5 | TIDS=`ls /proc/$PID/task` 6 | TID=`echo $TIDS | tr " " "\n" | shuf -n1` 7 | NICE=$((`shuf -i 0-39 -n 1` - 20)) 8 | echo "renicing $TID to $NICE" 9 | renice -n $NICE -p $TID 10 | done 11 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature Request 3 | about: Request a feature for sled 4 | labels: feature 5 | --- 6 | 7 | #### Use Case: 8 | 9 | #### Proposed Change: 10 | 11 | #### Who Benefits From The Change(s)? 12 | 13 | #### Alternative Approaches 14 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *db 2 | *conf 3 | *snap.* 4 | *grind.out* 5 | vgcore* 6 | *.bk 7 | *orig 8 | tags 9 | perf* 10 | *folded 11 | *out 12 | *perf 13 | *svg 14 | *txt 15 | experiments 16 | target 17 | Cargo.lock 18 | *swp 19 | *swo 20 | *.proptest-regressions 21 | corpus 22 | artifacts 23 | .idea 24 | cargo-timing* 25 | -------------------------------------------------------------------------------- /experiments/new_segment_ownership/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "new_segment_ownership" 3 | version = "0.1.0" 4 | authors = ["Tyler Neely "] 5 | edition = "2018" 6 | 7 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html 8 | 9 | [dependencies] 10 | -------------------------------------------------------------------------------- /benchmarks/stress2/lsan.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -euxo pipefail 4 | 5 | echo "lsan" 6 | export RUSTFLAGS="-Z sanitizer=leak" 7 | cargo build --features=no_jemalloc --target x86_64-unknown-linux-gnu 8 | rm -rf default.sled 9 | target/x86_64-unknown-linux-gnu/debug/stress2 --duration=10 --set-prop=100000000 --val-len=100000 10 | -------------------------------------------------------------------------------- /bindings/neon-sled/native/artifacts.json: -------------------------------------------------------------------------------- 1 | {"active":"release","targets":{"release":{"rustc":"rustc 1.20.0 (f3d6973f4 
2017-08-27)","env":{"npm_config_target":"1.7.8","npm_config_arch":"x64","npm_config_target_arch":"x64","npm_config_disturl":"https://atom.io/download/electron","npm_config_runtime":"electron","npm_config_build_from_source":"true","npm_config_devdir":"/Users/mn/.electron-gyp"}}}} -------------------------------------------------------------------------------- /bindings/python/alice_check.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import os 3 | import sys 4 | 5 | from sled import Conf 6 | 7 | crashed_state_directory = sys.argv[1] 8 | os.chdir(crashed_state_directory) 9 | 10 | dirlist = os.listdir('.') 11 | 12 | assert("ALICE.data" in dirlist) 13 | 14 | c = Conf() 15 | c.path(b"ALICE.data") 16 | 17 | t = c.tree() 18 | 19 | assert(t.get(b"k1") == b"v1") 20 | -------------------------------------------------------------------------------- /bindings/neon-sled/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "neon-sled", 3 | "version": "0.1.3", 4 | "description": "", 5 | "main": "lib/index.js", 6 | "author": [ 7 | "Matthias Nehlsen ", 8 | "Tyler Neely " 9 | ], 10 | "license": "Apache 2", 11 | "dependencies": { 12 | "neon-cli": "^0.1.20" 13 | }, 14 | "scripts": { 15 | "install": "neon build" 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /bindings/neon-sled/native/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "neon-sled" 3 | version = "0.1.2" 4 | authors = ["Matthias Nehlsen ", 5 | "Tyler Neely "] 6 | license = "Apache 2" 7 | build = "build.rs" 8 | 9 | [lib] 10 | name = "neon_sled" 11 | crate-type = ["dylib"] 12 | 13 | [build-dependencies] 14 | neon-build = "0.1.20" 15 | 16 | [dependencies] 17 | neon = "0.1.20" 18 | sled = "0.14" -------------------------------------------------------------------------------- /benchmarks/criterion/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "critter" 3 | publish = false 4 | version = "0.1.0" 5 | authors = ["Tyler Neely "] 6 | edition = "2018" 7 | 8 | [[bench]] 9 | name = "sled" 10 | harness = false 11 | 12 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html 13 | 14 | [dependencies] 15 | criterion = "0.3.0" 16 | sled = { path = "../.." } 17 | jemallocator = "0.3.2" 18 | -------------------------------------------------------------------------------- /benchmarks/stress2/tsan.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -euxo pipefail 4 | 5 | echo "tsan" 6 | export RUSTFLAGS="-Z sanitizer=thread" 7 | export TSAN_OPTIONS="suppressions=/home/t/src/sled/tsan_suppressions.txt" 8 | sudo rm -rf default.sled 9 | cargo +nightly run --features=lock_free_delays,no_jemalloc --target x86_64-unknown-linux-gnu -- --duration=6 10 | cargo +nightly run --features=lock_free_delays,no_jemalloc --target x86_64-unknown-linux-gnu -- --duration=6 11 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bugs.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug Report 3 | about: Report a bug 4 | labels: bug 5 | --- 6 | 7 | Bug reports must include all following items: 8 | 9 | 1. expected result 10 | 1. actual result 11 | 1. 
sled version 12 | 1. rustc version 13 | 1. operating system 14 | 1. minimal code sample that helps to reproduce the issue 15 | 1. logs, panic messages, stack traces 16 | 17 | Incomplete bug reports will be closed. 18 | 19 | Thank you for understanding :) 20 | 21 | 22 | 23 | -------------------------------------------------------------------------------- /bindings/sled-native/cbindgen.toml: -------------------------------------------------------------------------------- 1 | header = "// SPDX-License-Identifier: Apache-2.0" 2 | sys_includes = ["stddef.h", "stdint.h", "stdlib.h"] 3 | no_includes = true 4 | include_guard = "SLED_H" 5 | tab_width = 4 6 | style = "Type" 7 | # language = "C" 8 | cpp_compat = true 9 | 10 | [parse] 11 | parse_deps = true 12 | include = ['sled'] 13 | 14 | [export] 15 | prefix = "Sled" 16 | item_types = ["enums", "structs", "unions", "typedefs", "opaque", "functions"] 17 | 18 | [enum] 19 | rename_variants = "ScreamingSnakeCase" 20 | prefix_with_name = true 21 | -------------------------------------------------------------------------------- /bindings/sled-native/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "sled-native" 3 | version = "0.34.6" 4 | authors = ["Tyler Neely "] 5 | description = "a C-compatible API for sled" 6 | license = "Apache-2.0" 7 | homepage = "https://github.com/spacejam/sled" 8 | repository = "https://github.com/spacejam/sled/sled-native" 9 | keywords = ["database", "embedded", "concurrent", "persistent", "c"] 10 | documentation = "https://docs.rs/sled-native/" 11 | edition = "2018" 12 | 13 | [lib] 14 | name = "sled" 15 | crate-type = ["cdylib", "staticlib"] 16 | 17 | [dependencies] 18 | libc = "0.2.62" 19 | sled = {version = "0.34.6", path = "../.."} 20 | -------------------------------------------------------------------------------- /experiments/epoch/sanitizers.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -eo pipefail 3 | 4 | echo "asan" 5 | cargo clean 6 | export RUSTFLAGS="-Z sanitizer=address" 7 | # export ASAN_OPTIONS="detect_odr_violation=0" 8 | cargo +nightly run --target x86_64-unknown-linux-gnu 9 | unset ASAN_OPTIONS 10 | 11 | echo "lsan" 12 | cargo clean 13 | export RUSTFLAGS="-Z sanitizer=leak" 14 | cargo +nightly run --target x86_64-unknown-linux-gnu 15 | 16 | echo "tsan" 17 | cargo clean 18 | export RUSTFLAGS="-Z sanitizer=thread" 19 | export TSAN_OPTIONS=suppressions=../../tsan_suppressions.txt 20 | cargo +nightly run --target x86_64-unknown-linux-gnu 21 | unset RUSTFLAGS 22 | unset TSAN_OPTIONS 23 | -------------------------------------------------------------------------------- /scripts/ubuntu_bench: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | sudo apt-get update 4 | sudo apt-get install htop dstat build-essential linux-tools-common linux-tools-generic linux-tools-`uname -r` 5 | curl https://sh.rustup.rs -sSf | sh 6 | source $HOME/.cargo/env 7 | 8 | cargo install flamegraph 9 | 10 | git clone https://github.com/spacejam/sled.git 11 | cd sled 12 | 13 | cores=$(grep -c ^processor /proc/cpuinfo) 14 | writers=(($cores / 5 + 1 )) 15 | readers=$(( ($cores / 5 + 1) * 4 )) 16 | 17 | cargo build --release --bin=stress2 --features=stress 18 | 19 | # we use sudo here to get access to symbols 20 | pushd benchmarks/stress2 21 | cargo flamegraph --release -- --get=$readers --set=$writers 22 | 
-------------------------------------------------------------------------------- /tests/test_space_leaks.rs: -------------------------------------------------------------------------------- 1 | mod common; 2 | 3 | #[test] 4 | #[cfg_attr(miri, ignore)] 5 | fn size_leak() -> sled::Result<()> { 6 | common::setup_logger(); 7 | 8 | let tree = sled::Config::new() 9 | .temporary(true) 10 | .segment_size(2048) 11 | .flush_every_ms(None) 12 | .open()?; 13 | 14 | for _ in 0..10_000 { 15 | tree.insert(b"", b"")?; 16 | } 17 | 18 | tree.flush()?; 19 | 20 | let sz = tree.size_on_disk()?; 21 | assert!( 22 | sz <= 16384, 23 | "expected system to use less than or equal to \ 24 | 16486 bytes, but actually used {}", 25 | sz 26 | ); 27 | 28 | Ok(()) 29 | } 30 | -------------------------------------------------------------------------------- /scripts/cross_compile.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | set -e 3 | 4 | # checks sled's compatibility using several targets 5 | 6 | targets="wasm32-wasi wasm32-unknown-unknown aarch64-fuchsia aarch64-linux-android \ 7 | i686-linux-android i686-unknown-linux-gnu \ 8 | x86_64-linux-android x86_64-fuchsia \ 9 | mips-unknown-linux-musl aarch64-apple-ios" 10 | 11 | rustup update --no-self-update 12 | 13 | RUSTFLAGS="--cfg miri" cargo check 14 | 15 | rustup toolchain install 1.39.0 --no-self-update 16 | cargo clean 17 | rm Cargo.lock 18 | cargo +1.39.0 check 19 | 20 | for target in $targets; do 21 | echo "setting up $target..." 22 | rustup target add $target 23 | echo "checking $target..." 24 | cargo check --target $target 25 | done 26 | 27 | -------------------------------------------------------------------------------- /.github/FUNDING.yml: -------------------------------------------------------------------------------- 1 | # These are supported funding model platforms 2 | 3 | github: spacejam # Replace with up to 4 GitHub Sponsors-enabled usernames e.g., [user1, user2] 4 | patreon: # Replace with a single Patreon username 5 | open_collective: # Replace with a single Open Collective username 6 | ko_fi: # Replace with a single Ko-fi username 7 | tidelift: # Replace with a single Tidelift platform-name/package-name e.g., npm/babel 8 | community_bridge: # Replace with a single Community Bridge project-name e.g., cloud-foundry 9 | liberapay: # Replace with a single Liberapay username 10 | issuehunt: # Replace with a single IssueHunt username 11 | otechie: # Replace with a single Otechie username 12 | custom: # Replace with up to 4 custom sponsorship URLs e.g., ['link1', 'link2'] 13 | -------------------------------------------------------------------------------- /src/measure_allocs.rs: -------------------------------------------------------------------------------- 1 | #![allow(unsafe_code)] 2 | 3 | use std::sync::atomic::{AtomicUsize, Ordering::Release}; 4 | 5 | // define a passthrough allocator that tracks alloc calls. 
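// Each `alloc` call bumps ALLOCATIONS and adds the layout size to ALLOCATED_BYTES
// before deferring to the system allocator; `dealloc` is passed straight through.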
6 | // adapted from the flatbuffer codebase 7 | use std::alloc::{GlobalAlloc, Layout, System}; 8 | 9 | pub(crate) struct TrackingAllocator; 10 | 11 | pub static ALLOCATIONS: AtomicUsize = AtomicUsize::new(0); 12 | pub static ALLOCATED_BYTES: AtomicUsize = AtomicUsize::new(0); 13 | 14 | unsafe impl GlobalAlloc for TrackingAllocator { 15 | unsafe fn alloc(&self, layout: Layout) -> *mut u8 { 16 | ALLOCATIONS.fetch_add(1, Release); 17 | ALLOCATED_BYTES.fetch_add(layout.size(), Release); 18 | System.alloc(layout) 19 | } 20 | unsafe fn dealloc(&self, ptr: *mut u8, layout: Layout) { 21 | System.dealloc(ptr, layout) 22 | } 23 | } 24 | -------------------------------------------------------------------------------- /SECURITY.md: -------------------------------------------------------------------------------- 1 | # Security Policy 2 | 3 | ## Reporting a Vulnerability 4 | 5 | sled uses some unsafe functionality in the core lock-free algorithms, and in a few places to more efficiently copy data. 6 | 7 | Please contact [Tyler Neely](mailto:tylerneely@gmail.com?subject=sled%20security%20issue) immediately if you find any vulnerability, and I will work with you to fix the issue rapidly and coordinate public disclosure with an expedited release including the fix. 8 | 9 | If you are a bug hunter or a person with a security interest, here is my mental model of memory corruption risk in the sled codebase: 10 | 11 | 1. memory issues relating to the lock-free data structures in their colder failure paths. these have been tested a bit by injecting delays into random places, but this is still an area with elevated risk 12 | 1. anywhere the `unsafe` keyword is used 13 | -------------------------------------------------------------------------------- /bindings/neon-sled/lib/index.js: -------------------------------------------------------------------------------- 1 | const sled = require('../native'); 2 | 3 | function open (path) { 4 | console.log("Creating a modern embedded database at", path); 5 | let ptr_str = sled.createDb(path); 6 | console.log("Sled at pointer", ptr_str); 7 | 8 | return { 9 | set: (k, v) => { 10 | //console.log("SET", ptr_str, k, v); 11 | return sled.set(ptr_str, k, v); 12 | }, 13 | get: (k) => { 14 | //console.log("GET", ptr_str, k); 15 | return sled.get(ptr_str, k); 16 | }, 17 | del: (k) => { 18 | return sled.del(ptr_str, k); 19 | }, 20 | syncAndClose: () => { 21 | console.log("Saving DB and closing", ptr_str); 22 | sled.syncAndClose(ptr_str); 23 | console.log("Saved DB and closing", ptr_str); 24 | } 25 | } 26 | } 27 | 28 | module.exports = open; 29 | -------------------------------------------------------------------------------- /src/fail.rs: -------------------------------------------------------------------------------- 1 | use parking_lot::Mutex; 2 | 3 | use crate::{Lazy, Map}; 4 | 5 | type HM = Map<&'static str, u64>; 6 | 7 | static ACTIVE: Lazy, fn() -> Mutex> = Lazy::new(init); 8 | 9 | fn init() -> Mutex { 10 | Mutex::new(HM::default()) 11 | } 12 | 13 | /// Returns `true` if the given failpoint is active. 14 | pub fn is_active(name: &'static str) -> bool { 15 | let mut active = ACTIVE.lock(); 16 | if let Some(bitset) = active.get_mut(&name) { 17 | let bit = *bitset & 1; 18 | *bitset >>= 1; 19 | if *bitset == 0 { 20 | active.remove(&name); 21 | } 22 | bit != 0 23 | } else { 24 | false 25 | } 26 | } 27 | 28 | /// Enable a particular failpoint 29 | pub fn set(name: &'static str, bitset: u64) { 30 | ACTIVE.lock().insert(name, bitset); 31 | } 32 | 33 | /// Clear all active failpoints. 
34 | pub fn reset() { 35 | ACTIVE.lock().clear(); 36 | } 37 | -------------------------------------------------------------------------------- /benchmarks/stress2/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "stress2" 3 | version = "0.1.0" 4 | authors = ["Tyler Neely "] 5 | publish = false 6 | edition = "2018" 7 | 8 | [profile.release] 9 | panic = 'abort' 10 | codegen-units = 1 11 | lto = "fat" 12 | debug = true 13 | 14 | [features] 15 | default = [] 16 | lock_free_delays = ["sled/lock_free_delays"] 17 | io_uring = ["sled/io_uring"] 18 | event_log = ["sled/event_log"] 19 | compression = ["sled/compression"] 20 | no_logs = ["sled/no_logs"] 21 | metrics = ["sled/metrics"] 22 | measure_allocs = ["sled/measure_allocs"] 23 | jemalloc = ["jemallocator"] 24 | logging = ["env_logger", "log", "color-backtrace"] 25 | 26 | [dependencies] 27 | rand = "0.7.3" 28 | env_logger = { version = "0.7.1", optional = true } 29 | log = { version = "0.4.8", optional = true } 30 | color-backtrace = { version = "0.3.0", optional = true } 31 | jemallocator = { version = "0.3.2", optional = true } 32 | num-format = "0.4.0" 33 | 34 | [dependencies.sled] 35 | path = "../.." 36 | -------------------------------------------------------------------------------- /scripts/instructions: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | # counts instructions for a standard workload 3 | set -e 4 | 5 | OUTFILE="cachegrind.stress2.`git describe --always --dirty`-`date +%s`" 6 | 7 | rm -rf default.sled || true 8 | 9 | cargo build \ 10 | --bin=stress2 \ 11 | --release 12 | 13 | 14 | # --tool=callgrind --dump-instr=yes --collect-jumps=yes --simulate-cache=yes \ 15 | # --callgrind-out-file="$OUTFILE" \ 16 | 17 | valgrind \ 18 | --tool=cachegrind \ 19 | --cachegrind-out-file="$OUTFILE" \ 20 | ./target/release/stress2 --total-ops=50000 --set-prop=1000000000000 --threads=1 21 | 22 | LAST=`ls -t cachegrind.stress2.* | sed -n 2p` 23 | 24 | echo "comparing $LAST with new $OUTFILE" 25 | 26 | echo "--------------------------------------------------------------------------------" 27 | echo "change since last run:" 28 | echo " Ir I1mr ILmr Dr D1mr DLmr Dw D1mw DLmw" 29 | echo "--------------------------------------------------------------------------------" 30 | cg_diff $LAST $OUTFILE | tail -1 31 | -------------------------------------------------------------------------------- /tsan_suppressions.txt: -------------------------------------------------------------------------------- 1 | # This suppressions file should really only be used for things 2 | # that TSAN can not correctly reason about, like raw memory 3 | # fences or implicit equivalents created by performing atomic 4 | # operations on variables. 5 | 6 | # Read more about how to use this file at: 7 | # https://github.com/google/sanitizers/wiki/ThreadSanitizerSuppressions 8 | 9 | # We ignore this because collect() calls functionality that relies 10 | # on atomic::fence for correctness, which doesn't get picked up by TSAN 11 | # as of Feb 1 2018 / rust 1.23. 12 | race:crossbeam_epoch::internal::Global::collect 13 | 14 | # Arc::drop is not properly detected by TSAN due to the use 15 | # of a raw atomic Acquire fence after the strong-count 16 | # atomic subtraction with a Release fence in the Drop impl. 
17 | race:Arc*drop 18 | 19 | # lazy_static and thread_local rely on implicit barriers not 20 | # picked-up by TSAN 21 | race:lazy_static 22 | race:std::thread::local 23 | 24 | # tsan doesn't seem to pick up parking_lot RwLock-protected accesses 25 | # that sometimes use lock elision 26 | race:current_iobuf 27 | -------------------------------------------------------------------------------- /src/pagecache/parallel_io_unix.rs: -------------------------------------------------------------------------------- 1 | use std::convert::TryFrom; 2 | use std::fs::File; 3 | use std::io; 4 | use std::os::unix::fs::FileExt; 5 | 6 | use super::LogOffset; 7 | 8 | pub(crate) fn pread_exact_or_eof( 9 | file: &File, 10 | mut buf: &mut [u8], 11 | offset: LogOffset, 12 | ) -> io::Result { 13 | let mut total = 0_usize; 14 | while !buf.is_empty() { 15 | match file.read_at(buf, offset + u64::try_from(total).unwrap()) { 16 | Ok(0) => break, 17 | Ok(n) => { 18 | total += n; 19 | let tmp = buf; 20 | buf = &mut tmp[n..]; 21 | } 22 | Err(ref e) if e.kind() == io::ErrorKind::Interrupted => {} 23 | Err(e) => return Err(e), 24 | } 25 | } 26 | Ok(total) 27 | } 28 | 29 | pub(crate) fn pread_exact( 30 | file: &File, 31 | buf: &mut [u8], 32 | offset: LogOffset, 33 | ) -> io::Result<()> { 34 | file.read_exact_at(buf, offset) 35 | } 36 | 37 | pub(crate) fn pwrite_all( 38 | file: &File, 39 | buf: &[u8], 40 | offset: LogOffset, 41 | ) -> io::Result<()> { 42 | file.write_all_at(buf, offset) 43 | } 44 | -------------------------------------------------------------------------------- /LICENSE-MIT: -------------------------------------------------------------------------------- 1 | Copyright (c) 2018 Tyler Neely 2 | Copyright (c) 2019 Tyler Neely 3 | 4 | Permission is hereby granted, free of charge, to any 5 | person obtaining a copy of this software and associated 6 | documentation files (the "Software"), to deal in the 7 | Software without restriction, including without 8 | limitation the rights to use, copy, modify, merge, 9 | publish, distribute, sublicense, and/or sell copies of 10 | the Software, and to permit persons to whom the Software 11 | is furnished to do so, subject to the following 12 | conditions: 13 | 14 | The above copyright notice and this permission notice 15 | shall be included in all copies or substantial portions 16 | of the Software. 17 | 18 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF 19 | ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED 20 | TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A 21 | PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT 22 | SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 23 | CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 24 | OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR 25 | IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 26 | DEALINGS IN THE SOFTWARE. 27 | -------------------------------------------------------------------------------- /src/doc/performance_guide/mod.rs: -------------------------------------------------------------------------------- 1 | //! ## Built-In Profiler 2 | //! 3 | //! To get a summary of latency histograms relating to different operations 4 | //! you've used on a sled database, sled can print a nice table when the Db is 5 | //! dropped by disabling the `no_metrics` default feature and setting 6 | //! `print_profile_on_drop(true)` on a `ConfigBuilder`: 7 | //! 8 | //! ```rust 9 | //! let config = sled::ConfigBuilder::new() 10 | //! .print_profile_on_drop(true) 11 | //! .build(); 12 | //! 13 | //! 
let db = sled::Db::start(config).unwrap(); 14 | //! ``` 15 | //! 16 | //! This is useful for finding outliers, general percentiles about usage, and 17 | //! especially for debugging performance issues if you create an issue on 18 | //! github. 19 | //! 20 | //! ## Use jemalloc 21 | //! 22 | //! jemalloc can dramatically improve performance in some situations, but you 23 | //! should always measure performance before and after using it, because maybe 24 | //! for some use cases it can cause regressions. 25 | //! 26 | //! Cargo.toml: 27 | //! ```toml 28 | //! [dependencies] 29 | //! jemallocator = "0.1" 30 | //! ``` 31 | //! 32 | //! `your_code.rs`: 33 | //! ```rust 34 | //! #[global_allocator] 35 | //! static ALLOC: jemallocator::Jemalloc = jemallocator::Jemalloc; 36 | //! ``` 37 | -------------------------------------------------------------------------------- /scripts/sanitizers.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -eo pipefail 3 | 4 | pushd benchmarks/stress2 5 | 6 | rustup toolchain install nightly --no-self-update 7 | rustup update --no-self-update 8 | 9 | export SLED_LOCK_FREE_DELAY_INTENSITY=2000 10 | 11 | echo "asan" 12 | cargo clean 13 | export RUSTFLAGS="-Z sanitizer=address" 14 | export ASAN_OPTIONS="detect_odr_violation=0" 15 | cargo +nightly build --features=lock_free_delays --target x86_64-unknown-linux-gnu 16 | sudo rm -rf default.sled 17 | sudo target/x86_64-unknown-linux-gnu/debug/stress2 --duration=10 18 | sudo target/x86_64-unknown-linux-gnu/debug/stress2 --duration=6 19 | unset ASAN_OPTIONS 20 | 21 | echo "lsan" 22 | cargo clean 23 | export RUSTFLAGS="-Z sanitizer=leak" 24 | cargo +nightly build --features=lock_free_delays --target x86_64-unknown-linux-gnu 25 | sudo rm -rf default.sled 26 | sudo target/x86_64-unknown-linux-gnu/debug/stress2 --duration=10 27 | sudo target/x86_64-unknown-linux-gnu/debug/stress2 --duration=6 28 | 29 | echo "tsan" 30 | cargo clean 31 | export RUSTFLAGS="-Z sanitizer=thread" 32 | export TSAN_OPTIONS=suppressions=../../tsan_suppressions.txt 33 | sudo rm -rf default.sled 34 | cargo +nightly run --features=lock_free_delays --target x86_64-unknown-linux-gnu -- --duration=10 35 | cargo +nightly run --features=lock_free_delays --target x86_64-unknown-linux-gnu -- --duration=6 36 | unset RUSTFLAGS 37 | unset TSAN_OPTIONS 38 | -------------------------------------------------------------------------------- /tests/common/mod.rs: -------------------------------------------------------------------------------- 1 | #[cfg(not(feature = "testing"))] 2 | compile_error!( 3 | "please run tests using the \"testing\" feature, \ 4 | which enables additional checks at runtime and \ 5 | causes more race conditions to jump out by \ 6 | inserting delays in concurrent code." 
7 | ); 8 | 9 | pub fn setup_logger() { 10 | use std::io::Write; 11 | 12 | fn tn() -> String { 13 | std::thread::current().name().unwrap_or("unknown").to_owned() 14 | } 15 | 16 | #[cfg(feature = "pretty_backtrace")] 17 | color_backtrace::install(); 18 | 19 | let mut builder = env_logger::Builder::new(); 20 | builder 21 | .format(|buf, record| { 22 | writeln!( 23 | buf, 24 | "{:05} {:20} {:10} {}", 25 | record.level(), 26 | tn(), 27 | record.module_path().unwrap().split("::").last().unwrap(), 28 | record.args() 29 | ) 30 | }) 31 | .filter(None, log::LevelFilter::Info); 32 | 33 | if let Ok(env) = std::env::var("RUST_LOG") { 34 | builder.parse_filters(&env); 35 | } 36 | 37 | let _r = builder.try_init(); 38 | } 39 | 40 | #[allow(dead_code)] 41 | pub fn cleanup(dir: &str) { 42 | let dir = std::path::Path::new(dir); 43 | if dir.exists() { 44 | std::fs::remove_dir_all(dir).unwrap(); 45 | } 46 | } 47 | -------------------------------------------------------------------------------- /src/pagecache/constants.rs: -------------------------------------------------------------------------------- 1 | use super::*; 2 | 3 | // crc: u32 4 4 | // kind: u8 1 5 | // seg num: u64 9 (varint) 6 | // pid: u64 9 (varint) 7 | // len: u64 9 (varint) 8 | /// Log messages have a header that might eb up to this length. 9 | pub const MAX_MSG_HEADER_LEN: usize = 32; 10 | 11 | /// Log segments have a header of this length. 12 | pub const SEG_HEADER_LEN: usize = 20; 13 | 14 | /// During testing, this should never be exceeded. 15 | // TODO drop this to 3 over time 16 | #[allow(unused)] 17 | pub const MAX_SPACE_AMPLIFICATION: f64 = 10.; 18 | 19 | pub(crate) const META_PID: PageId = 0; 20 | pub(crate) const COUNTER_PID: PageId = 1; 21 | pub(crate) const BATCH_MANIFEST_PID: PageId = PageId::max_value() - 666; 22 | 23 | pub(crate) const PAGE_CONSOLIDATION_THRESHOLD: usize = 10; 24 | pub(crate) const SEGMENT_CLEANUP_THRESHOLD: usize = 50; 25 | 26 | // Allows for around 1 trillion items to be stored 27 | // 2^37 * (assuming 50% node fill, 8 items per leaf) 28 | // and well below 1% of nodes being non-leaf nodes. 29 | #[cfg(target_pointer_width = "64")] 30 | pub(crate) const MAX_PID_BITS: usize = 37; 31 | 32 | // Allows for around 32 billion items to be stored 33 | // 2^32 * (assuming 50% node fill of 8 items per leaf) 34 | // and well below 1% of nodes being non-leaf nodes. 35 | // Assumed to be enough for a 32-bit system. 36 | #[cfg(target_pointer_width = "32")] 37 | pub(crate) const MAX_PID_BITS: usize = 32; 38 | -------------------------------------------------------------------------------- /RELEASE_CHECKLIST.md: -------------------------------------------------------------------------------- 1 | # Release Checklist 2 | 3 | This checklist must be completed before publishing a release of any kind. 4 | 5 | Over time, anything in this list that can be turned into an automated test should be, but 6 | there are still some big blind spots. 7 | 8 | ## API stability 9 | 10 | - [ ] rust-flavored semver respected 11 | 12 | ## Performance 13 | 14 | - [ ] micro-benchmark regressions should not happen unless newly discovered correctness criteria demands them 15 | - [ ] mixed point operation latency distribution should narrow over time 16 | - [ ] sequential operation average throughput should increase over time 17 | - [ ] workloads should pass TSAN and ASAN on macOS. Linux should additionally pass LSAN & MSAN. 
18 | - [ ] workload write and space amplification thresholds should see no regressions 19 | 20 | ## Concurrency Audit 21 | 22 | - [ ] any new `Guard` objects are dropped inside the rayon threadpool 23 | - [ ] no new EBR `Collector`s, as they destroy causality. These will be optimized in-bulk in the future. 24 | - [ ] no code assumes a recently read page pointer will remain unchanged (transactions may change this if reads are inline) 25 | - [ ] no calls to `rand::thread_rng` from a droppable function (anything in the SegmentAccountant) 26 | 27 | ## Burn-In 28 | 29 | - [ ] fuzz tests should run at least 24 hours each with zero crashes 30 | - [ ] sequential and point workloads run at least 24 hours in constrained docker container without OOM / out of disk 31 | -------------------------------------------------------------------------------- /src/doc/testing_strategies/mod.rs: -------------------------------------------------------------------------------- 1 | //! We believe operators of stateful systems should get as much sleep as they 2 | //! want. We take testing seriously, and we take pains to avoid the pesticide 3 | //! paradox wherever possible. 4 | //! 5 | //! sled uses the following testing strategies, and is eager to expand their 6 | //! use: 7 | //! 8 | //! * quickcheck-based model testing on the Tree, `PageCache`, and Log 9 | //! * proptest-based model testing on the `PageTable` using the [model](https://docs.rs/model) 10 | //! testing library 11 | //! * linearizability testing on the `PageTable` using the [model](https://docs.rs/model) 12 | //! testing library 13 | //! * deterministic concurrent model testing using linux realtime priorities, 14 | //! approaching the utility of the PULSE system available for the Erlang 15 | //! ecosystem 16 | //! * `ThreadSanitizer` on a concurrent workload 17 | //! * `LeakSanitizer` on a concurrent workload 18 | //! * failpoints with model testing: at every IO operation, a test can cause the 19 | //! system to simulate a crash 20 | //! * crash testing: processes are quickly spun up and then `kill -9`'d while 21 | //! recovering and writing. the recovered data is verified to recover the log 22 | //! in-order, stopping at the first torn log message or incomplete segment 23 | //! * fuzzing: libfuzzer is used to generate sequences of operations on the Tree 24 | //! * TLA+ has been used to model some of the concurrent algorithms, but much 25 | //! more is necessary 26 | -------------------------------------------------------------------------------- /src/doc/reactive_semantics/mod.rs: -------------------------------------------------------------------------------- 1 | //! As of sled `0.16.8` we support the [`watch_prefix` feature](https://docs.rs/sled/latest/sled/struct.Tree.html#method.watch_prefix) which allows a caller to create an iterator over all events that happen to keys that begin with a specified prefix. Supplying an empty vector allows you to subscribe to all updates on the `Tree`. 2 | //! 3 | //! #### reactive architectures 4 | //! 5 | //! Subscription to keys prefixed with "topic names" can allow you to treat sled 6 | //! as a durable message bus. 7 | //! 8 | //! #### replicated systems 9 | //! 10 | //! Watching the empty prefix will subscribe to all updates on the entire 11 | //! database. You can feed this into a replication system 12 | //! 13 | //! #### analysis tools and auditing 14 | //! 15 | //! #### ordering guarantees 16 | //! 17 | //! Updates are received in-order for particular keys, but updates for different 18 | //! 
keys may be observed in different orders by different `Subscriber`s. As an 19 | //! example, consider updating the keys `k1` and `k2` twice, adding 1 to the 20 | //! current value. Different `Subscriber`s may observe the following histories: 21 | //! 22 | //! ``` 23 | //! Set(k1, 100), Set(k1, 101), Set(k2, 200), Set(k2, 201) 24 | //! or 25 | //! Set(k1, 100), Set(k2, 200), Set(k1, 101), Set(k2, 201) 26 | //! or 27 | //! Set(k1, 100), Set(k2, 200), Set(k2, 201), Set(k1, 101) 28 | //! or 29 | //! Set(k2, 200), Set(k1, 100), Set(k1, 101), Set(k2, 201) 30 | //! or 31 | //! Set(k2, 200), Set(k1, 100), Set(k2, 201), Set(k1, 101) 32 | //! or 33 | //! Set(k2, 200), Set(k2, 201), Set(k1, 100), Set(k1, 101) 34 | //! ``` 35 | -------------------------------------------------------------------------------- /src/fastcmp.rs: -------------------------------------------------------------------------------- 1 | use std::cmp::Ordering; 2 | 3 | #[cfg(any(unix, windows))] 4 | #[allow(unsafe_code)] 5 | pub(crate) fn fastcmp(l: &[u8], r: &[u8]) -> Ordering { 6 | let len = std::cmp::min(l.len(), r.len()); 7 | let cmp = unsafe { libc::memcmp(l.as_ptr() as _, r.as_ptr() as _, len) }; 8 | match cmp { 9 | a if a > 0 => Ordering::Greater, 10 | a if a < 0 => Ordering::Less, 11 | _ => l.len().cmp(&r.len()), 12 | } 13 | } 14 | 15 | #[cfg(not(any(unix, windows)))] 16 | #[allow(unsafe_code)] 17 | pub(crate) fn fastcmp(l: &[u8], r: &[u8]) -> Ordering { 18 | l.cmp(r) 19 | } 20 | 21 | #[cfg(test)] 22 | mod qc { 23 | use super::fastcmp; 24 | 25 | fn prop_cmp_matches(l: &[u8], r: &[u8]) -> bool { 26 | assert_eq!(fastcmp(l, r), l.cmp(r)); 27 | assert_eq!(fastcmp(r, l), r.cmp(l)); 28 | assert_eq!(fastcmp(l, l), l.cmp(l)); 29 | assert_eq!(fastcmp(r, r), r.cmp(r)); 30 | true 31 | } 32 | 33 | #[test] 34 | fn basic_functionality() { 35 | let cases: [&[u8]; 8] = [ 36 | &[], 37 | &[0], 38 | &[1], 39 | &[1], 40 | &[255], 41 | &[1, 2, 3], 42 | &[1, 2, 3, 0], 43 | &[1, 2, 3, 55], 44 | ]; 45 | for pair in cases.windows(2) { 46 | prop_cmp_matches(pair[0], pair[1]); 47 | } 48 | } 49 | 50 | quickcheck::quickcheck! { 51 | #[cfg_attr(miri, ignore)] 52 | fn qc_fastcmp(l: Vec, r: Vec) -> bool { 53 | prop_cmp_matches(&l, &r) 54 | } 55 | } 56 | } 57 | -------------------------------------------------------------------------------- /src/doc/limits/mod.rs: -------------------------------------------------------------------------------- 1 | //! This page documents some limitations that sled imposes on users. 2 | //! 3 | //! * The underlying pagecache can currently store 2^36 pages. Leaf nodes in the 4 | //! `Tree` tend to split when they have more than 16 keys and values. This 5 | //! means that sled can hold a little less than **4,294,967,296 total items** 6 | //! (index nodes in the tree will also consume pages, but ideally far fewer 7 | //! than 1%). This is easy to increase without requiring migration, as it is 8 | //! entirely a runtime concern, but nobody has expressed any interest in this 9 | //! being larger yet. Note to future folks who need to increase this: increase 10 | //! the width of the Node1 type in the pagetable module, and correspondingly 11 | //! increase the number of bits that are used to index into it. It's just a 12 | //! simple wait-free grow-only 2-level pagetable. 13 | //! * keys and values use `usize` for the length fields due to the way that Rust 14 | //! uses `usize` for slice lengths, and will be limited to the target 15 | //! platform's pointer width. On 64-bit machines, this will be 64 bits. On 16 | //! 
32-bit machines, it will be limited to `u32::max_value()`. 17 | //! * Due to the 32-bit limitation on slice sizes on 32-bit architectures, we 18 | //! currently do not support systems large enough for the snapshot file to 19 | //! reach over 4gb. The snapshot file tends to be a small fraction of the 20 | //! total db size, and it's likely we'll be able to implement a streaming 21 | //! deserializer if this ever becomes an issue, but it seems unclear if anyone 22 | //! will encounter this limitation. 23 | -------------------------------------------------------------------------------- /src/fastlock.rs: -------------------------------------------------------------------------------- 1 | use std::{ 2 | cell::UnsafeCell, 3 | ops::{Deref, DerefMut}, 4 | sync::atomic::{ 5 | AtomicBool, 6 | Ordering::{Acquire, Release}, 7 | }, 8 | }; 9 | 10 | pub struct FastLockGuard<'a, T> { 11 | mu: &'a FastLock, 12 | } 13 | 14 | impl<'a, T> Drop for FastLockGuard<'a, T> { 15 | fn drop(&mut self) { 16 | assert!(self.mu.lock.swap(false, Release)); 17 | } 18 | } 19 | 20 | impl<'a, T> Deref for FastLockGuard<'a, T> { 21 | type Target = T; 22 | 23 | fn deref(&self) -> &T { 24 | #[allow(unsafe_code)] 25 | unsafe { 26 | &*self.mu.inner.get() 27 | } 28 | } 29 | } 30 | 31 | impl<'a, T> DerefMut for FastLockGuard<'a, T> { 32 | fn deref_mut(&mut self) -> &mut T { 33 | #[allow(unsafe_code)] 34 | unsafe { 35 | &mut *self.mu.inner.get() 36 | } 37 | } 38 | } 39 | 40 | #[repr(C)] 41 | pub struct FastLock { 42 | inner: UnsafeCell, 43 | lock: AtomicBool, 44 | } 45 | 46 | #[allow(unsafe_code)] 47 | unsafe impl Sync for FastLock {} 48 | 49 | #[allow(unsafe_code)] 50 | unsafe impl Send for FastLock {} 51 | 52 | impl FastLock { 53 | pub fn new(inner: T) -> FastLock { 54 | FastLock { lock: AtomicBool::new(false), inner: UnsafeCell::new(inner) } 55 | } 56 | 57 | pub fn try_lock(&self) -> Option> { 58 | let lock_result = 59 | self.lock.compare_exchange_weak(false, true, Acquire, Acquire); 60 | 61 | let success = lock_result.is_ok(); 62 | 63 | if success { Some(FastLockGuard { mu: self }) } else { None } 64 | } 65 | } 66 | -------------------------------------------------------------------------------- /src/pagecache/header.rs: -------------------------------------------------------------------------------- 1 | use super::*; 2 | 3 | // This is the most writers in a single IO buffer 4 | // that we have space to accommodate in the counter 5 | // for writers in the IO buffer header. 
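// The counter occupies 7 bits of the packed header laid out below,
// so at most 127 concurrent writers can be represented at once.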
6 | pub(in crate::pagecache) const MAX_WRITERS: Header = 127; 7 | 8 | pub(in crate::pagecache) type Header = u64; 9 | 10 | // salt: 31 bits 11 | // maxed: 1 bit 12 | // seal: 1 bit 13 | // n_writers: 7 bits 14 | // offset: 24 bits 15 | 16 | pub(crate) const fn is_maxed(v: Header) -> bool { 17 | v & (1 << 32) == 1 << 32 18 | } 19 | 20 | pub(crate) const fn mk_maxed(v: Header) -> Header { 21 | v | (1 << 32) 22 | } 23 | 24 | pub(crate) const fn is_sealed(v: Header) -> bool { 25 | v & (1 << 31) == 1 << 31 26 | } 27 | 28 | pub(crate) const fn mk_sealed(v: Header) -> Header { 29 | v | (1 << 31) 30 | } 31 | 32 | pub(crate) const fn n_writers(v: Header) -> Header { 33 | (v << 33) >> 57 34 | } 35 | 36 | #[inline] 37 | pub(crate) fn incr_writers(v: Header) -> Header { 38 | assert_ne!(n_writers(v), MAX_WRITERS); 39 | v + (1 << 24) 40 | } 41 | 42 | #[inline] 43 | pub(crate) fn decr_writers(v: Header) -> Header { 44 | assert_ne!(n_writers(v), 0); 45 | v - (1 << 24) 46 | } 47 | 48 | #[inline] 49 | pub(crate) fn offset(v: Header) -> usize { 50 | let ret = (v << 40) >> 40; 51 | usize::try_from(ret).unwrap() 52 | } 53 | 54 | #[inline] 55 | pub(crate) fn bump_offset(v: Header, by: usize) -> Header { 56 | assert_eq!(by >> 24, 0); 57 | v + (by as Header) 58 | } 59 | 60 | pub(crate) const fn bump_salt(v: Header) -> Header { 61 | (v + (1 << 33)) & 0xFFFF_FFFD_0000_0000 62 | } 63 | 64 | pub(crate) const fn salt(v: Header) -> Header { 65 | (v >> 33) << 33 66 | } 67 | -------------------------------------------------------------------------------- /src/batch.rs: -------------------------------------------------------------------------------- 1 | #![allow(unused_results)] 2 | 3 | use super::*; 4 | 5 | /// A batch of updates that will 6 | /// be applied atomically to the 7 | /// Tree. 8 | /// 9 | /// # Examples 10 | /// 11 | /// ``` 12 | /// # fn main() -> Result<(), Box> { 13 | /// use sled::{Batch, open}; 14 | /// 15 | /// # let _ = std::fs::remove_dir_all("batch_db_2"); 16 | /// let db = open("batch_db_2")?; 17 | /// db.insert("key_0", "val_0")?; 18 | /// 19 | /// let mut batch = Batch::default(); 20 | /// batch.insert("key_a", "val_a"); 21 | /// batch.insert("key_b", "val_b"); 22 | /// batch.insert("key_c", "val_c"); 23 | /// batch.remove("key_0"); 24 | /// 25 | /// db.apply_batch(batch)?; 26 | /// // key_0 no longer exists, and key_a, key_b, and key_c 27 | /// // now do exist. 28 | /// # let _ = std::fs::remove_dir_all("batch_db_2"); 29 | /// # Ok(()) } 30 | /// ``` 31 | #[derive(Debug, Default, Clone, PartialEq, Eq)] 32 | pub struct Batch { 33 | pub(crate) writes: Map>, 34 | } 35 | 36 | impl Batch { 37 | /// Set a key to a new value 38 | pub fn insert(&mut self, key: K, value: V) 39 | where 40 | K: Into, 41 | V: Into, 42 | { 43 | self.writes.insert(key.into(), Some(value.into())); 44 | } 45 | 46 | /// Remove a key 47 | pub fn remove(&mut self, key: K) 48 | where 49 | K: Into, 50 | { 51 | self.writes.insert(key.into(), None); 52 | } 53 | 54 | /// Get a value if it is present in the `Batch`. 55 | /// `Some(None)` means it's present as a deletion. 56 | pub fn get>(&self, k: K) -> Option> { 57 | let inner = self.writes.get(k.as_ref())?; 58 | Some(inner.as_ref()) 59 | } 60 | } 61 | -------------------------------------------------------------------------------- /src/doc/engineering_practices/mod.rs: -------------------------------------------------------------------------------- 1 | //! Over the years that sled development has been active, some practices have 2 | //! 
been collected that have helped to reduce risks throughout the codebase. 3 | //! 4 | //! # high-level 5 | //! 6 | //! * Start with the correctness requirements, ignore the performance impact 7 | //! until the end. You'll usually write something faster by focusing on 8 | //! keeping things minimal anyway. 9 | //! * Throw away what can't be done in a day of coding. When you rewrite it 10 | //! tomorrow, it will be simpler. 11 | //! 12 | //! # testing 13 | //! 14 | //! * Don't do what can't be tested to be correct 15 | //! * For concurrent code, it must be delayable to induce strange histories when 16 | //! running under test 17 | //! * For IO code, it must have a failpoint so that IO errors can be injected 18 | //! during testing, as most bugs in cloud systems happen in the untested 19 | //! error-handling code 20 | //! * Lean heavily into model-based property testing. sled should act like a 21 | //! `BTreeMap`, even after crashes 22 | //! 23 | //! # when testing and performance collide 24 | //! 25 | //! * cold code is buggy code 26 | //! * if you see a significant optimization that will make correctness-critical 27 | //! codepaths harder to hit in tests, the optimization should only be created 28 | //! if it's possible to artificially increase the chances of hitting the 29 | //! codepath in test. For example, sled defaults to having an 8mb write 30 | //! buffer, but during tests we often turn it down to 512 bytes so that we can 31 | //! really abuse the correctness-critical aspects of its behavior. 32 | //! 33 | //! # numbers 34 | //! 35 | //! * No silent truncation should ever occur when converting numbers 36 | //! * No silent wrapping should occur 37 | //! * Crash or return a `ReportableBug` error in these cases 38 | //! * `as` is forbidden for anything that could lose information (a checked-conversion sketch follows at the end of this module) 39 | //! * Clippy's cast lints help us here, and it has been added to all pull 40 | //! requests 41 | 42 | //! # package 43 | //! 44 | //! * dependencies should be minimized to keep compilation simple 45 | //! 46 | //! # coding conventions 47 | //! 48 | //! * Self should be avoided. We have a lot of code, and it provides no context 49 | //! if people are jumping around a lot. Redundancy here improves orientation.
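//!
//! As an illustration of the numbers conventions above, here is a minimal
//! sketch of preferring a checked conversion over `as` (the `header_len`
//! helper and the `u16` field width are hypothetical, not part of sled's API):
//!
//! ```rust
//! use std::convert::TryFrom;
//!
//! // `len as u16` would silently truncate large values; `try_from` surfaces
//! // the problem so it can become a crash or a `ReportableBug`-style error.
//! fn header_len(len: usize) -> Result<u16, std::num::TryFromIntError> {
//!     u16::try_from(len)
//! }
//!
//! assert_eq!(header_len(512).unwrap(), 512);
//! assert!(header_len(70_000).is_err());
//! ```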
50 | -------------------------------------------------------------------------------- /tests/test_quiescent.rs: -------------------------------------------------------------------------------- 1 | #![cfg(all(target_os = "linux", not(miri)))] 2 | 3 | mod common; 4 | 5 | use std::time::{Duration, Instant}; 6 | 7 | use common::cleanup; 8 | 9 | #[test] 10 | fn quiescent_cpu_time() { 11 | const DB_DIR: &str = "sleeper"; 12 | cleanup(DB_DIR); 13 | 14 | fn run() { 15 | let start = Instant::now(); 16 | let db = sled::open(DB_DIR).unwrap(); 17 | std::thread::sleep(Duration::from_secs(10)); 18 | drop(db); 19 | let end = Instant::now(); 20 | 21 | let (user_cpu_time, system_cpu_time) = unsafe { 22 | let mut resource_usage: libc::rusage = std::mem::zeroed(); 23 | let return_value = libc::getrusage( 24 | libc::RUSAGE_SELF, 25 | (&mut resource_usage) as *mut libc::rusage, 26 | ); 27 | if return_value != 0 { 28 | panic!("error {} from getrusage()", *libc::__errno_location()); 29 | } 30 | (resource_usage.ru_utime, resource_usage.ru_stime) 31 | }; 32 | 33 | let user_cpu_seconds = 34 | user_cpu_time.tv_sec as f64 + user_cpu_time.tv_usec as f64 * 1e-6; 35 | let system_cpu_seconds = system_cpu_time.tv_sec as f64 36 | + system_cpu_time.tv_usec as f64 * 1e-6; 37 | let real_time_elapsed = end.duration_since(start); 38 | 39 | if user_cpu_seconds + system_cpu_seconds > 1.0 { 40 | panic!( 41 | "Database used too much CPU during a quiescent workload. User: {}s, system: {}s (wall clock: {}s)", 42 | user_cpu_seconds, 43 | system_cpu_seconds, 44 | real_time_elapsed.as_secs_f64(), 45 | ); 46 | } 47 | } 48 | 49 | let child = unsafe { libc::fork() }; 50 | if child == 0 { 51 | common::setup_logger(); 52 | if let Err(e) = std::thread::spawn(run).join() { 53 | println!("test failed: {:?}", e); 54 | std::process::exit(15); 55 | } else { 56 | std::process::exit(0); 57 | } 58 | } else { 59 | let mut status = 0; 60 | unsafe { 61 | libc::waitpid(child, &mut status as *mut libc::c_int, 0); 62 | } 63 | if status != 0 { 64 | cleanup(DB_DIR); 65 | panic!("child exited abnormally"); 66 | } 67 | } 68 | 69 | cleanup(DB_DIR); 70 | } 71 | -------------------------------------------------------------------------------- /src/pagecache/disk_pointer.rs: -------------------------------------------------------------------------------- 1 | use std::num::NonZeroU64; 2 | 3 | use super::{HeapId, LogOffset}; 4 | use crate::*; 5 | 6 | /// A pointer to a location on disk or an off-log heap item. 7 | #[derive(Debug, Clone, PartialOrd, Ord, Copy, Eq, PartialEq)] 8 | pub enum DiskPtr { 9 | /// Points to a value stored in the single-file log. 10 | Inline(LogOffset), 11 | /// Points to a value stored off-log in the heap. 
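/// The `Option<NonZeroU64>` holds the log offset at which this heap
/// pointer was written; `forget_heap_log_coordinates` clears it once the
/// pointer has been merged into a snapshot.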
12 | Heap(Option, HeapId), 13 | } 14 | 15 | impl DiskPtr { 16 | pub(crate) fn new_inline(l: LogOffset) -> Self { 17 | DiskPtr::Inline(l) 18 | } 19 | 20 | pub(crate) fn new_heap_item(lid: LogOffset, heap_id: HeapId) -> Self { 21 | DiskPtr::Heap(Some(NonZeroU64::new(lid).unwrap()), heap_id) 22 | } 23 | 24 | pub(crate) fn is_inline(&self) -> bool { 25 | match self { 26 | DiskPtr::Inline(_) => true, 27 | DiskPtr::Heap(_, _) => false, 28 | } 29 | } 30 | 31 | pub(crate) fn is_heap_item(&self) -> bool { 32 | match self { 33 | DiskPtr::Inline(_) => false, 34 | DiskPtr::Heap(_, _) => true, 35 | } 36 | } 37 | 38 | pub(crate) fn heap_id(&self) -> Option { 39 | if let DiskPtr::Heap(_, heap_id) = self { Some(*heap_id) } else { None } 40 | } 41 | 42 | #[doc(hidden)] 43 | pub fn lid(&self) -> Option { 44 | match self { 45 | DiskPtr::Inline(lid) => Some(*lid), 46 | DiskPtr::Heap(lid, _) => lid.map(NonZeroU64::get), 47 | } 48 | } 49 | 50 | pub(crate) fn forget_heap_log_coordinates(&mut self) { 51 | match self { 52 | DiskPtr::Inline(_) => {} 53 | DiskPtr::Heap(ref mut opt, _) => *opt = None, 54 | } 55 | } 56 | 57 | pub(crate) fn original_lsn(&self) -> Lsn { 58 | match self { 59 | DiskPtr::Heap(_, heap_id) => heap_id.original_lsn, 60 | DiskPtr::Inline(_) => panic!("called original_lsn on non-Heap"), 61 | } 62 | } 63 | 64 | pub(crate) fn heap_pointer_merged_into_snapshot(&self) -> bool { 65 | if let DiskPtr::Heap(None, _) = self { true } else { false } 66 | } 67 | } 68 | 69 | impl fmt::Display for DiskPtr { 70 | fn fmt( 71 | &self, 72 | f: &mut fmt::Formatter<'_>, 73 | ) -> std::result::Result<(), fmt::Error> { 74 | write!(f, "{:?}", self) 75 | } 76 | } 77 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Welcome to the Project :) 2 | 3 | * Don't be a jerk - here's our [code of conduct](./code-of-conduct.md). 4 | We have a track record of defending our community from harm. 5 | 6 | There are at least three great ways to contribute to sled: 7 | 8 | * [financial contribution](https://github.com/sponsors/spacejam) 9 | * coding 10 | * conversation 11 | 12 | #### Coding Considerations: 13 | 14 | Please don't waste your time or ours by implementing things that 15 | we do not want to introduce and maintain. Please discuss in an 16 | issue or on chat before submitting a PR with: 17 | 18 | * public API changes 19 | * new functionality of any sort 20 | * additional unsafe code 21 | * significant refactoring 22 | 23 | The above changes are unlikely to be merged or receive 24 | timely attention without prior discussion. 25 | 26 | PRs that generally require less coordination beforehand: 27 | 28 | * Anything addressing a correctness issue. 29 | * Better docs: whatever you find confusing! 30 | * Small code changes with big performance implications, substantiated with [responsibly-gathered metrics](https://sled.rs/perf#experiment-checklist). 31 | * FFI submodule changes: these are generally less well maintained than the Rust core, and benefit more from public assistance. 32 | * Generally any new kind of test that avoids biases inherent in the others. 33 | 34 | ####### All PRs block on failing tests! 
35 | 
36 | sled has intense testing, including crash tests, multi-threaded tests with
37 | delay injection, a variety of mechanically-generated tests that combine fault
38 | injection with concurrency in interesting ways, cross-compilation and minimum
39 | supported Rust version checks, LLVM sanitizers, and more. It can sometimes be
40 | challenging to understand why something is failing these intense tests.
41 | 
42 | To better understand test failures, please:
43 | 
44 | 1. read the failing test name and output log for clues
45 | 1. try to reproduce the failed test locally by running its associated command from the [test script](https://github.com/spacejam/sled/blob/master/.github/workflows/test.yml)
46 | 1. If it is still not clear why your test is failing, feel free to ask for help understanding it on Discord or on the PR, and we will do our best to help.
47 | 
48 | Want to help sled but don't have time for individual contributions? Contribute via [GitHub Sponsors](https://github.com/sponsors/spacejam) to support the people pushing the project forward!
49 | 
--------------------------------------------------------------------------------
/examples/playground.rs:
--------------------------------------------------------------------------------
 1 | extern crate sled;
 2 | 
 3 | use sled::{Config, Result};
 4 | 
 5 | fn basic() -> Result<()> {
 6 |     let config = Config::new().temporary(true);
 7 | 
 8 |     let db = config.open()?;
 9 | 
10 |     let k = b"k".to_vec();
11 |     let v1 = b"v1".to_vec();
12 |     let v2 = b"v2".to_vec();
13 | 
14 |     // set and get
15 |     db.insert(k.clone(), v1.clone())?;
16 |     assert_eq!(db.get(&k).unwrap().unwrap(), (v1));
17 | 
18 |     // compare and swap
19 |     match db.compare_and_swap(k.clone(), Some(&v1), Some(v2.clone()))? {
20 |         Ok(()) => println!("it worked!"),
21 |         Err(sled::CompareAndSwapError { current: cur, proposed: _ }) => {
22 |             println!("the actual current value is {:?}", cur)
23 |         }
24 |     }
25 | 
26 |     // scan forward
27 |     let mut iter = db.range(k.as_slice()..);
28 |     let (k1, v1) = iter.next().unwrap().unwrap();
29 |     assert_eq!(v1, v2);
30 |     assert_eq!(k1, k);
31 |     assert_eq!(iter.next(), None);
32 | 
33 |     // deletion
34 |     db.remove(&k)?;
35 | 
36 |     Ok(())
37 | }
38 | 
39 | fn merge_operator() -> Result<()> {
40 |     fn concatenate_merge(
41 |         _key: &[u8],              // the key being merged
42 |         old_value: Option<&[u8]>, // the previous value, if one existed
43 |         merged_bytes: &[u8],      // the new bytes being merged in
44 |     ) -> Option<Vec<u8>> {
45 |         // set the new value, return None to delete
46 |         let mut ret = old_value.map_or_else(Vec::new, |ov| ov.to_vec());
47 | 
48 |         ret.extend_from_slice(merged_bytes);
49 | 
50 |         Some(ret)
51 |     }
52 | 
53 |     let config = Config::new().temporary(true);
54 | 
55 |     let db = config.open()?;
56 |     db.set_merge_operator(concatenate_merge);
57 | 
58 |     let k = b"k".to_vec();
59 | 
60 |     db.insert(k.clone(), vec![0])?;
61 |     db.merge(k.clone(), vec![1])?;
62 |     db.merge(k.clone(), vec![2])?;
63 |     assert_eq!(db.get(&*k).unwrap().unwrap(), (vec![0, 1, 2]));
64 | 
65 |     // sets replace previously merged data,
66 |     // bypassing the merge function.
67 | db.insert(k.clone(), vec![3])?; 68 | assert_eq!(db.get(&*k).unwrap().unwrap(), (vec![3])); 69 | 70 | // merges on non-present values will add them 71 | db.remove(&*k)?; 72 | db.merge(k.clone(), vec![4])?; 73 | assert_eq!(db.get(&*k).unwrap().unwrap(), (vec![4])); 74 | 75 | Ok(()) 76 | } 77 | 78 | fn main() -> Result<()> { 79 | basic()?; 80 | merge_operator() 81 | } 82 | -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "sled" 3 | version = "0.34.6" 4 | authors = ["Tyler Neely "] 5 | description = "Lightweight high-performance pure-rust transactional embedded database." 6 | license = "MIT/Apache-2.0" 7 | homepage = "https://github.com/spacejam/sled" 8 | repository = "https://github.com/spacejam/sled" 9 | keywords = ["redis", "mongo", "sqlite", "lmdb", "rocksdb"] 10 | categories = ["database-implementations", "concurrency", "data-structures", "algorithms", "caching"] 11 | documentation = "https://docs.rs/sled/" 12 | readme = "README.md" 13 | edition = "2018" 14 | exclude = ["benchmarks", "examples", "bindings", "scripts", "experiments"] 15 | 16 | [package.metadata.docs.rs] 17 | features = ["docs"] 18 | 19 | [badges] 20 | maintenance = { status = "actively-developed" } 21 | 22 | [profile.release] 23 | debug = true 24 | opt-level = 3 25 | 26 | [features] 27 | default = [] 28 | # Do not use the "testing" feature in your own testing code, this is for 29 | # internal testing use only. It injects many delays and performs several 30 | # test-only configurations that cause performance to drop significantly. 31 | # It will cause your tests to take much more time, and possibly time out etc... 32 | testing = ["event_log", "lock_free_delays", "compression", "failpoints", "backtrace"] 33 | compression = ["zstd"] 34 | lock_free_delays = [] 35 | failpoints = [] 36 | event_log = [] 37 | metrics = [] 38 | no_logs = ["log/max_level_off"] 39 | no_inline = [] 40 | measure_allocs = [] 41 | pretty_backtrace = ["color-backtrace"] 42 | io_uring = ["rio"] 43 | docs = [] 44 | miri_optimizations = [] 45 | mutex = [] 46 | 47 | [dependencies] 48 | crossbeam-epoch = "0.9.1" 49 | crossbeam-utils = "0.8.1" 50 | fxhash = "0.2.1" 51 | libc = "0.2.81" 52 | zstd = { version = "0.6.0", optional = true } 53 | crc32fast = "1.2.1" 54 | log = "0.4.11" 55 | parking_lot = "0.11.1" 56 | color-backtrace = { version = "0.5.0", optional = true } 57 | rio = { version = "0.9.4", optional = true } 58 | backtrace = { version = "0.3.55", optional = true } 59 | 60 | [target.'cfg(any(target_os = "linux", target_os = "macos", target_os="windows"))'.dependencies] 61 | fs2 = "0.4.3" 62 | 63 | [dev-dependencies] 64 | rand = "0.7.0" 65 | rand_chacha = "0.3.0" 66 | rand_distr = "0.3.0" 67 | quickcheck = "0.9.2" 68 | log = "0.4.11" 69 | env_logger = "0.8.2" 70 | zerocopy = "0.3.0" 71 | byteorder = "1.3.4" 72 | 73 | [[test]] 74 | name = "test_crash_recovery" 75 | path = "tests/test_crash_recovery.rs" 76 | harness = false 77 | -------------------------------------------------------------------------------- /src/pagecache/parallel_io_windows.rs: -------------------------------------------------------------------------------- 1 | use std::convert::TryFrom; 2 | use std::fs::File; 3 | use std::io; 4 | use std::os::windows::fs::FileExt; 5 | 6 | use super::LogOffset; 7 | 8 | fn seek_read_exact( 9 | file: &mut F, 10 | mut buf: &mut [u8], 11 | mut offset: u64, 12 | ) -> io::Result<()> { 13 | while 
!buf.is_empty() { 14 | match file.seek_read(buf, offset) { 15 | Ok(0) => break, 16 | Ok(n) => { 17 | let tmp = buf; 18 | buf = &mut tmp[n..]; 19 | offset += n as u64; 20 | } 21 | Err(ref e) if e.kind() == io::ErrorKind::Interrupted => {} 22 | Err(e) => return Err(e), 23 | } 24 | } 25 | if !buf.is_empty() { 26 | Err(io::Error::new( 27 | io::ErrorKind::UnexpectedEof, 28 | "failed to fill whole buffer", 29 | )) 30 | } else { 31 | Ok(()) 32 | } 33 | } 34 | 35 | fn seek_write_all( 36 | file: &mut F, 37 | mut buf: &[u8], 38 | mut offset: u64, 39 | ) -> io::Result<()> { 40 | while !buf.is_empty() { 41 | match file.seek_write(buf, offset) { 42 | Ok(0) => { 43 | return Err(io::Error::new( 44 | io::ErrorKind::WriteZero, 45 | "failed to write whole buffer", 46 | )); 47 | } 48 | Ok(n) => { 49 | buf = &buf[n..]; 50 | offset += n as u64; 51 | } 52 | Err(ref e) if e.kind() == io::ErrorKind::Interrupted => {} 53 | Err(e) => return Err(e), 54 | } 55 | } 56 | Ok(()) 57 | } 58 | 59 | pub(crate) fn pread_exact_or_eof( 60 | file: &File, 61 | mut buf: &mut [u8], 62 | offset: LogOffset, 63 | ) -> io::Result { 64 | let mut total = 0_usize; 65 | while !buf.is_empty() { 66 | match file.seek_read(buf, offset + u64::try_from(total).unwrap()) { 67 | Ok(0) => break, 68 | Ok(n) => { 69 | total += n; 70 | let tmp = buf; 71 | buf = &mut tmp[n..]; 72 | } 73 | Err(ref e) if e.kind() == io::ErrorKind::Interrupted => {} 74 | Err(e) => return Err(e), 75 | } 76 | } 77 | Ok(total) 78 | } 79 | 80 | pub(crate) fn pread_exact( 81 | file: &File, 82 | buf: &mut [u8], 83 | offset: LogOffset, 84 | ) -> io::Result<()> { 85 | let mut f = file.try_clone()?; 86 | seek_read_exact(&mut f, buf, offset) 87 | } 88 | 89 | pub(crate) fn pwrite_all( 90 | file: &File, 91 | buf: &[u8], 92 | offset: LogOffset, 93 | ) -> io::Result<()> { 94 | let mut f = file.try_clone()?; 95 | seek_write_all(&mut f, buf, offset) 96 | } 97 | -------------------------------------------------------------------------------- /src/pagecache/parallel_io_polyfill.rs: -------------------------------------------------------------------------------- 1 | use std::fs::File; 2 | use std::io::{self, Read, Seek, Write}; 3 | 4 | use parking_lot::Mutex; 5 | 6 | use super::LogOffset; 7 | 8 | fn init_mu() -> Mutex<()> { 9 | Mutex::new(()) 10 | } 11 | 12 | type MutexInit = fn() -> Mutex<()>; 13 | 14 | static GLOBAL_FILE_LOCK: crate::Lazy, MutexInit> = 15 | crate::Lazy::new(init_mu); 16 | 17 | pub(crate) fn pread_exact_or_eof( 18 | file: &File, 19 | mut buf: &mut [u8], 20 | offset: LogOffset, 21 | ) -> io::Result { 22 | let _lock = GLOBAL_FILE_LOCK.lock(); 23 | 24 | let mut f = file.try_clone()?; 25 | 26 | let _ = f.seek(io::SeekFrom::Start(offset))?; 27 | 28 | let mut total = 0; 29 | while !buf.is_empty() { 30 | match f.read(buf) { 31 | Ok(0) => break, 32 | Ok(n) => { 33 | total += n; 34 | let tmp = buf; 35 | buf = &mut tmp[n..]; 36 | } 37 | Err(ref e) if e.kind() == io::ErrorKind::Interrupted => {} 38 | Err(e) => return Err(e), 39 | } 40 | } 41 | Ok(total) 42 | } 43 | 44 | pub(crate) fn pread_exact( 45 | file: &File, 46 | mut buf: &mut [u8], 47 | offset: LogOffset, 48 | ) -> io::Result<()> { 49 | let _lock = GLOBAL_FILE_LOCK.lock(); 50 | 51 | let mut f = file.try_clone()?; 52 | 53 | let _ = f.seek(io::SeekFrom::Start(offset))?; 54 | 55 | while !buf.is_empty() { 56 | match f.read(buf) { 57 | Ok(0) => break, 58 | Ok(n) => { 59 | let tmp = buf; 60 | buf = &mut tmp[n..]; 61 | } 62 | Err(ref e) if e.kind() == io::ErrorKind::Interrupted => {} 63 | Err(e) => return Err(e), 64 | } 65 | } 66 | 
if !buf.is_empty() { 67 | Err(io::Error::new( 68 | io::ErrorKind::UnexpectedEof, 69 | "failed to fill whole buffer", 70 | )) 71 | } else { 72 | Ok(()) 73 | } 74 | } 75 | 76 | pub(crate) fn pwrite_all( 77 | file: &File, 78 | mut buf: &[u8], 79 | offset: LogOffset, 80 | ) -> io::Result<()> { 81 | let _lock = GLOBAL_FILE_LOCK.lock(); 82 | 83 | let mut f = file.try_clone()?; 84 | 85 | let _ = f.seek(io::SeekFrom::Start(offset))?; 86 | 87 | while !buf.is_empty() { 88 | match f.write(buf) { 89 | Ok(0) => { 90 | return Err(io::Error::new( 91 | io::ErrorKind::WriteZero, 92 | "failed to write whole buffer", 93 | )); 94 | } 95 | Ok(n) => buf = &buf[n..], 96 | Err(ref e) if e.kind() == io::ErrorKind::Interrupted => {} 97 | Err(e) => return Err(e), 98 | } 99 | } 100 | Ok(()) 101 | } 102 | -------------------------------------------------------------------------------- /src/doc/mod.rs: -------------------------------------------------------------------------------- 1 | //! #### what is sled? 2 | //! 3 | //! * an embedded kv store 4 | //! * a construction kit for stateful systems 5 | //! * ordered map API similar to a Rust `BTreeMap, Vec>` 6 | //! * fully atomic single-key operations, supports CAS 7 | //! * zero-copy reads 8 | //! * merge operators 9 | //! * forward and reverse iterators 10 | //! * a monotonic ID generator capable of giving out 75-125+ million unique IDs 11 | //! per second, never double allocating even in the presence of crashes 12 | //! * [zstd](https://github.com/facebook/zstd) compression (use the zstd build 13 | //! feature) 14 | //! * cpu-scalable lock-free implementation 15 | //! * SSD-optimized log-structured storage 16 | //! 17 | //! #### why another kv store? 18 | //! 19 | //! People face unnecessary hardship when working with existing embedded 20 | //! databases. They tend to have sharp performance trade-offs, are difficult to 21 | //! tune, have unclear consistency guarantees, and are generally inflexible. 22 | //! Facebook uses distributed machine learning to find configurations that 23 | //! achieve great performance for specific workloads on rocksdb. Most engineers 24 | //! don't have access to that kind of infrastructure. We would like to build 25 | //! sled so that it can be optimized using simple local methods, with as little 26 | //! user input as possible, and in many cases exceed the performance of popular 27 | //! systems today. 28 | //! 29 | //! This is how we aim to improve the situation: 30 | //! 31 | //! 1. don't make the user think. the interface should be obvious. 32 | //! 1. don't surprise users with performance traps. 33 | //! 1. don't wake up operators. bring reliability techniques from academia into 34 | //! real-world practice. 1. don't use so much electricity. our data structures 35 | //! should play to modern hardware's strengths. 36 | //! 37 | //! sled is written by people with experience designing, building, testing, and 38 | //! operating databases at high scales. we think the situation can be improved. 39 | //! 40 | //! #### targeted toward our vision of the future 41 | //! Building a database takes years. Designers of databases make bets about 42 | //! target usage and hardware. Here are the trends that we see, which we want to 43 | //! optimize the experience around: 44 | //! 45 | //! 1. more cores on servers, spanning sockets and numa domains 46 | //! 1. the vast majority of content consumption and generation happening on 47 | //! phones 1. compute migrating to the edge, into CDNs 48 | //! 1. 
conflict-free and OT-based replication techniques at the edge 49 | //! 1. strongly-consistent replication techniques within and between datacenters 50 | //! 1. event-driven architectures which benefit heavily from subscriber/watch 51 | //! semantics 52 | 53 | pub mod engineering_practices; 54 | pub mod limits; 55 | pub mod merge_operators; 56 | pub mod performance_guide; 57 | pub mod reactive_semantics; 58 | pub mod sled_architectural_outlook; 59 | pub mod testing_strategies; 60 | -------------------------------------------------------------------------------- /src/lazy.rs: -------------------------------------------------------------------------------- 1 | //! This module exists because `lazy_static` causes TSAN to 2 | //! be very unhappy. We rely heavily on TSAN for finding 3 | //! races, so we don't use `lazy_static`. 4 | 5 | use std::sync::atomic::{ 6 | AtomicBool, AtomicPtr, 7 | Ordering::{Acquire, SeqCst}, 8 | }; 9 | 10 | /// A lazily initialized value 11 | pub struct Lazy { 12 | value: AtomicPtr, 13 | init_mu: AtomicBool, 14 | init: F, 15 | } 16 | 17 | impl Lazy { 18 | /// Create a new Lazy 19 | pub const fn new(init: F) -> Self 20 | where 21 | F: Sized, 22 | { 23 | Self { 24 | value: AtomicPtr::new(std::ptr::null_mut()), 25 | init_mu: AtomicBool::new(false), 26 | init, 27 | } 28 | } 29 | } 30 | 31 | impl Drop for Lazy { 32 | fn drop(&mut self) { 33 | let value_ptr = self.value.load(Acquire); 34 | if !value_ptr.is_null() { 35 | #[allow(unsafe_code)] 36 | unsafe { 37 | drop(Box::from_raw(value_ptr)) 38 | } 39 | } 40 | } 41 | } 42 | 43 | impl std::ops::Deref for Lazy 44 | where 45 | F: Fn() -> T, 46 | { 47 | type Target = T; 48 | 49 | fn deref(&self) -> &T { 50 | { 51 | let value_ptr = self.value.load(Acquire); 52 | if !value_ptr.is_null() { 53 | #[allow(unsafe_code)] 54 | unsafe { 55 | return &*value_ptr; 56 | } 57 | } 58 | } 59 | 60 | // We want to keep looping as long as it returns true, 61 | // so we don't need any explicit conversion here. 62 | while self 63 | .init_mu 64 | .compare_exchange(false, true, SeqCst, SeqCst) 65 | .is_err() 66 | { 67 | std::sync::atomic::spin_loop_hint(); 68 | } 69 | 70 | { 71 | let value_ptr = self.value.load(Acquire); 72 | // we need to check this again because 73 | // maybe some other thread completed 74 | // the initialization already. 
75 | if !value_ptr.is_null() { 76 | let unlock = self.init_mu.swap(false, SeqCst); 77 | assert!(unlock); 78 | #[allow(unsafe_code)] 79 | unsafe { 80 | return &*value_ptr; 81 | } 82 | } 83 | } 84 | 85 | { 86 | let value = (self.init)(); 87 | let value_ptr = Box::into_raw(Box::new(value)); 88 | 89 | let old = self.value.swap(value_ptr, SeqCst); 90 | assert!(old.is_null()); 91 | 92 | let unlock = self.init_mu.swap(false, SeqCst); 93 | assert!(unlock); 94 | 95 | #[allow(unsafe_code)] 96 | unsafe { 97 | &*value_ptr 98 | } 99 | } 100 | } 101 | } 102 | -------------------------------------------------------------------------------- /experiments/new_segment_ownership/src/main.rs: -------------------------------------------------------------------------------- 1 | use std::sync::{ 2 | atomic::{AtomicUsize, Ordering}, 3 | Arc, 4 | }; 5 | 6 | const SZ: usize = 128; 7 | 8 | #[derive(Default, Debug)] 9 | struct Log { 10 | segment_accountant: Arc, 11 | io_buf: Arc, 12 | } 13 | 14 | impl Log { 15 | fn new() -> Log { 16 | let io_buf = Arc::new(IoBuf::default()); 17 | let segment_accountant = io_buf.segment.segment_accountant.clone(); 18 | Log { io_buf, segment_accountant } 19 | } 20 | 21 | fn reserve(&mut self, size: usize) -> Reservation { 22 | assert!(size <= SZ); 23 | if self.io_buf.buf.load(Ordering::SeqCst) + size > SZ { 24 | let segment = self.segment_accountant.clone().next_segment(); 25 | let buf = AtomicUsize::new(0); 26 | self.io_buf = Arc::new(IoBuf { segment, buf }); 27 | } 28 | let io_buf = self.io_buf.clone(); 29 | io_buf.buf.fetch_add(size, Ordering::SeqCst); 30 | Reservation { io_buf } 31 | } 32 | } 33 | 34 | #[derive(Default, Debug)] 35 | struct Reservation { 36 | io_buf: Arc, 37 | } 38 | 39 | #[derive(Default, Debug)] 40 | struct IoBuf { 41 | segment: Arc, 42 | buf: AtomicUsize, 43 | } 44 | 45 | #[derive(Default, Debug)] 46 | struct Segment { 47 | offset: usize, 48 | segment_accountant: Arc, 49 | } 50 | 51 | #[derive(Default, Debug)] 52 | struct SegmentAccountant { 53 | tip: AtomicUsize, 54 | free: Vec, 55 | } 56 | 57 | impl SegmentAccountant { 58 | fn next_segment(self: Arc) -> Arc { 59 | let offset = SZ + self.tip.fetch_add(SZ, Ordering::SeqCst); 60 | println!("setting new segment {}", offset); 61 | Arc::new(Segment { segment_accountant: self, offset }) 62 | } 63 | } 64 | 65 | fn main() { 66 | let mut log = Log::new(); 67 | { 68 | let _ = log.reserve(64); 69 | let _ = log.reserve(64); 70 | } 71 | println!("src/main.rs:70"); 72 | { 73 | let _ = log.reserve(128); 74 | } 75 | println!("src/main.rs:74"); 76 | { 77 | let _ = log.reserve(128); 78 | } 79 | println!("src/main.rs:78"); 80 | { 81 | let _ = log.reserve(128); 82 | } 83 | println!("src/main.rs:77"); 84 | } 85 | 86 | mod dropz { 87 | use super::*; 88 | 89 | impl Drop for IoBuf { 90 | fn drop(&mut self) { 91 | println!("IoBuf::drop"); 92 | } 93 | } 94 | impl Drop for Segment { 95 | fn drop(&mut self) { 96 | println!("dropping Segment {:?}", self.offset); 97 | } 98 | } 99 | impl Drop for SegmentAccountant { 100 | fn drop(&mut self) { 101 | println!("SegmentAccountant::drop"); 102 | } 103 | } 104 | impl Drop for Reservation { 105 | fn drop(&mut self) { 106 | println!("Reservation::drop"); 107 | } 108 | } 109 | } 110 | -------------------------------------------------------------------------------- /src/concurrency_control.rs: -------------------------------------------------------------------------------- 1 | #[cfg(feature = "testing")] 2 | use std::cell::RefCell; 3 | use std::sync::atomic::AtomicBool; 4 | 5 | use 
parking_lot::{RwLockReadGuard, RwLockWriteGuard}; 6 | 7 | use super::*; 8 | 9 | #[cfg(feature = "testing")] 10 | thread_local! { 11 | pub static COUNT: RefCell = RefCell::new(0); 12 | } 13 | 14 | const RW_REQUIRED_BIT: usize = 1 << 31; 15 | 16 | #[derive(Default)] 17 | pub(crate) struct ConcurrencyControl { 18 | active: AtomicUsize, 19 | upgrade_complete: AtomicBool, 20 | rw: RwLock<()>, 21 | } 22 | 23 | static CONCURRENCY_CONTROL: Lazy< 24 | ConcurrencyControl, 25 | fn() -> ConcurrencyControl, 26 | > = Lazy::new(init_cc); 27 | 28 | fn init_cc() -> ConcurrencyControl { 29 | ConcurrencyControl::default() 30 | } 31 | 32 | #[derive(Debug)] 33 | #[must_use] 34 | pub(crate) enum Protector<'a> { 35 | Write(RwLockWriteGuard<'a, ()>), 36 | Read(RwLockReadGuard<'a, ()>), 37 | None(&'a AtomicUsize), 38 | } 39 | 40 | impl<'a> Drop for Protector<'a> { 41 | fn drop(&mut self) { 42 | if let Protector::None(active) = self { 43 | active.fetch_sub(1, Release); 44 | } 45 | #[cfg(feature = "testing")] 46 | COUNT.with(|c| { 47 | let mut c = c.borrow_mut(); 48 | *c -= 1; 49 | assert_eq!(*c, 0); 50 | }); 51 | } 52 | } 53 | 54 | pub(crate) fn read<'a>() -> Protector<'a> { 55 | CONCURRENCY_CONTROL.read() 56 | } 57 | 58 | pub(crate) fn write<'a>() -> Protector<'a> { 59 | CONCURRENCY_CONTROL.write() 60 | } 61 | 62 | impl ConcurrencyControl { 63 | fn enable(&self) { 64 | if self.active.fetch_or(RW_REQUIRED_BIT, SeqCst) < RW_REQUIRED_BIT { 65 | // we are the first to set this bit 66 | while self.active.load(Acquire) != RW_REQUIRED_BIT { 67 | std::sync::atomic::spin_loop_hint() 68 | } 69 | self.upgrade_complete.store(true, Release); 70 | } 71 | } 72 | 73 | fn read(&self) -> Protector<'_> { 74 | #[cfg(feature = "testing")] 75 | COUNT.with(|c| { 76 | let mut c = c.borrow_mut(); 77 | *c += 1; 78 | assert_eq!(*c, 1); 79 | }); 80 | 81 | let active = self.active.fetch_add(1, Release); 82 | 83 | if active >= RW_REQUIRED_BIT { 84 | self.active.fetch_sub(1, Release); 85 | Protector::Read(self.rw.read()) 86 | } else { 87 | Protector::None(&self.active) 88 | } 89 | } 90 | 91 | fn write(&self) -> Protector<'_> { 92 | #[cfg(feature = "testing")] 93 | COUNT.with(|c| { 94 | let mut c = c.borrow_mut(); 95 | *c += 1; 96 | assert_eq!(*c, 1); 97 | }); 98 | self.enable(); 99 | while !self.upgrade_complete.load(Acquire) { 100 | std::sync::atomic::spin_loop_hint() 101 | } 102 | Protector::Write(self.rw.write()) 103 | } 104 | } 105 | -------------------------------------------------------------------------------- /src/varint.rs: -------------------------------------------------------------------------------- 1 | use std::convert::TryFrom; 2 | 3 | /// Returns the number of bytes that this varint will need 4 | pub fn size(int: u64) -> usize { 5 | if int <= 240 { 6 | 1 7 | } else if int <= 2287 { 8 | 2 9 | } else if int <= 67823 { 10 | 3 11 | } else if int <= 0x00FF_FFFF { 12 | 4 13 | } else if int <= 0xFFFF_FFFF { 14 | 5 15 | } else if int <= 0x00FF_FFFF_FFFF { 16 | 6 17 | } else if int <= 0xFFFF_FFFF_FFFF { 18 | 7 19 | } else if int <= 0x00FF_FFFF_FFFF_FFFF { 20 | 8 21 | } else { 22 | 9 23 | } 24 | } 25 | 26 | /// Returns how many bytes the varint consumed while serializing 27 | pub fn serialize_into(int: u64, buf: &mut [u8]) -> usize { 28 | if int <= 240 { 29 | buf[0] = u8::try_from(int).unwrap(); 30 | 1 31 | } else if int <= 2287 { 32 | buf[0] = u8::try_from((int - 240) / 256 + 241).unwrap(); 33 | buf[1] = u8::try_from((int - 240) % 256).unwrap(); 34 | 2 35 | } else if int <= 67823 { 36 | buf[0] = 249; 37 | buf[1] = u8::try_from((int - 
2288) / 256).unwrap(); 38 | buf[2] = u8::try_from((int - 2288) % 256).unwrap(); 39 | 3 40 | } else if int <= 0x00FF_FFFF { 41 | buf[0] = 250; 42 | let bytes = int.to_le_bytes(); 43 | buf[1..4].copy_from_slice(&bytes[..3]); 44 | 4 45 | } else if int <= 0xFFFF_FFFF { 46 | buf[0] = 251; 47 | let bytes = int.to_le_bytes(); 48 | buf[1..5].copy_from_slice(&bytes[..4]); 49 | 5 50 | } else if int <= 0x00FF_FFFF_FFFF { 51 | buf[0] = 252; 52 | let bytes = int.to_le_bytes(); 53 | buf[1..6].copy_from_slice(&bytes[..5]); 54 | 6 55 | } else if int <= 0xFFFF_FFFF_FFFF { 56 | buf[0] = 253; 57 | let bytes = int.to_le_bytes(); 58 | buf[1..7].copy_from_slice(&bytes[..6]); 59 | 7 60 | } else if int <= 0x00FF_FFFF_FFFF_FFFF { 61 | buf[0] = 254; 62 | let bytes = int.to_le_bytes(); 63 | buf[1..8].copy_from_slice(&bytes[..7]); 64 | 8 65 | } else { 66 | buf[0] = 255; 67 | let bytes = int.to_le_bytes(); 68 | buf[1..9].copy_from_slice(&bytes[..8]); 69 | 9 70 | } 71 | } 72 | 73 | /// Returns the deserialized varint, along with how many bytes 74 | /// were taken up by the varint. 75 | pub fn deserialize(buf: &[u8]) -> crate::Result<(u64, usize)> { 76 | if buf.is_empty() { 77 | return Err(crate::Error::corruption(None)); 78 | } 79 | let res = match buf[0] { 80 | 0..=240 => (u64::from(buf[0]), 1), 81 | 241..=248 => { 82 | let varint = 83 | 240 + 256 * (u64::from(buf[0]) - 241) + u64::from(buf[1]); 84 | (varint, 2) 85 | } 86 | 249 => { 87 | let varint = 2288 + 256 * u64::from(buf[1]) + u64::from(buf[2]); 88 | (varint, 3) 89 | } 90 | other => { 91 | let sz = other as usize - 247; 92 | let mut aligned = [0; 8]; 93 | aligned[..sz].copy_from_slice(&buf[1..=sz]); 94 | let varint = u64::from_le_bytes(aligned); 95 | (varint, sz + 1) 96 | } 97 | }; 98 | Ok(res) 99 | } 100 | -------------------------------------------------------------------------------- /.github/workflows/test.yml: -------------------------------------------------------------------------------- 1 | name: Rust 2 | 3 | on: 4 | pull_request: 5 | branches: 6 | - master 7 | 8 | jobs: 9 | clippy_check: 10 | runs-on: ubuntu-latest 11 | steps: 12 | - uses: actions/checkout@v1 13 | - uses: actions-rs/toolchain@v1 14 | with: 15 | toolchain: nightly 16 | components: clippy 17 | override: true 18 | - run: rustup component add clippy 19 | - uses: actions-rs/clippy-check@v1 20 | with: 21 | token: ${{ secrets.GITHUB_TOKEN }} 22 | args: --all-features 23 | default: 24 | name: Cargo Test on ${{ matrix.os }} 25 | runs-on: ${{ matrix.os }} 26 | strategy: 27 | fail-fast: false 28 | matrix: 29 | os: [ubuntu-latest, macos-latest, windows-latest] 30 | steps: 31 | - uses: actions/checkout@v1 32 | - name: Cache target 33 | uses: actions/cache@v1 34 | env: 35 | cache-name: cache-target 36 | RUST_BACKTRACE: 1 37 | with: 38 | path: target 39 | key: ${{ runner.os }}-${{ env.cache-name }} 40 | restore-keys: | 41 | ${{ runner.os }}- 42 | - name: cargo test 43 | run: | 44 | rustup update --no-self-update 45 | cargo test --release --no-default-features --features=testing -- --nocapture 46 | examples: 47 | name: Example Tests 48 | runs-on: ubuntu-latest 49 | steps: 50 | - uses: actions/checkout@v1 51 | - name: Cache target 52 | uses: actions/cache@v1 53 | env: 54 | cache-name: cache-target 55 | with: 56 | path: target 57 | key: ${{ runner.os }}-${{ env.cache-name }} 58 | restore-keys: | 59 | ${{ runner.os }}- 60 | - name: example tests 61 | run: | 62 | rustup update --no-self-update 63 | cargo run --example playground 64 | cargo run --example structured 65 | cross-compile: 66 | name: Cross 
Compile 67 | runs-on: macos-latest 68 | steps: 69 | - uses: actions/checkout@v1 70 | - name: cross compile 71 | run: | 72 | set -eo pipefail 73 | echo "cross build" 74 | scripts/cross_compile.sh 75 | burn-in: 76 | name: Burn In 77 | runs-on: ubuntu-latest 78 | steps: 79 | - uses: actions/checkout@v1 80 | - name: Cache target 81 | uses: actions/cache@v1 82 | env: 83 | cache-name: cache-target 84 | with: 85 | path: target 86 | key: ${{ runner.os }}-${{ env.cache-name }} 87 | restore-keys: | 88 | ${{ runner.os }}- 89 | - name: burn in 90 | run: | 91 | set -eo pipefail 92 | pushd benchmarks/stress2 93 | cargo run --release -- --duration=60 94 | rm -rf default.sled 95 | sanitizers: 96 | name: Sanitizers 97 | runs-on: ubuntu-latest 98 | steps: 99 | - uses: actions/checkout@v1 100 | - name: Cache rustup 101 | uses: actions/cache@v1 102 | env: 103 | cache-name: cache-target 104 | with: 105 | path: ~/.rustup 106 | key: ${{ runner.os }}-${{ env.cache-name }} 107 | restore-keys: | 108 | ${{ runner.os }}- 109 | - name: sanitizers 110 | run: | 111 | set -eo pipefail 112 | scripts/sanitizers.sh 113 | -------------------------------------------------------------------------------- /src/doc/merge_operators/mod.rs: -------------------------------------------------------------------------------- 1 | //! Merge operators are an extremely powerful tool for use in embedded kv 2 | //! stores. They allow users to specify custom logic for combining multiple 3 | //! versions of a value into one. 4 | //! 5 | //! As a motivating example, imagine that you have a counter. In a traditional 6 | //! kv store, you would need to read the old value, modify it, then write it 7 | //! back (RMW). If you want to increment the counter from multiple threads, you 8 | //! would need to either use higher-level locking or you need to spin in a CAS 9 | //! loop until your increment is successful. Merge operators remove the need for 10 | //! all of this by allowing multiple threads to "merge" in the desired 11 | //! operation, rather than performing a read, then modification, then later 12 | //! writing. `+1 -> +1 -> +1` instead of `w(r(key) + 1) -> w(r(key)+ 1) -> 13 | //! w(r(key) + 1)`. 14 | //! 15 | //! Here's an example of using a merge operator to just concatenate merged bytes 16 | //! together. Note that calling `set` acts as a value replacement, bypassing the 17 | //! merging logic and replacing previously merged values. Calling `merge` is 18 | //! like `set` but when the key is fetched, it will use the merge operator to 19 | //! combine all `merge`'s since the last `set`. 20 | //! 21 | //! ```rust 22 | //! fn concatenate_merge( 23 | //! _key: &[u8], // the key being merged 24 | //! old_value: Option<&[u8]>, // the previous value, if one existed 25 | //! merged_bytes: &[u8] // the new bytes being merged in 26 | //! ) -> Option> { // set the new value, return None to delete 27 | //! let mut ret = old_value 28 | //! .map(|ov| ov.to_vec()) 29 | //! .unwrap_or_else(|| vec![]); 30 | //! 31 | //! ret.extend_from_slice(merged_bytes); 32 | //! 33 | //! Some(ret) 34 | //! } 35 | //! 36 | //! let config = ConfigBuilder::new() 37 | //! .temporary(true) 38 | //! .build(); 39 | //! 40 | //! let tree = Tree::start(config).unwrap(); 41 | //! tree.set_merge_operator(concatenate_merge); 42 | //! 43 | //! tree.set(k, vec![0]); 44 | //! tree.merge(k, vec![1]); 45 | //! tree.merge(k, vec![2]); 46 | //! assert_eq!(tree.get(&k), Ok(Some(vec![0, 1, 2]))); 47 | //! 48 | //! // sets replace previously merged data, 49 | //! 
// bypassing the merge function. 50 | //! tree.set(k, vec![3]); 51 | //! assert_eq!(tree.get(&k), Ok(Some(vec![3]))); 52 | //! 53 | //! // merges on non-present values will add them 54 | //! tree.del(&k); 55 | //! tree.merge(k, vec![4]); 56 | //! assert_eq!(tree.get(&k), Ok(Some(vec![4]))); 57 | //! ``` 58 | //! 59 | //! ### beyond the basics 60 | //! 61 | //! Merge operators can be used to express arbitrarily complex logic. You can 62 | //! use them to implement any sort of high-level data structure on top of sled, 63 | //! using merges of different values to represent your desired operations. 64 | //! Similar to the above example, you could implement a list that lets you push 65 | //! items. Bloom filters are particularly easy to implement, and merge operators 66 | //! also are quite handy for building persistent CRDTs. 67 | //! 68 | //! ### warnings 69 | //! 70 | //! If you call `merge` without setting a merge operator, an error will be 71 | //! returned. Merge operators may be changed over time, but make sure you do 72 | //! this carefully to avoid race conditions. If you need to push a one-time 73 | //! operation to a value, use `update_and_fetch` or `fetch_and_update` instead. 74 | -------------------------------------------------------------------------------- /src/context.rs: -------------------------------------------------------------------------------- 1 | use super::*; 2 | 3 | #[derive(Debug, Clone)] 4 | #[doc(hidden)] 5 | pub struct Context { 6 | // TODO file from config should be in here 7 | config: RunningConfig, 8 | /// Periodically flushes dirty data. We keep this in an 9 | /// Arc separate from the PageCache below to separate 10 | /// "high-level" references from Db, Tree etc... from 11 | /// "low-level" references like background threads. 12 | /// When the last high-level reference is dropped, it 13 | /// should trigger all background threads to clean 14 | /// up synchronously. 15 | #[cfg(all( 16 | not(miri), 17 | any( 18 | windows, 19 | target_os = "linux", 20 | target_os = "macos", 21 | target_os = "dragonfly", 22 | target_os = "freebsd", 23 | target_os = "openbsd", 24 | target_os = "netbsd", 25 | ) 26 | ))] 27 | pub(crate) flusher: Arc>>, 28 | #[doc(hidden)] 29 | pub pagecache: PageCache, 30 | } 31 | 32 | impl std::ops::Deref for Context { 33 | type Target = RunningConfig; 34 | 35 | fn deref(&self) -> &RunningConfig { 36 | &self.config 37 | } 38 | } 39 | 40 | impl Context { 41 | pub(crate) fn start(config: RunningConfig) -> Result { 42 | trace!("starting context"); 43 | 44 | let pagecache = PageCache::start(config.clone())?; 45 | 46 | Ok(Self { 47 | config, 48 | pagecache, 49 | #[cfg(all( 50 | not(miri), 51 | any( 52 | windows, 53 | target_os = "linux", 54 | target_os = "macos", 55 | target_os = "dragonfly", 56 | target_os = "freebsd", 57 | target_os = "openbsd", 58 | target_os = "netbsd", 59 | ) 60 | ))] 61 | flusher: Arc::new(parking_lot::Mutex::new(None)), 62 | }) 63 | } 64 | 65 | /// Returns `true` if the database was 66 | /// recovered from a previous process. 67 | /// Note that database state is only 68 | /// guaranteed to be present up to the 69 | /// last call to `flush`! Otherwise state 70 | /// is synced to disk periodically if the 71 | /// `sync_every_ms` configuration option 72 | /// is set to `Some(number_of_ms_between_syncs)` 73 | /// or if the IO buffer gets filled to 74 | /// capacity before being rotated. 75 | pub fn was_recovered(&self) -> bool { 76 | self.pagecache.was_recovered() 77 | } 78 | 79 | /// Generate a monotonic ID. 
Not guaranteed to be 80 | /// contiguous. Written to disk every `idgen_persist_interval` 81 | /// operations, followed by a blocking flush. During recovery, we 82 | /// take the last recovered generated ID and add 2x 83 | /// the `idgen_persist_interval` to it. While persisting, if the 84 | /// previous persisted counter wasn't synced to disk yet, we will do 85 | /// a blocking flush to fsync the latest counter, ensuring 86 | /// that we will never give out the same counter twice. 87 | pub fn generate_id(&self) -> Result { 88 | let _cc = concurrency_control::read(); 89 | self.pagecache.generate_id_inner() 90 | } 91 | 92 | pub(crate) fn pin_log(&self, guard: &Guard) -> Result> { 93 | self.pagecache.pin_log(guard) 94 | } 95 | } 96 | -------------------------------------------------------------------------------- /bindings/neon-sled/native/src/lib.rs: -------------------------------------------------------------------------------- 1 | #[macro_use] 2 | extern crate neon; 3 | extern crate sled; 4 | 5 | use neon::js::JsNull; 6 | use neon::js::JsString; 7 | use neon::js::Value; 8 | use neon::vm::{Call, JsResult}; 9 | 10 | fn extract_arg(call: &mut Call, idx: i32) -> Result { 11 | let args = &call.arguments; 12 | let handle = args.get(call.guard, idx).ok_or(())?; 13 | Ok((*handle).to_string(call.scope).map_err(|_| ())?.value()) 14 | } 15 | 16 | fn create_db(mut call: Call) -> JsResult { 17 | let path = extract_arg(&mut call, 0).unwrap(); 18 | let t = sled::Config::default().path(path).tree(); 19 | 20 | let ptr = Box::into_raw(Box::new(t)); 21 | let ptr_string = format!("{}", ptr as usize); 22 | Ok(JsString::new(call.guard, &*ptr_string).unwrap()) 23 | } 24 | 25 | fn cast_string_to_ptr<'a>(ptr_str: String) -> &'a sled::Tree { 26 | let ptr_from_str = ptr_str.parse::().unwrap(); 27 | //println!("ptr_from_str: {}", ptr_from_str); 28 | 29 | let ptr = ptr_from_str as *mut sled::Tree; 30 | unsafe { &*ptr } 31 | } 32 | 33 | fn set(mut call: Call) -> JsResult { 34 | let arg0 = extract_arg(&mut call, 0).unwrap(); 35 | let arg1 = extract_arg(&mut call, 1); 36 | let arg2 = extract_arg(&mut call, 2); 37 | 38 | //println!("SET args {:?} {:?}", arg0, arg1); 39 | 40 | let t = cast_string_to_ptr(arg0); 41 | 42 | let k = arg1.unwrap().into_bytes(); 43 | let v = arg2.unwrap().into_bytes(); 44 | 45 | t.set(k.clone(), v); 46 | 47 | let from_db = t 48 | .get(&*k) 49 | .and_then(|from_db| { 50 | let str = unsafe { std::str::from_utf8_unchecked(&*from_db) }; 51 | JsString::new(call.guard, str) 52 | }) 53 | .unwrap_or_else(|| JsString::new(call.guard, "").unwrap()); 54 | 55 | Ok(from_db) 56 | } 57 | 58 | fn get(mut call: Call) -> JsResult { 59 | let arg0 = extract_arg(&mut call, 0).unwrap(); 60 | let arg1 = extract_arg(&mut call, 1); 61 | 62 | //println!("GET args {:?}", arg0); 63 | 64 | let t = cast_string_to_ptr(arg0); 65 | let k = arg1.unwrap().into_bytes(); 66 | 67 | let from_db = t 68 | .get(&*k) 69 | .map(|from_db| { 70 | let str = unsafe { std::str::from_utf8_unchecked(&*from_db) }; 71 | JsString::new(call.guard, str).unwrap() 72 | }) 73 | .unwrap_or_else(|| JsString::new(call.guard, "").unwrap()); 74 | 75 | Ok(from_db) 76 | } 77 | 78 | fn del(mut call: Call) -> JsResult { 79 | let arg0 = extract_arg(&mut call, 0).unwrap(); 80 | let arg1 = extract_arg(&mut call, 1); 81 | 82 | let t = cast_string_to_ptr(arg0); 83 | let k = arg1.unwrap().into_bytes(); 84 | 85 | t.del(&*k); 86 | 87 | Ok(JsNull::new()) 88 | } 89 | 90 | fn sync_and_close(mut call: Call) -> JsResult { 91 | let arg0 = extract_arg(&mut call, 0).unwrap(); 
92 | let ptr_from_str = arg0.parse::().unwrap(); 93 | let ptr = ptr_from_str as *mut sled::Tree; 94 | 95 | unsafe { 96 | let t = Box::from_raw(ptr); 97 | drop(t); 98 | } 99 | Ok(JsNull::new()) 100 | } 101 | 102 | register_module!(m, { 103 | m.export("get", get)?; 104 | m.export("set", set)?; 105 | m.export("del", del)?; 106 | m.export("createDb", create_db)?; 107 | m.export("syncAndClose", sync_and_close) 108 | }); 109 | -------------------------------------------------------------------------------- /bindings/python/rsdb.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | from ctypes import * 4 | import os 5 | 6 | sled = CDLL("./libsled_native.so") 7 | 8 | sled.sled_create_config.argtypes = () 9 | sled.sled_create_config.restype = c_void_p 10 | 11 | sled.sled_config_set_path.argtypes = (c_void_p, c_char_p) 12 | 13 | sled.sled_free_config.argtypes = (c_void_p,) 14 | 15 | sled.sled_open_tree.argtypes = (c_void_p,) 16 | sled.sled_open_tree.restype = c_void_p 17 | 18 | sled.sled_free_tree.argtypes = (c_void_p,) 19 | 20 | sled.sled_get.argtypes = (c_void_p, c_char_p, c_size_t, POINTER(c_size_t)) 21 | sled.sled_get.restype = c_char_p 22 | 23 | sled.sled_scan.argtypes = (c_void_p, c_char_p, c_size_t, POINTER(c_size_t)) 24 | sled.sled_scan.restype = c_void_p 25 | 26 | sled.sled_set.argtypes = (c_void_p, c_char_p, c_size_t, c_char_p, c_size_t) 27 | sled.sled_set.restype = None 28 | 29 | sled.sled_del.argtypes = (c_void_p, c_char_p, c_size_t) 30 | sled.sled_del.restype = None 31 | 32 | sled.sled_cas.argtypes = (c_void_p, 33 | c_char_p, c_size_t, # key 34 | c_char_p, c_size_t, # old 35 | c_char_p, c_size_t, # new 36 | POINTER(c_char_p), POINTER(c_size_t), # actual ret 37 | ) 38 | sled.sled_cas.restype = c_ubyte 39 | 40 | 41 | class Conf: 42 | def __init__(self): 43 | self.ptr = c_void_p(sled.sled_create_config()) 44 | 45 | def tree(self): 46 | tree_ptr = sled.sled_open_tree(self.ptr) 47 | return Tree(c_void_p(tree_ptr)) 48 | 49 | def path(self, path): 50 | sled.sled_config_set_path(self.ptr, path) 51 | 52 | def __del__(self): 53 | sled.sled_free_config(self.ptr) 54 | 55 | 56 | class TreeIterator: 57 | def __init__(self, ptr): 58 | self.ptr = ptr 59 | 60 | def __del__(self): 61 | sled.sled_free_iter(self.ptr) 62 | 63 | 64 | class Tree: 65 | def __init__(self, ptr): 66 | self.ptr = ptr 67 | 68 | def __del__(self): 69 | if self.ptr: 70 | sled.sled_free_tree(self.ptr) 71 | 72 | def close(self): 73 | self.__del__() 74 | self.ptr = None 75 | 76 | def set(self, key, val): 77 | sled.sled_set(self.ptr, key, len(key), val, len(val)) 78 | 79 | def get(self, key): 80 | vallen = c_size_t(0) 81 | ptr = sled.sled_get(self.ptr, key, len(key), byref(vallen)) 82 | return ptr[:vallen.value] 83 | 84 | def delete(self, key): 85 | sled.sled_del(self.ptr, key, len(key)) 86 | 87 | def cas(self, key, old, new): 88 | actual_vallen = c_size_t(0) 89 | actual_val = c_char_p(0) 90 | 91 | if old is None: 92 | old = b"" 93 | 94 | if new is None: 95 | new = b"" 96 | 97 | success = sled.sled_compare_and_swap( 98 | self.ptr, key, 99 | len(key), 100 | old, len(old), 101 | new, len(new), 102 | byref(actual_val), byref(actual_vallen)) 103 | 104 | if actual_vallen.value == 0: 105 | return (None, success == 1) 106 | else: 107 | return (actual_val.value[:actual_vallen.value], success == 1) 108 | 109 | def scan(self, key): 110 | return sled.sled_scan(self.ptr, key, len(key)) 111 | -------------------------------------------------------------------------------- 
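
A minimal usage sketch for the ctypes bindings in rsdb.py above. This is illustrative only: it assumes rsdb.py is importable from the current directory, that `libsled_native.so` has already been built from bindings/sled-native and sits next to it (as the `CDLL("./libsled_native.so")` call requires), and the path `example.sled` is a hypothetical database directory. Keys, values, and paths are all passed as bytes, matching the `c_char_p` argument types declared above.

```python
from rsdb import Conf

conf = Conf()
conf.path(b"example.sled")  # hypothetical on-disk path for the database
tree = conf.tree()          # opens the native Tree handle

tree.set(b"k1", b"v1")      # insert a key/value pair
print(tree.get(b"k1"))      # read it back: b'v1'
tree.delete(b"k1")          # remove the key
tree.close()                # frees the native handle via sled_free_tree
```

--------------------------------------------------------------------------------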
/code-of-conduct.md: -------------------------------------------------------------------------------- 1 | # Contributor Covenant Code of Conduct 2 | 3 | ## Our Pledge 4 | 5 | In the interest of fostering an open and welcoming environment, we as 6 | contributors and maintainers pledge to making participation in our project and 7 | our community a harassment-free experience for everyone, regardless of age, body 8 | size, disability, ethnicity, sex characteristics, gender identity and expression, 9 | level of experience, education, socio-economic status, nationality, personal 10 | appearance, race, religion, or sexual identity and orientation. 11 | 12 | ## Our Standards 13 | 14 | Examples of behavior that contributes to creating a positive environment 15 | include: 16 | 17 | * Using welcoming and inclusive language 18 | * Being respectful of differing viewpoints and experiences 19 | * Gracefully accepting constructive criticism 20 | * Focusing on what is best for the community 21 | * Showing empathy towards other community members 22 | 23 | Examples of unacceptable behavior by participants include: 24 | 25 | * The use of sexualized language or imagery and unwelcome sexual attention or 26 | advances 27 | * Trolling, insulting/derogatory comments, and personal or political attacks 28 | * Public or private harassment 29 | * Publishing others' private information, such as a physical or electronic 30 | address, without explicit permission 31 | * Other conduct which could reasonably be considered inappropriate in a 32 | professional setting 33 | 34 | ## Our Responsibilities 35 | 36 | Project maintainers are responsible for clarifying the standards of acceptable 37 | behavior and are expected to take appropriate and fair corrective action in 38 | response to any instances of unacceptable behavior. 39 | 40 | Project maintainers have the right and responsibility to remove, edit, or 41 | reject comments, commits, code, wiki edits, issues, and other contributions 42 | that are not aligned to this Code of Conduct, or to ban temporarily or 43 | permanently any contributor for other behaviors that they deem inappropriate, 44 | threatening, offensive, or harmful. 45 | 46 | ## Scope 47 | 48 | This Code of Conduct applies both within project spaces and in public spaces 49 | when an individual is representing the project or its community. Examples of 50 | representing a project or community include using an official project e-mail 51 | address, posting via an official social media account, or acting as an appointed 52 | representative at an online or offline event. Representation of a project may be 53 | further defined and clarified by project maintainers. 54 | 55 | ## Enforcement 56 | 57 | Instances of abusive, harassing, or otherwise unacceptable behavior may be 58 | reported by contacting the project team at tylerneely@gmail.com. All 59 | complaints will be reviewed and investigated and will result in a response that 60 | is deemed necessary and appropriate to the circumstances. The project team is 61 | obligated to maintain confidentiality with regard to the reporter of an incident. 62 | Further details of specific enforcement policies may be posted separately. 63 | 64 | Project maintainers who do not follow or enforce the Code of Conduct in good 65 | faith may face temporary or permanent repercussions as determined by other 66 | members of the project's leadership. 
67 | 
68 | ## Attribution
69 | 
70 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4,
71 | available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html
72 | 
73 | [homepage]: https://www.contributor-covenant.org
74 | 
75 | 
--------------------------------------------------------------------------------
/SAFETY.md:
--------------------------------------------------------------------------------
 1 | # sled safety model
 2 | 
 3 | This document applies
 4 | [STPA](http://psas.scripts.mit.edu/home/get_file.php?name=STPA_handbook.pdf)-style
 5 | hazard analysis to the sled embedded database for the purpose of guiding
 6 | design and testing efforts to prevent unacceptable losses.
 7 | 
 8 | Outline
 9 | 
10 | * [purpose of analysis](#purpose-of-analysis)
11 | * [losses](#losses)
12 | * [system boundary](#system-boundary)
13 | * [hazards](#hazards)
14 | * [leading indicators](#leading-indicators)
15 | * [constraints](#constraints)
16 | * [model of control structure](#model-of-control-structure)
17 | * [identify unsafe control actions](#identify-unsafe-control-actions)
18 | * [identify loss scenarios](#identify-loss-scenarios)
19 | * [resources for learning more about STAMP, STPA, and CAST](#resources)
20 | 
21 | # Purpose of Analysis
22 | 
23 | ## Losses
24 | 
25 | We wish to prevent the following undesirable situations:
26 | 
27 | * data loss
28 | * inconsistent (non-linearizable) data access
29 | * process crash
30 | * resource exhaustion
31 | 
32 | ## System Boundary
33 | 
34 | We draw the line between system and environment where we can reasonably
35 | invest our efforts to prevent losses.
36 | 
37 | Inside the boundary:
38 | 
39 | * codebase
40 |   * put safe control actions into place that prevent losses
41 | * documentation
42 |   * show users how to use sled safely
43 |   * recommend hardware, kernels, user code
44 | 
45 | Outside the boundary:
46 | 
47 | * Direct changes to hardware, kernels, user code
48 | 
49 | ## Hazards
50 | 
51 | These hazards can result in the above losses:
52 | 
53 | * data may be lost if
54 |   * bugs in the logging system
55 |     * `Db::flush` fails to make previous writes durable
56 |   * bugs in the GC system
57 |     * the old location is overwritten before the defragmented location becomes durable
58 |   * bugs in the recovery system
59 |   * hardware failures
60 | * consistency violations may be caused by
61 |   * transaction concurrency control failure to enforce linearizability (strict serializability)
62 |   * non-linearizable lock-free single-key operations
63 | * panic
64 |   * of user threads
65 |   * IO threads
66 |   * flusher & GC thread
67 |   * indexing
68 |   * unwraps/expects
69 |   * failed TryInto/TryFrom + unwrap
70 | * persistent storage exceeding (2 + N concurrent writers) * logical data size
71 | * in-memory cache exceeding the configured cache size
72 |   * caused by incorrect calculation of cache
73 | * use-after-free
74 | * data race
75 | * memory leak
76 | * integer overflow
77 | * buffer overrun
78 | * uninitialized memory access
79 | 
80 | ## Constraints
81 | 
82 | # Models of Control Structures
83 | 
84 | for each control action we have, consider:
85 | 
86 | 1. what hazards happen when we fail to apply it / it does not exist?
87 | 2. what hazards happen when we do apply it
88 | 3. what hazards happen when we apply it too early or too late?
89 | 4. what hazards happen if we apply it for too long or not long enough?
90 | 91 | durability model 92 | 93 | * recovery 94 | * LogIter::max_lsn 95 | * return None if last_lsn_in_batch >= self.max_lsn 96 | * batch requirement set to last reservation base + inline len - 1 97 | * reserve bumps 98 | * bump_atomic_lsn(&self.iobufs.max_reserved_lsn, reservation_lsn + inline_buf_len as Lsn - 1); 99 | 100 | lock-free linearizability model 101 | 102 | transactional linearizability (strict serializability) model 103 | 104 | panic model 105 | 106 | memory usage model 107 | 108 | storage usage model 109 | 110 | -------------------------------------------------------------------------------- /src/debug_delay.rs: -------------------------------------------------------------------------------- 1 | #![allow(clippy::float_arithmetic)] 2 | 3 | use std::sync::atomic::{AtomicUsize, Ordering::Relaxed}; 4 | 5 | use crate::Lazy; 6 | 7 | /// This function is useful for inducing random jitter into our atomic 8 | /// operations, shaking out more possible interleavings quickly. It gets 9 | /// fully eliminated by the compiler in non-test code. 10 | pub fn debug_delay() { 11 | use std::thread; 12 | use std::time::Duration; 13 | 14 | static GLOBAL_DELAYS: AtomicUsize = AtomicUsize::new(0); 15 | 16 | static INTENSITY: Lazy u32> = Lazy::new(|| { 17 | std::env::var("SLED_LOCK_FREE_DELAY_INTENSITY") 18 | .unwrap_or_else(|_| "100".into()) 19 | .parse() 20 | .expect( 21 | "SLED_LOCK_FREE_DELAY_INTENSITY must be set to a \ 22 | non-negative integer (ideally below 1,000,000)", 23 | ) 24 | }); 25 | 26 | static CRASH_CHANCE: Lazy u32> = Lazy::new(|| { 27 | std::env::var("SLED_CRASH_CHANCE") 28 | .unwrap_or_else(|_| "0".into()) 29 | .parse() 30 | .expect( 31 | "SLED_CRASH_CHANCE must be set to a \ 32 | non-negative integer (ideally below 50,000)", 33 | ) 34 | }); 35 | 36 | thread_local!( 37 | static LOCAL_DELAYS: std::cell::RefCell = std::cell::RefCell::new(0) 38 | ); 39 | 40 | if cfg!(feature = "miri_optimizations") { 41 | // Each interaction with LOCAL_DELAYS adds more stacked borrows 42 | // tracking information, and Miri is single-threaded anyway. 43 | return; 44 | } 45 | 46 | let global_delays = GLOBAL_DELAYS.fetch_add(1, Relaxed); 47 | let local_delays = LOCAL_DELAYS.with(|ld| { 48 | let mut ld = ld.borrow_mut(); 49 | let old = *ld; 50 | *ld = std::cmp::max(global_delays + 1, *ld + 1); 51 | old 52 | }); 53 | 54 | if *CRASH_CHANCE > 0 && random(*CRASH_CHANCE) == 0 { 55 | std::process::exit(9) 56 | } 57 | 58 | if global_delays == local_delays { 59 | // no other threads seem to be 60 | // calling this, so we may as 61 | // well skip it 62 | return; 63 | } 64 | 65 | if random(1000) == 1 { 66 | let duration = random(*INTENSITY); 67 | 68 | #[allow(clippy::cast_possible_truncation)] 69 | #[allow(clippy::cast_sign_loss)] 70 | thread::sleep(Duration::from_micros(u64::from(duration))); 71 | } 72 | 73 | if random(2) == 0 { 74 | thread::yield_now(); 75 | } 76 | } 77 | 78 | /// Generates a random number in `0..n`. 79 | fn random(n: u32) -> u32 { 80 | use std::cell::Cell; 81 | use std::num::Wrapping; 82 | 83 | thread_local! { 84 | static RNG: Cell> = Cell::new(Wrapping(1_406_868_647)); 85 | } 86 | 87 | #[allow(clippy::cast_possible_truncation)] 88 | RNG.try_with(|rng| { 89 | // This is the 32-bit variant of Xorshift. 90 | // 91 | // Source: https://en.wikipedia.org/wiki/Xorshift 92 | let mut x = rng.get(); 93 | x ^= x << 13; 94 | x ^= x >> 17; 95 | x ^= x << 5; 96 | rng.set(x); 97 | 98 | // This is a fast alternative to `x % n`. 
99 | // 100 | // Author: Daniel Lemire 101 | // Source: https://lemire.me/blog/2016/06/27/a-fast-alternative-to-the-modulo-reduction/ 102 | (u64::from(x.0).wrapping_mul(u64::from(n)) >> 32) as u32 103 | }) 104 | .unwrap_or(0) 105 | } 106 | -------------------------------------------------------------------------------- /src/meta.rs: -------------------------------------------------------------------------------- 1 | use crate::*; 2 | 3 | /// A simple map that can be used to store metadata 4 | /// for the pagecache tenant. 5 | #[derive(Clone, Debug, Eq, PartialEq, Default)] 6 | pub struct Meta { 7 | pub(crate) inner: BTreeMap, 8 | } 9 | 10 | impl Meta { 11 | /// Retrieve the `PageId` associated with an identifier 12 | pub(crate) fn get_root(&self, table: &[u8]) -> Option { 13 | self.inner.get(table).cloned() 14 | } 15 | 16 | /// Set the `PageId` associated with an identifier 17 | pub(crate) fn set_root( 18 | &mut self, 19 | name: IVec, 20 | pid: PageId, 21 | ) -> Option { 22 | self.inner.insert(name, pid) 23 | } 24 | 25 | /// Remove the page mapping for a given identifier 26 | pub(crate) fn del_root(&mut self, name: &[u8]) -> Option { 27 | self.inner.remove(name) 28 | } 29 | 30 | /// Return the current rooted tenants in Meta 31 | pub(crate) fn tenants(&self) -> BTreeMap { 32 | self.inner.clone() 33 | } 34 | 35 | pub(crate) fn rss(&self) -> u64 { 36 | self.inner 37 | .iter() 38 | .map(|(k, _pid)| { 39 | k.len() as u64 + std::mem::size_of::() as u64 40 | }) 41 | .sum() 42 | } 43 | } 44 | 45 | /// Open or create a new disk-backed Tree with its own keyspace, 46 | /// accessible from the `Db` via the provided identifier. 47 | pub(crate) fn open_tree( 48 | context: &Context, 49 | raw_name: V, 50 | guard: &Guard, 51 | ) -> Result 52 | where 53 | V: Into, 54 | { 55 | let name = raw_name.into(); 56 | 57 | // we loop because creating this Tree may race with 58 | // concurrent attempts to open the same one. 59 | loop { 60 | match context.pagecache.meta_pid_for_name(&name, guard) { 61 | Ok(root_id) => { 62 | assert_ne!(root_id, 0); 63 | return Ok(Tree(Arc::new(TreeInner { 64 | tree_id: name, 65 | context: context.clone(), 66 | subscribers: Subscribers::default(), 67 | root: AtomicU64::new(root_id), 68 | merge_operator: RwLock::new(None), 69 | }))); 70 | } 71 | Err(Error::CollectionNotFound(_)) => {} 72 | Err(other) => return Err(other), 73 | } 74 | 75 | // set up empty leaf 76 | let mut leaf = Node::new_empty_leaf(); 77 | leaf.is_index = false; 78 | let (leaf_id, leaf_ptr) = context.pagecache.allocate(leaf, guard)?; 79 | 80 | trace!( 81 | "allocated pid {} for leaf in new_tree for namespace {:?}", 82 | leaf_id, 83 | name 84 | ); 85 | 86 | // set up root index 87 | 88 | // vec![0] represents a prefix-encoded empty prefix 89 | let root = Node::new_root(leaf_id); 90 | let (root_id, root_ptr) = context.pagecache.allocate(root, guard)?; 91 | 92 | debug!("allocated pid {} for root of new_tree {:?}", root_id, name); 93 | 94 | let res = context.pagecache.cas_root_in_meta( 95 | &name, 96 | None, 97 | Some(root_id), 98 | guard, 99 | )?; 100 | 101 | if res.is_err() { 102 | // clean up the tree we just created if we couldn't 103 | // install it. 104 | let _ = context 105 | .pagecache 106 | .free(root_id, root_ptr, guard)? 107 | .expect("could not free allocated page"); 108 | let _ = context 109 | .pagecache 110 | .free(leaf_id, leaf_ptr, guard)? 
111 | .expect("could not free allocated page"); 112 | continue; 113 | } 114 | 115 | return Ok(Tree(Arc::new(TreeInner { 116 | tree_id: name, 117 | subscribers: Subscribers::default(), 118 | context: context.clone(), 119 | root: AtomicU64::new(root_id), 120 | merge_operator: RwLock::new(None), 121 | }))); 122 | } 123 | } 124 | -------------------------------------------------------------------------------- /experiments/epoch/Cargo.lock: -------------------------------------------------------------------------------- 1 | # This file is automatically @generated by Cargo. 2 | # It is not intended for manual editing. 3 | [[package]] 4 | name = "autocfg" 5 | version = "0.1.7" 6 | source = "registry+https://github.com/rust-lang/crates.io-index" 7 | 8 | [[package]] 9 | name = "cfg-if" 10 | version = "0.1.10" 11 | source = "registry+https://github.com/rust-lang/crates.io-index" 12 | 13 | [[package]] 14 | name = "crossbeam-epoch" 15 | version = "0.8.0" 16 | source = "registry+https://github.com/rust-lang/crates.io-index" 17 | dependencies = [ 18 | "autocfg 0.1.7 (registry+https://github.com/rust-lang/crates.io-index)", 19 | "cfg-if 0.1.10 (registry+https://github.com/rust-lang/crates.io-index)", 20 | "crossbeam-utils 0.7.0 (registry+https://github.com/rust-lang/crates.io-index)", 21 | "lazy_static 1.4.0 (registry+https://github.com/rust-lang/crates.io-index)", 22 | "memoffset 0.5.3 (registry+https://github.com/rust-lang/crates.io-index)", 23 | "scopeguard 1.0.0 (registry+https://github.com/rust-lang/crates.io-index)", 24 | ] 25 | 26 | [[package]] 27 | name = "crossbeam-utils" 28 | version = "0.7.0" 29 | source = "registry+https://github.com/rust-lang/crates.io-index" 30 | dependencies = [ 31 | "autocfg 0.1.7 (registry+https://github.com/rust-lang/crates.io-index)", 32 | "cfg-if 0.1.10 (registry+https://github.com/rust-lang/crates.io-index)", 33 | "lazy_static 1.4.0 (registry+https://github.com/rust-lang/crates.io-index)", 34 | ] 35 | 36 | [[package]] 37 | name = "epoch" 38 | version = "0.1.0" 39 | dependencies = [ 40 | "crossbeam-epoch 0.8.0 (registry+https://github.com/rust-lang/crates.io-index)", 41 | ] 42 | 43 | [[package]] 44 | name = "lazy_static" 45 | version = "1.4.0" 46 | source = "registry+https://github.com/rust-lang/crates.io-index" 47 | 48 | [[package]] 49 | name = "memoffset" 50 | version = "0.5.3" 51 | source = "registry+https://github.com/rust-lang/crates.io-index" 52 | dependencies = [ 53 | "rustc_version 0.2.3 (registry+https://github.com/rust-lang/crates.io-index)", 54 | ] 55 | 56 | [[package]] 57 | name = "rustc_version" 58 | version = "0.2.3" 59 | source = "registry+https://github.com/rust-lang/crates.io-index" 60 | dependencies = [ 61 | "semver 0.9.0 (registry+https://github.com/rust-lang/crates.io-index)", 62 | ] 63 | 64 | [[package]] 65 | name = "scopeguard" 66 | version = "1.0.0" 67 | source = "registry+https://github.com/rust-lang/crates.io-index" 68 | 69 | [[package]] 70 | name = "semver" 71 | version = "0.9.0" 72 | source = "registry+https://github.com/rust-lang/crates.io-index" 73 | dependencies = [ 74 | "semver-parser 0.7.0 (registry+https://github.com/rust-lang/crates.io-index)", 75 | ] 76 | 77 | [[package]] 78 | name = "semver-parser" 79 | version = "0.7.0" 80 | source = "registry+https://github.com/rust-lang/crates.io-index" 81 | 82 | [metadata] 83 | "checksum autocfg 0.1.7 (registry+https://github.com/rust-lang/crates.io-index)" = "1d49d90015b3c36167a20fe2810c5cd875ad504b39cff3d4eae7977e6b7c1cb2" 84 | "checksum cfg-if 0.1.10 
(registry+https://github.com/rust-lang/crates.io-index)" = "4785bdd1c96b2a846b2bd7cc02e86b6b3dbf14e7e53446c4f54c92a361040822" 85 | "checksum crossbeam-epoch 0.8.0 (registry+https://github.com/rust-lang/crates.io-index)" = "5064ebdbf05ce3cb95e45c8b086f72263f4166b29b97f6baff7ef7fe047b55ac" 86 | "checksum crossbeam-utils 0.7.0 (registry+https://github.com/rust-lang/crates.io-index)" = "ce446db02cdc3165b94ae73111e570793400d0794e46125cc4056c81cbb039f4" 87 | "checksum lazy_static 1.4.0 (registry+https://github.com/rust-lang/crates.io-index)" = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646" 88 | "checksum memoffset 0.5.3 (registry+https://github.com/rust-lang/crates.io-index)" = "75189eb85871ea5c2e2c15abbdd541185f63b408415e5051f5cac122d8c774b9" 89 | "checksum rustc_version 0.2.3 (registry+https://github.com/rust-lang/crates.io-index)" = "138e3e0acb6c9fb258b19b67cb8abd63c00679d2851805ea151465464fe9030a" 90 | "checksum scopeguard 1.0.0 (registry+https://github.com/rust-lang/crates.io-index)" = "b42e15e59b18a828bbf5c58ea01debb36b9b096346de35d941dcb89009f24a0d" 91 | "checksum semver 0.9.0 (registry+https://github.com/rust-lang/crates.io-index)" = "1d7eb9ef2c18661902cc47e535f9bc51b78acd254da71d375c2f6720d9a40403" 92 | "checksum semver-parser 0.7.0 (registry+https://github.com/rust-lang/crates.io-index)" = "388a1df253eca08550bef6c72392cfe7c30914bf41df5269b68cbd6ff8f570a3" 93 | -------------------------------------------------------------------------------- /src/oneshot.rs: -------------------------------------------------------------------------------- 1 | use std::{ 2 | future::Future, 3 | pin::Pin, 4 | task::{Context, Poll, Waker}, 5 | time::{Duration, Instant}, 6 | }; 7 | 8 | use parking_lot::{Condvar, Mutex}; 9 | 10 | use crate::Arc; 11 | 12 | #[derive(Debug)] 13 | struct OneShotState { 14 | filled: bool, 15 | fused: bool, 16 | item: Option, 17 | waker: Option, 18 | } 19 | 20 | impl Default for OneShotState { 21 | fn default() -> OneShotState { 22 | OneShotState { filled: false, fused: false, item: None, waker: None } 23 | } 24 | } 25 | 26 | /// A Future value which may or may not be filled 27 | #[derive(Debug)] 28 | pub struct OneShot { 29 | mu: Arc>>, 30 | cv: Arc, 31 | } 32 | 33 | /// The completer side of the Future 34 | pub struct OneShotFiller { 35 | mu: Arc>>, 36 | cv: Arc, 37 | } 38 | 39 | impl OneShot { 40 | /// Create a new `OneShotFiller` and the `OneShot` 41 | /// that will be filled by its completion. 42 | pub fn pair() -> (OneShotFiller, Self) { 43 | let mu = Arc::new(Mutex::new(OneShotState::default())); 44 | let cv = Arc::new(Condvar::new()); 45 | let future = Self { mu: mu.clone(), cv: cv.clone() }; 46 | let filler = OneShotFiller { mu, cv }; 47 | 48 | (filler, future) 49 | } 50 | 51 | /// Block on the `OneShot`'s completion 52 | /// or dropping of the `OneShotFiller` 53 | pub fn wait(self) -> Option { 54 | let mut inner = self.mu.lock(); 55 | while !inner.filled { 56 | self.cv.wait(&mut inner); 57 | } 58 | inner.item.take() 59 | } 60 | 61 | /// Block on the `OneShot`'s completion 62 | /// or dropping of the `OneShotFiller`, 63 | /// returning an error if not filled 64 | /// before a given timeout or if the 65 | /// system shuts down before then. 
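    ///
    /// A minimal illustrative sketch (not part of the original source),
    /// assuming the crate-internal `OneShot::pair` constructor shown above:
    /// a filler thread completes the promise that this method waits on.
    ///
    /// ```ignore
    /// let (filler, oneshot) = OneShot::pair();
    /// std::thread::spawn(move || filler.fill(42));
    /// let res = oneshot.wait_timeout(std::time::Duration::from_secs(1));
    /// assert_eq!(res, Ok(42));
    /// ```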
66 | pub fn wait_timeout( 67 | self, 68 | mut timeout: Duration, 69 | ) -> Result { 70 | let mut inner = self.mu.lock(); 71 | while !inner.filled { 72 | let start = Instant::now(); 73 | let res = self.cv.wait_for(&mut inner, timeout); 74 | if res.timed_out() { 75 | return Err(std::sync::mpsc::RecvTimeoutError::Disconnected); 76 | } 77 | timeout = 78 | if let Some(timeout) = timeout.checked_sub(start.elapsed()) { 79 | timeout 80 | } else { 81 | Duration::from_nanos(0) 82 | }; 83 | } 84 | if let Some(item) = inner.item.take() { 85 | Ok(item) 86 | } else { 87 | Err(std::sync::mpsc::RecvTimeoutError::Disconnected) 88 | } 89 | } 90 | } 91 | 92 | impl Future for OneShot { 93 | type Output = Option; 94 | 95 | fn poll(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll { 96 | let mut state = self.mu.lock(); 97 | if state.fused { 98 | return Poll::Pending; 99 | } 100 | if state.filled { 101 | state.fused = true; 102 | Poll::Ready(state.item.take()) 103 | } else { 104 | state.waker = Some(cx.waker().clone()); 105 | Poll::Pending 106 | } 107 | } 108 | } 109 | 110 | impl OneShotFiller { 111 | /// Complete the `OneShot` 112 | pub fn fill(self, inner: T) { 113 | let mut state = self.mu.lock(); 114 | 115 | if let Some(waker) = state.waker.take() { 116 | waker.wake(); 117 | } 118 | 119 | state.filled = true; 120 | state.item = Some(inner); 121 | 122 | // having held the mutex makes this linearized 123 | // with the notify below. 124 | drop(state); 125 | 126 | let _notified = self.cv.notify_all(); 127 | } 128 | } 129 | 130 | impl Drop for OneShotFiller { 131 | fn drop(&mut self) { 132 | let mut state = self.mu.lock(); 133 | 134 | if state.filled { 135 | return; 136 | } 137 | 138 | if let Some(waker) = state.waker.take() { 139 | waker.wake(); 140 | } 141 | 142 | state.filled = true; 143 | 144 | // having held the mutex makes this linearized 145 | // with the notify below. 146 | drop(state); 147 | 148 | let _notified = self.cv.notify_all(); 149 | } 150 | } 151 | -------------------------------------------------------------------------------- /src/sys_limits.rs: -------------------------------------------------------------------------------- 1 | #![allow(unsafe_code)] 2 | 3 | #[cfg(any(target_os = "linux", target_os = "macos"))] 4 | use std::io; 5 | #[cfg(any(target_os = "linux"))] 6 | use {std::fs::File, std::io::Read}; 7 | 8 | /// See the Kernel's documentation for more information about this subsystem, 9 | /// found at: [Documentation/cgroup-v1/memory.txt](https://www.kernel.org/doc/Documentation/cgroup-v1/memory.txt) 10 | /// 11 | /// If there's no memory limit specified on the container this may return 12 | /// 0x7FFFFFFFFFFFF000 (2^63-1 rounded down to 4k which is a common page size). 13 | /// So we know we are not running in a memory restricted environment. 14 | #[cfg(target_os = "linux")] 15 | fn get_cgroup_memory_limit() -> io::Result { 16 | File::open("/sys/fs/cgroup/memory/memory.limit_in_bytes") 17 | .and_then(read_u64_from) 18 | } 19 | 20 | #[cfg(target_os = "linux")] 21 | fn read_u64_from(mut file: File) -> io::Result { 22 | let mut s = String::new(); 23 | file.read_to_string(&mut s).and_then(|_| { 24 | s.trim() 25 | .parse() 26 | .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e)) 27 | }) 28 | } 29 | 30 | /// Returns the maximum size of total available memory of the process, in bytes. 31 | /// If this limit is exceeded, the malloc() and mmap() functions shall fail with 32 | /// errno set to [ENOMEM]. 
33 | #[cfg(any(target_os = "linux", target_os = "macos"))] 34 | fn get_rlimit_as() -> io::Result<libc::rlimit> { 35 | let mut limit = std::mem::MaybeUninit::<libc::rlimit>::uninit(); 36 | 37 | let ret = unsafe { libc::getrlimit(libc::RLIMIT_AS, limit.as_mut_ptr()) }; 38 | 39 | if ret == 0 { 40 | Ok(unsafe { limit.assume_init() }) 41 | } else { 42 | Err(io::Error::last_os_error()) 43 | } 44 | } 45 | 46 | #[cfg(any(target_os = "linux", target_os = "macos"))] 47 | pub fn get_available_memory() -> io::Result<u64> { 48 | use std::convert::TryFrom; 49 | 50 | let pages = unsafe { libc::sysconf(libc::_SC_PHYS_PAGES) }; 51 | if pages == -1 { 52 | return Err(io::Error::last_os_error()); 53 | } 54 | 55 | let page_size = unsafe { libc::sysconf(libc::_SC_PAGE_SIZE) }; 56 | if page_size == -1 { 57 | return Err(io::Error::last_os_error()); 58 | } 59 | 60 | Ok(u64::try_from(pages).unwrap() * u64::try_from(page_size).unwrap()) 61 | } 62 | 63 | pub fn get_memory_limit() -> u64 { 64 | // Maximum addressable memory space limit in u64 65 | static MAX_USIZE: u64 = usize::max_value() as u64; 66 | 67 | let mut max: u64 = 0; 68 | 69 | #[cfg(target_os = "linux")] 70 | { 71 | if let Ok(mem) = get_cgroup_memory_limit() { 72 | max = mem; 73 | } 74 | 75 | // If there's no memory limit specified on the container this 76 | // actually returns 0x7FFFFFFFFFFFF000 (2^63-1 rounded down to 77 | // 4k which is a common page size). So we know we are not 78 | // running in a memory restricted environment. 79 | // src: https://github.com/dotnet/coreclr/blob/master/src/pal/src/misc/cgroup.cpp#L385-L428 80 | if max > 0x7FFF_FFFF_0000_0000 { 81 | return 0; 82 | } 83 | } 84 | 85 | #[cfg(any(target_os = "linux", target_os = "macos"))] 86 | { 87 | if let Ok(rlim) = get_rlimit_as() { 88 | let rlim_cur = Into::<u64>::into(rlim.rlim_cur); 89 | if rlim_cur < max || max == 0 { 90 | max = rlim_cur; 91 | } 92 | } 93 | 94 | if let Ok(available) = get_available_memory() { 95 | if available < max || max == 0 { 96 | max = available; 97 | } 98 | } 99 | } 100 | 101 | if max > MAX_USIZE { 102 | // It is observed in practice that when memory is unrestricted, the 103 | // Linux control group returns a physical limit that is bigger than 104 | // the address space 105 | max = MAX_USIZE; 106 | } 107 | 108 | #[cfg(miri)] 109 | { 110 | // Miri has a significant memory consumption overhead. During a small 111 | // test run, a memory amplification of ~35x was observed. Certain 112 | // memory overheads may increase asymptotically with longer test runs, 113 | // such as the interpreter's dead_alloc_map. Memory overhead is 114 | // dominated by stacked borrows tags; the asymptotic behavior of this 115 | // overhead needs further investigation. 116 | max /= 40; 117 | } 118 | 119 | max 120 | } 121 | -------------------------------------------------------------------------------- /src/doc/motivating_experiences/mod.rs: -------------------------------------------------------------------------------- 1 | //!

2 | //! 3 | //!

4 | //! 5 | //! # Experiences with Other Systems 6 | //! 7 | //! sled is motivated by the experiences gained while working with other 8 | //! stateful systems, outlined below. 9 | //! 10 | //! Most of the points below are learned from being burned, rather than 11 | //! delighted. 12 | //! 13 | //! #### MySQL 14 | //! 15 | //! * make it easy to tail the replication stream in flexible topologies 16 | //! * support merging shards a la MariaDB 17 | //! * support mechanisms for live, lock-free schema updates a la 18 | //! pt-online-schema-change 19 | //! * include GTID in all replication information 20 | //! * actively reduce tree fragmentation 21 | //! * give operators and distributed database creators first-class support for 22 | //! replication, sharding, backup, tuning, and diagnosis 23 | //! * O_DIRECT + real linux AIO is worth the effort 24 | //! 25 | //! #### Redis 26 | //! 27 | //! * provide high-level collections that let engineers get to their business 28 | //! logic as quickly as possible instead of forcing them to define a schema in 29 | //! a relational system (usually spending an hour+ googling how to even do it) 30 | //! * don't let single slow requests block all other requests to a shard 31 | //! * let operators peer into the sequence of operations that hit the database 32 | //! to track down bad usage 33 | //! * don't force replicas to retrieve the entire state of the leader when they 34 | //! begin replication 35 | //! 36 | //! #### HBase 37 | //! 38 | //! * don't split "the source of truth" across too many decoupled systems or you 39 | //! will always have downtime 40 | //! * give users first-class APIs to peer into their system state without 41 | //! forcing them to write scrapers 42 | //! * serve http pages for high-level overviews and possibly log access 43 | //! * coprocessors are awesome but people should have easy ways of doing 44 | //! secondary indexing 45 | //! 46 | //! #### RocksDB 47 | //! 48 | //! * give users tons of flexibility with different usage patterns 49 | //! * don't force users to use distributed machine learning to discover 50 | //! configurations that work for their use cases 51 | //! * merge operators are extremely powerful 52 | //! * merge operators should be usable from serial transactions across multiple 53 | //! keys 54 | //! 55 | //! #### etcd 56 | //! 57 | //! * raft makes operating replicated systems SO MUCH EASIER than popular 58 | //! relational systems / redis etc... 59 | //! * modify raft to use leader leases instead of using the paxos register, 60 | //! avoiding livelocks in the presence of simple partitions 61 | //! * give users flexible interfaces 62 | //! * reactive semantics are awesome, but access must be done through smart 63 | //! clients, because users will assume watches are reliable 64 | //! * if we have smart clients anyway, quorum reads can be cheap by 65 | //! lower-bounding future reads to the raft id last observed 66 | //! * expose the metrics and operational levers required to build a self-driving 67 | //! stateful system on top of k8s/mesos/cloud providers/etc... 68 | //! 69 | //! #### Tendermint 70 | //! 71 | //! * build things in a testable way from the beginning 72 | //! * don't seek gratuitous concurrency 73 | //! * allow replication streams to be used in flexible ways 74 | //! * instant finality (or interface finality, the thing should be done by the 75 | //! time the request successfully returns to the client) is mandatory for nice 76 | //! high-level interfaces that don't push optimism (and rollbacks) into 77 | //! 
interfacing systems 78 | //! 79 | //! #### LMDB 80 | //! 81 | //! * approach a wait-free tree traversal for reads 82 | //! * use modern tree structures that can support concurrent writers 83 | //! * multi-process is nice for browsers etc... 84 | //! * people value read performance and are often forgiving of terrible write 85 | //! performance for most workloads 86 | //! 87 | //! #### Zookeeper 88 | //! * reactive semantics are awesome, but access must be done through smart 89 | //! clients, because users will assume watches are reliable 90 | //! * the more important the system, the more you should keep old snapshots 91 | //! around for emergency recovery 92 | //! * never assume a hostname that was resolvable in the past will be resolvable 93 | //! in the future 94 | //! * if a critical thread dies, bring down the entire system 95 | //! * make replication configuration as simple as possible. people will mess up 96 | //! the order and cause split brains if this is not automated. 97 | -------------------------------------------------------------------------------- /src/pagecache/reservation.rs: -------------------------------------------------------------------------------- 1 | use crate::{pagecache::*, *}; 2 | 3 | /// A pending log reservation which can be aborted or completed. 4 | /// NB the holder should quickly call `complete` or `abort` as 5 | /// taking too long to decide will cause the underlying IO 6 | /// buffer to become blocked. 7 | #[derive(Debug)] 8 | pub struct Reservation<'a> { 9 | pub(super) log: &'a Log, 10 | pub(super) iobuf: Arc, 11 | pub(super) buf: &'a mut [u8], 12 | pub(super) flushed: bool, 13 | pub pointer: DiskPtr, 14 | pub lsn: Lsn, 15 | pub(super) is_heap_item_rewrite: bool, 16 | pub(super) header_len: usize, 17 | } 18 | 19 | impl<'a> Drop for Reservation<'a> { 20 | fn drop(&mut self) { 21 | // We auto-abort if the user never uses a reservation. 22 | if !self.flushed { 23 | if let Err(e) = self.flush(false) { 24 | self.log.config.set_global_error(e); 25 | } 26 | } 27 | } 28 | } 29 | 30 | impl<'a> Reservation<'a> { 31 | /// Cancel the reservation, placing a failed flush on disk, returning 32 | /// the (cancelled) log sequence number and file offset. 33 | pub fn abort(mut self) -> Result<(Lsn, DiskPtr)> { 34 | if self.pointer.is_heap_item() && !self.is_heap_item_rewrite { 35 | // we can instantly free this heap item because its pointer 36 | // is assumed to have failed to have been installed into 37 | // the pagetable, so we can assume nobody is operating 38 | // on it. 39 | 40 | trace!( 41 | "removing heap item for aborted reservation at lsn {}", 42 | self.pointer 43 | ); 44 | 45 | self.log.config.heap.free(self.pointer.heap_id().unwrap()); 46 | } 47 | 48 | self.flush(false) 49 | } 50 | 51 | /// Complete the reservation, placing the buffer on disk. returns 52 | /// the log sequence number of the write, and the file offset. 53 | pub fn complete(mut self) -> Result<(Lsn, DiskPtr)> { 54 | self.flush(true) 55 | } 56 | 57 | /// Returns the length of the on-log reservation. 58 | pub(crate) fn reservation_len(&self) -> usize { 59 | self.buf.len() 60 | } 61 | 62 | /// Refills the reservation buffer with new data. 63 | /// Must supply a buffer of an identical length 64 | /// as the one initially provided. Don't use this 65 | /// on messages subject to compression etc... 66 | /// 67 | /// # Panics 68 | /// 69 | /// Will panic if the reservation is not the correct 70 | /// size to hold a serialized Lsn. 
71 | #[doc(hidden)] 72 | pub fn mark_writebatch(self, peg_lsn: Lsn) -> Result<(Lsn, DiskPtr)> { 73 | trace!( 74 | "writing batch required stable lsn {} into \ 75 | BatchManifest at lid {:?} peg_lsn {}", 76 | peg_lsn, 77 | self.pointer.lid(), 78 | self.lsn 79 | ); 80 | 81 | if self.lsn == peg_lsn { 82 | // this can happen because high-level tree updates 83 | // may result in no work happening. 84 | self.abort() 85 | } else { 86 | self.buf[4] = MessageKind::BatchManifest.into(); 87 | 88 | let buf = lsn_to_arr(peg_lsn); 89 | 90 | let dst = &mut self.buf[self.header_len..]; 91 | 92 | dst.copy_from_slice(&buf); 93 | 94 | let mut intervals = self.log.iobufs.intervals.lock(); 95 | intervals.mark_batch((self.lsn, peg_lsn)); 96 | drop(intervals); 97 | 98 | self.complete() 99 | } 100 | } 101 | 102 | fn flush(&mut self, valid: bool) -> Result<(Lsn, DiskPtr)> { 103 | if self.flushed { 104 | panic!("flushing already-flushed reservation!"); 105 | } 106 | 107 | self.flushed = true; 108 | 109 | if !valid { 110 | // don't actually zero the message, still check its hash 111 | // on recovery to find corruption. 112 | self.buf[4] = MessageKind::Canceled.into(); 113 | } 114 | 115 | let crc32 = calculate_message_crc32( 116 | self.buf[..self.header_len].as_ref(), 117 | &self.buf[self.header_len..], 118 | ); 119 | let crc32_arr = u32_to_arr(crc32); 120 | 121 | #[allow(unsafe_code)] 122 | unsafe { 123 | std::ptr::copy_nonoverlapping( 124 | crc32_arr.as_ptr(), 125 | self.buf.as_mut_ptr(), 126 | std::mem::size_of::(), 127 | ); 128 | } 129 | self.log.exit_reservation(&self.iobuf)?; 130 | 131 | Ok((self.lsn, self.pointer)) 132 | } 133 | } 134 | -------------------------------------------------------------------------------- /benchmarks/criterion/benches/sled.rs: -------------------------------------------------------------------------------- 1 | use criterion::{criterion_group, criterion_main, Criterion}; 2 | 3 | use jemallocator::Jemalloc; 4 | 5 | use sled::Config; 6 | 7 | #[cfg_attr( 8 | // only enable jemalloc on linux and macos by default 9 | any(target_os = "linux", target_os = "macos"), 10 | global_allocator 11 | )] 12 | static ALLOC: Jemalloc = Jemalloc; 13 | 14 | fn counter() -> usize { 15 | use std::sync::atomic::{AtomicUsize, Ordering::Relaxed}; 16 | 17 | static C: AtomicUsize = AtomicUsize::new(0); 18 | 19 | C.fetch_add(1, Relaxed) 20 | } 21 | 22 | /// Generates a random number in `0..n`. 23 | fn random(n: u32) -> u32 { 24 | use std::cell::Cell; 25 | use std::num::Wrapping; 26 | 27 | thread_local! { 28 | static RNG: Cell> = Cell::new(Wrapping(1406868647)); 29 | } 30 | 31 | RNG.with(|rng| { 32 | // This is the 32-bit variant of Xorshift. 33 | // 34 | // Source: https://en.wikipedia.org/wiki/Xorshift 35 | let mut x = rng.get(); 36 | x ^= x << 13; 37 | x ^= x >> 17; 38 | x ^= x << 5; 39 | rng.set(x); 40 | 41 | // This is a fast alternative to `x % n`. 
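        //
        // (Illustrative note added for clarity, not in the original
        // comment: the multiply-shift treats x / 2^32 as a fraction in
        // [0, 1) and scales it by n. For example, with n = 10 and
        // x = 0x8000_0000, ((x as u64) * 10) >> 32 == 5, so the
        // midpoint of the u32 range lands in bucket 5 of 10.)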
42 | // 43 | // Author: Daniel Lemire 44 | // Source: https://lemire.me/blog/2016/06/27/a-fast-alternative-to-the-modulo-reduction/ 45 | ((x.0 as u64).wrapping_mul(n as u64) >> 32) as u32 46 | }) 47 | } 48 | 49 | fn sled_bulk_load(c: &mut Criterion) { 50 | let mut count = 0_u32; 51 | let mut bytes = |len| -> Vec { 52 | count += 1; 53 | count.to_be_bytes().into_iter().cycle().take(len).copied().collect() 54 | }; 55 | 56 | let mut bench = |key_len, val_len| { 57 | let db = Config::new() 58 | .path(format!("bulk_k{}_v{}", key_len, val_len)) 59 | .temporary(true) 60 | .flush_every_ms(None) 61 | .open() 62 | .unwrap(); 63 | 64 | c.bench_function( 65 | &format!("bulk load key/value lengths {}/{}", key_len, val_len), 66 | |b| { 67 | b.iter(|| { 68 | db.insert(bytes(key_len), bytes(val_len)).unwrap(); 69 | }) 70 | }, 71 | ); 72 | }; 73 | 74 | for key_len in &[10_usize, 128, 256, 512] { 75 | for val_len in &[0_usize, 10, 128, 256, 512, 1024, 2048, 4096, 8192] { 76 | bench(*key_len, *val_len) 77 | } 78 | } 79 | } 80 | 81 | fn sled_monotonic_crud(c: &mut Criterion) { 82 | let db = Config::new().temporary(true).flush_every_ms(None).open().unwrap(); 83 | 84 | c.bench_function("monotonic inserts", |b| { 85 | let mut count = 0_u32; 86 | b.iter(|| { 87 | count += 1; 88 | db.insert(count.to_be_bytes(), vec![]).unwrap(); 89 | }) 90 | }); 91 | 92 | c.bench_function("monotonic gets", |b| { 93 | let mut count = 0_u32; 94 | b.iter(|| { 95 | count += 1; 96 | db.get(count.to_be_bytes()).unwrap(); 97 | }) 98 | }); 99 | 100 | c.bench_function("monotonic removals", |b| { 101 | let mut count = 0_u32; 102 | b.iter(|| { 103 | count += 1; 104 | db.remove(count.to_be_bytes()).unwrap(); 105 | }) 106 | }); 107 | } 108 | 109 | fn sled_random_crud(c: &mut Criterion) { 110 | const SIZE: u32 = 65536; 111 | 112 | let db = Config::new().temporary(true).flush_every_ms(None).open().unwrap(); 113 | 114 | c.bench_function("random inserts", |b| { 115 | b.iter(|| { 116 | let k = random(SIZE).to_be_bytes(); 117 | db.insert(k, vec![]).unwrap(); 118 | }) 119 | }); 120 | 121 | c.bench_function("random gets", |b| { 122 | b.iter(|| { 123 | let k = random(SIZE).to_be_bytes(); 124 | db.get(k).unwrap(); 125 | }) 126 | }); 127 | 128 | c.bench_function("random removals", |b| { 129 | b.iter(|| { 130 | let k = random(SIZE).to_be_bytes(); 131 | db.remove(k).unwrap(); 132 | }) 133 | }); 134 | } 135 | 136 | fn sled_empty_opens(c: &mut Criterion) { 137 | let _ = std::fs::remove_dir_all("empty_opens"); 138 | c.bench_function("empty opens", |b| { 139 | b.iter(|| { 140 | Config::new() 141 | .path(format!("empty_opens/{}.db", counter())) 142 | .flush_every_ms(None) 143 | .open() 144 | .unwrap() 145 | }) 146 | }); 147 | let _ = std::fs::remove_dir_all("empty_opens"); 148 | } 149 | 150 | criterion_group!( 151 | benches, 152 | sled_bulk_load, 153 | sled_monotonic_crud, 154 | sled_random_crud, 155 | sled_empty_opens 156 | ); 157 | criterion_main!(benches); 158 | -------------------------------------------------------------------------------- /src/dll.rs: -------------------------------------------------------------------------------- 1 | #![allow(unsafe_code)] 2 | 3 | use std::ptr; 4 | 5 | use crate::PageId; 6 | 7 | /// A simple doubly linked list for use in the `Lru` 8 | #[derive(Debug)] 9 | pub(crate) struct Node { 10 | inner: PageId, 11 | next: *mut Node, 12 | prev: *mut Node, 13 | } 14 | 15 | impl Node { 16 | fn unwire(&mut self) { 17 | unsafe { 18 | if !self.prev.is_null() { 19 | (*self.prev).next = self.next; 20 | } 21 | 22 | if !self.next.is_null() { 
23 | (*self.next).prev = self.prev; 24 | } 25 | } 26 | 27 | self.next = ptr::null_mut(); 28 | self.prev = ptr::null_mut(); 29 | } 30 | } 31 | 32 | /// A simple non-cyclical doubly linked 33 | /// list where items can be efficiently 34 | /// removed from the middle, for the purposes 35 | /// of backing an LRU cache. 36 | pub struct DoublyLinkedList { 37 | head: *mut Node, 38 | tail: *mut Node, 39 | len: usize, 40 | } 41 | 42 | unsafe impl Send for DoublyLinkedList {} 43 | 44 | impl Drop for DoublyLinkedList { 45 | fn drop(&mut self) { 46 | let mut cursor = self.head; 47 | while !cursor.is_null() { 48 | unsafe { 49 | let node = Box::from_raw(cursor); 50 | 51 | // don't need to check for cycles 52 | // because this Dll is non-cyclical 53 | cursor = node.prev; 54 | 55 | // this happens without the manual drop, 56 | // but we keep it for explicitness 57 | drop(node); 58 | } 59 | } 60 | } 61 | } 62 | 63 | impl Default for DoublyLinkedList { 64 | fn default() -> Self { 65 | Self { head: ptr::null_mut(), tail: ptr::null_mut(), len: 0 } 66 | } 67 | } 68 | 69 | impl DoublyLinkedList { 70 | pub(crate) const fn len(&self) -> usize { 71 | self.len 72 | } 73 | 74 | pub(crate) fn push_head(&mut self, item: PageId) -> *mut Node { 75 | self.len += 1; 76 | 77 | let node = Node { inner: item, next: ptr::null_mut(), prev: self.head }; 78 | 79 | let ptr = Box::into_raw(Box::new(node)); 80 | 81 | self.push_head_ptr(ptr) 82 | } 83 | 84 | fn push_head_ptr(&mut self, ptr: *mut Node) -> *mut Node { 85 | if !self.head.is_null() { 86 | unsafe { 87 | (*self.head).next = ptr; 88 | (*ptr).prev = self.head; 89 | } 90 | } 91 | 92 | if self.tail.is_null() { 93 | self.tail = ptr; 94 | } 95 | 96 | self.head = ptr; 97 | 98 | ptr 99 | } 100 | 101 | #[cfg(test)] 102 | pub(crate) fn push_tail(&mut self, item: PageId) { 103 | self.len += 1; 104 | 105 | let node = Node { inner: item, next: self.tail, prev: ptr::null_mut() }; 106 | 107 | let ptr = Box::into_raw(Box::new(node)); 108 | 109 | if !self.tail.is_null() { 110 | unsafe { 111 | (*self.tail).prev = ptr; 112 | } 113 | } 114 | 115 | if self.head.is_null() { 116 | self.head = ptr; 117 | } 118 | 119 | self.tail = ptr; 120 | } 121 | 122 | pub(crate) fn promote(&mut self, ptr: *mut Node) -> *mut Node { 123 | if self.head == ptr { 124 | return ptr; 125 | } 126 | 127 | unsafe { 128 | if self.tail == ptr { 129 | self.tail = (*ptr).next; 130 | } 131 | 132 | if self.head == ptr { 133 | self.head = (*ptr).prev; 134 | } 135 | 136 | (*ptr).unwire(); 137 | 138 | self.push_head_ptr(ptr) 139 | } 140 | } 141 | 142 | #[cfg(test)] 143 | pub(crate) fn pop_head(&mut self) -> Option { 144 | if self.head.is_null() { 145 | return None; 146 | } 147 | 148 | self.len -= 1; 149 | 150 | unsafe { 151 | let mut head = Box::from_raw(self.head); 152 | 153 | if self.head == self.tail { 154 | self.tail = ptr::null_mut(); 155 | } 156 | 157 | self.head = head.prev; 158 | 159 | head.unwire(); 160 | 161 | Some(head.inner) 162 | } 163 | } 164 | 165 | pub(crate) fn pop_tail(&mut self) -> Option { 166 | if self.tail.is_null() { 167 | return None; 168 | } 169 | 170 | self.len -= 1; 171 | 172 | unsafe { 173 | let mut tail = Box::from_raw(self.tail); 174 | 175 | if self.head == self.tail { 176 | self.head = ptr::null_mut(); 177 | } 178 | 179 | self.tail = tail.next; 180 | 181 | tail.unwire(); 182 | 183 | Some(tail.inner) 184 | } 185 | } 186 | 187 | #[cfg(test)] 188 | pub(crate) fn into_vec(mut self) -> Vec { 189 | let mut res = vec![]; 190 | while let Some(val) = self.pop_head() { 191 | res.push(val); 192 | } 193 | res 
194 | } 195 | } 196 | 197 | #[allow(unused_results)] 198 | #[test] 199 | fn basic_functionality() { 200 | let mut dll = DoublyLinkedList::default(); 201 | dll.push_head(5); 202 | dll.push_tail(6); 203 | dll.push_head(4); 204 | dll.push_tail(7); 205 | dll.push_tail(8); 206 | dll.push_head(3); 207 | dll.push_tail(9); 208 | dll.push_head(2); 209 | dll.push_head(1); 210 | assert_eq!(dll.len(), 9); 211 | assert_eq!(dll.into_vec(), vec![1, 2, 3, 4, 5, 6, 7, 8, 9]); 212 | } 213 | -------------------------------------------------------------------------------- /src/flusher.rs: -------------------------------------------------------------------------------- 1 | use std::thread; 2 | use std::time::Duration; 3 | 4 | use parking_lot::{Condvar, Mutex}; 5 | 6 | use super::*; 7 | 8 | #[derive(Debug, Clone, Copy)] 9 | pub(crate) enum ShutdownState { 10 | Running, 11 | ShuttingDown, 12 | ShutDown, 13 | } 14 | 15 | impl ShutdownState { 16 | fn is_running(self) -> bool { 17 | if let ShutdownState::Running = self { true } else { false } 18 | } 19 | 20 | fn is_shutdown(self) -> bool { 21 | if let ShutdownState::ShutDown = self { true } else { false } 22 | } 23 | } 24 | 25 | #[derive(Debug)] 26 | pub(crate) struct Flusher { 27 | shutdown: Arc>, 28 | sc: Arc, 29 | join_handle: Mutex>>, 30 | } 31 | 32 | impl Flusher { 33 | /// Spawns a thread that periodically calls `callback` until dropped. 34 | pub(crate) fn new( 35 | name: String, 36 | pagecache: PageCache, 37 | flush_every_ms: u64, 38 | ) -> Self { 39 | #[allow(clippy::mutex_atomic)] // mutex used in CondVar below 40 | let shutdown = Arc::new(Mutex::new(ShutdownState::Running)); 41 | let sc = Arc::new(Condvar::new()); 42 | 43 | let join_handle = thread::Builder::new() 44 | .name(name) 45 | .spawn({ 46 | let shutdown = shutdown.clone(); 47 | let sc = sc.clone(); 48 | move || run(&shutdown, &sc, &pagecache, flush_every_ms) 49 | }) 50 | .unwrap(); 51 | 52 | Self { shutdown, sc, join_handle: Mutex::new(Some(join_handle)) } 53 | } 54 | } 55 | 56 | fn run( 57 | shutdown: &Arc>, 58 | sc: &Arc, 59 | pagecache: &PageCache, 60 | flush_every_ms: u64, 61 | ) { 62 | let flush_every = Duration::from_millis(flush_every_ms); 63 | let mut shutdown = shutdown.lock(); 64 | let mut wrote_data = false; 65 | while shutdown.is_running() || wrote_data { 66 | let before = std::time::Instant::now(); 67 | let cc = concurrency_control::read(); 68 | match pagecache.log.roll_iobuf() { 69 | Ok(0) => { 70 | wrote_data = false; 71 | if !shutdown.is_running() { 72 | break; 73 | } 74 | } 75 | Ok(_) => { 76 | wrote_data = true; 77 | if !shutdown.is_running() { 78 | // loop right away if we're in 79 | // shutdown mode, to flush data 80 | // more quickly. 81 | continue; 82 | } 83 | } 84 | Err(e) => { 85 | error!("failed to flush from periodic flush thread: {}", e); 86 | 87 | #[cfg(feature = "failpoints")] 88 | pagecache.set_failpoint(e); 89 | 90 | *shutdown = ShutdownState::ShutDown; 91 | 92 | // having held the mutex makes this linearized 93 | // with the notify below. 94 | drop(shutdown); 95 | 96 | let _notified = sc.notify_all(); 97 | return; 98 | } 99 | } 100 | drop(cc); 101 | 102 | // so we can spend a little effort 103 | // cleaning up the segments. try not to 104 | // spend more than half of our sleep 105 | // time rewriting pages though. 
106 | // 107 | // this looks weird because it's a rust-style do-while 108 | // where the conditional is the full body 109 | while { 110 | let made_progress = match pagecache.attempt_gc() { 111 | Err(e) => { 112 | error!( 113 | "failed to clean file from periodic flush thread: {}", 114 | e 115 | ); 116 | 117 | #[cfg(feature = "failpoints")] 118 | pagecache.set_failpoint(e); 119 | 120 | *shutdown = ShutdownState::ShutDown; 121 | 122 | // having held the mutex makes this linearized 123 | // with the notify below. 124 | drop(shutdown); 125 | 126 | let _notified = sc.notify_all(); 127 | return; 128 | } 129 | Ok(false) => false, 130 | Ok(true) => true, 131 | }; 132 | made_progress 133 | && shutdown.is_running() 134 | && before.elapsed() < flush_every / 2 135 | } {} 136 | 137 | if let Err(e) = pagecache.config.file.sync_all() { 138 | error!("failed to fsync from periodic flush thread: {}", e); 139 | } 140 | 141 | let sleep_duration = flush_every 142 | .checked_sub(before.elapsed()) 143 | .unwrap_or_else(|| Duration::from_millis(1)); 144 | 145 | if shutdown.is_running() { 146 | // only sleep before the next flush if we are 147 | // running normally. if we're shutting down, 148 | // flush faster. 149 | sc.wait_for(&mut shutdown, sleep_duration); 150 | } 151 | } 152 | 153 | *shutdown = ShutdownState::ShutDown; 154 | 155 | // having held the mutex makes this linearized 156 | // with the notify below. 157 | drop(shutdown); 158 | 159 | let _notified = sc.notify_all(); 160 | } 161 | 162 | impl Drop for Flusher { 163 | fn drop(&mut self) { 164 | let mut shutdown = self.shutdown.lock(); 165 | if shutdown.is_running() { 166 | *shutdown = ShutdownState::ShuttingDown; 167 | let _notified = self.sc.notify_all(); 168 | } 169 | 170 | #[allow(unused_variables)] 171 | let mut count = 0; 172 | while !shutdown.is_shutdown() { 173 | let _ = self.sc.wait_for(&mut shutdown, Duration::from_millis(100)); 174 | count += 1; 175 | 176 | testing_assert!(count < 5); 177 | } 178 | 179 | let mut join_handle_opt = self.join_handle.lock(); 180 | if let Some(join_handle) = join_handle_opt.take() { 181 | if let Err(e) = join_handle.join() { 182 | error!("error joining Periodic thread: {:?}", e); 183 | } 184 | } 185 | } 186 | } 187 | -------------------------------------------------------------------------------- /src/threadpool.rs: -------------------------------------------------------------------------------- 1 | //! A simple adaptive threadpool that returns a oneshot future. 2 | 3 | use std::{ 4 | collections::VecDeque, 5 | sync::atomic::{ 6 | AtomicBool, 7 | Ordering::{Acquire, Relaxed, Release, SeqCst}, 8 | }, 9 | thread, 10 | time::{Duration, Instant}, 11 | }; 12 | 13 | use parking_lot::{Condvar, Mutex}; 14 | 15 | use crate::{ 16 | debug_delay, warn, AtomicU64, AtomicUsize, Error, Lazy, OneShot, Result, 17 | }; 18 | 19 | // This is lower for CI reasons. 
20 | #[cfg(windows)] 21 | const MAX_THREADS: usize = 16; 22 | 23 | #[cfg(not(windows))] 24 | const MAX_THREADS: usize = 128; 25 | 26 | const DESIRED_WAITING_THREADS: usize = 7; 27 | 28 | static WAITING_THREAD_COUNT: AtomicUsize = AtomicUsize::new(0); 29 | static TOTAL_THREAD_COUNT: AtomicUsize = AtomicUsize::new(0); 30 | static SPAWNS: AtomicUsize = AtomicUsize::new(0); 31 | static SPAWNING: AtomicBool = AtomicBool::new(false); 32 | static SUBMITTED: AtomicU64 = AtomicU64::new(0); 33 | static COMPLETED: AtomicU64 = AtomicU64::new(0); 34 | static QUEUE: Lazy Queue> = Lazy::new(init_queue); 35 | static BROKEN: AtomicBool = AtomicBool::new(false); 36 | 37 | type Work = Box; 38 | 39 | fn init_queue() -> Queue { 40 | debug_delay(); 41 | for _ in 0..DESIRED_WAITING_THREADS { 42 | debug_delay(); 43 | if let Err(e) = spawn_new_thread(true) { 44 | log::error!("failed to initialize threadpool: {:?}", e); 45 | } 46 | } 47 | Queue { cv: Condvar::new(), mu: Mutex::new(VecDeque::new()) } 48 | } 49 | 50 | struct Queue { 51 | cv: Condvar, 52 | mu: Mutex>, 53 | } 54 | 55 | impl Queue { 56 | fn recv_timeout(&self, duration: Duration) -> Option { 57 | let mut queue = self.mu.lock(); 58 | 59 | let cutoff = Instant::now() + duration; 60 | 61 | while queue.is_empty() { 62 | WAITING_THREAD_COUNT.fetch_add(1, SeqCst); 63 | let res = self.cv.wait_until(&mut queue, cutoff); 64 | WAITING_THREAD_COUNT.fetch_sub(1, SeqCst); 65 | if res.timed_out() { 66 | break; 67 | } 68 | } 69 | 70 | queue.pop_front() 71 | } 72 | 73 | fn try_recv(&self) -> Option { 74 | let mut queue = self.mu.lock(); 75 | queue.pop_front() 76 | } 77 | 78 | fn send(&self, work: Work) -> usize { 79 | let mut queue = self.mu.lock(); 80 | queue.push_back(work); 81 | 82 | let len = queue.len(); 83 | 84 | // having held the mutex makes this linearized 85 | // with the notify below. 86 | drop(queue); 87 | 88 | self.cv.notify_all(); 89 | 90 | len 91 | } 92 | } 93 | 94 | fn perform_work(is_immortal: bool) { 95 | let wait_limit = Duration::from_secs(1); 96 | 97 | let mut performed = 0; 98 | let mut contiguous_overshoots = 0; 99 | 100 | while is_immortal || performed < 5 || contiguous_overshoots < 3 { 101 | debug_delay(); 102 | let task_res = QUEUE.recv_timeout(wait_limit); 103 | 104 | if let Some(task) = task_res { 105 | WAITING_THREAD_COUNT.fetch_sub(1, SeqCst); 106 | (task)(); 107 | COMPLETED.fetch_add(1, Release); 108 | WAITING_THREAD_COUNT.fetch_add(1, SeqCst); 109 | performed += 1; 110 | } 111 | 112 | while let Some(task) = QUEUE.try_recv() { 113 | debug_delay(); 114 | WAITING_THREAD_COUNT.fetch_sub(1, SeqCst); 115 | (task)(); 116 | COMPLETED.fetch_add(1, Release); 117 | WAITING_THREAD_COUNT.fetch_add(1, SeqCst); 118 | performed += 1; 119 | } 120 | 121 | debug_delay(); 122 | 123 | let waiting = WAITING_THREAD_COUNT.load(Acquire); 124 | 125 | if waiting > DESIRED_WAITING_THREADS { 126 | contiguous_overshoots += 1; 127 | } else { 128 | contiguous_overshoots = 0; 129 | } 130 | } 131 | } 132 | 133 | // Create up to MAX_THREADS dynamic blocking task worker threads. 134 | // Dynamic threads will terminate themselves if they don't 135 | // receive any work after one second. 
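//
// An illustrative sketch (not in the original source) of how the
// `spawn` function at the bottom of this file is meant to be used from
// inside the crate; `read_block_from_disk` is a hypothetical stand-in
// for whatever blocking work is being offloaded:
//
//     let promise = threadpool::spawn(|| read_block_from_disk())?;
//     let block = promise.wait();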
136 | fn maybe_spawn_new_thread() -> Result<()> { 137 | debug_delay(); 138 | let total_workers = TOTAL_THREAD_COUNT.load(Acquire); 139 | debug_delay(); 140 | let waiting_threads = WAITING_THREAD_COUNT.load(Acquire); 141 | 142 | if waiting_threads >= DESIRED_WAITING_THREADS 143 | || total_workers >= MAX_THREADS 144 | { 145 | return Ok(()); 146 | } 147 | 148 | if SPAWNING.compare_exchange_weak(false, true, Acquire, Acquire).is_ok() { 149 | spawn_new_thread(false)?; 150 | } 151 | 152 | Ok(()) 153 | } 154 | 155 | fn spawn_new_thread(is_immortal: bool) -> Result<()> { 156 | if BROKEN.load(Relaxed) { 157 | return Err(Error::ReportableBug( 158 | "IO thread unexpectedly panicked. please report \ 159 | this bug on the sled github repo." 160 | .to_string(), 161 | )); 162 | } 163 | 164 | let spawn_id = SPAWNS.fetch_add(1, SeqCst); 165 | 166 | TOTAL_THREAD_COUNT.fetch_add(1, SeqCst); 167 | let spawn_res = thread::Builder::new() 168 | .name(format!("sled-io-{}", spawn_id)) 169 | .spawn(move || { 170 | SPAWNING.store(false, SeqCst); 171 | debug_delay(); 172 | let res = std::panic::catch_unwind(|| perform_work(is_immortal)); 173 | TOTAL_THREAD_COUNT.fetch_sub(1, SeqCst); 174 | if is_immortal { 175 | // IO thread panicked, shut down the system 176 | BROKEN.store(true, SeqCst); 177 | panic!( 178 | "IO thread unexpectedly panicked. please report \ 179 | this bug on the sled github repo. error: {:?}", 180 | res 181 | ); 182 | } 183 | }); 184 | 185 | if let Err(e) = spawn_res { 186 | static E: AtomicBool = AtomicBool::new(false); 187 | 188 | SPAWNING.store(false, SeqCst); 189 | 190 | if E.compare_exchange(false, true, Relaxed, Relaxed).is_ok() { 191 | // only execute this once 192 | warn!( 193 | "Failed to dynamically increase the threadpool size: {:?}.", 194 | e, 195 | ) 196 | } 197 | } 198 | 199 | Ok(()) 200 | } 201 | 202 | /// Spawn a function on the threadpool. 203 | pub fn spawn(work: F) -> Result> 204 | where 205 | F: FnOnce() -> R + Send + 'static, 206 | R: Send + 'static + Sized, 207 | { 208 | SUBMITTED.fetch_add(1, Acquire); 209 | let (promise_filler, promise) = OneShot::pair(); 210 | let task = move || { 211 | promise_filler.fill((work)()); 212 | }; 213 | 214 | let depth = QUEUE.send(Box::new(task)); 215 | 216 | if depth > DESIRED_WAITING_THREADS { 217 | maybe_spawn_new_thread()?; 218 | } 219 | 220 | Ok(promise) 221 | } 222 | -------------------------------------------------------------------------------- /scripts/execution_explorer.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/gdb --command 2 | 3 | """ 4 | a simple python GDB script for running multithreaded 5 | programs in a way that is "deterministic enough" 6 | to tease out and replay interesting bugs. 
7 | 8 | Tyler Neely 25 Sept 2017 9 | t@jujit.su 10 | 11 | references: 12 | https://sourceware.org/gdb/onlinedocs/gdb/All_002dStop-Mode.html 13 | https://sourceware.org/gdb/onlinedocs/gdb/Non_002dStop-Mode.html 14 | https://sourceware.org/gdb/onlinedocs/gdb/Threads-In-Python.html 15 | https://sourceware.org/gdb/onlinedocs/gdb/Events-In-Python.html 16 | https://blog.0x972.info/index.php?tag=gdb.py 17 | """ 18 | 19 | import gdb 20 | import random 21 | 22 | ############################################################################### 23 | # config # 24 | ############################################################################### 25 | # set this to a number for reproducing results or None to explore randomly 26 | seed = 156112673742 # None # 951931004895 27 | 28 | # set this to the number of valid threads in the program 29 | # {2, 3} assumes a main thread that waits on 2 workers. 30 | # {1, ... N} assumes all of the first N threads are to be explored 31 | threads_whitelist = {2, 3} 32 | 33 | # set this to the file of the binary to explore 34 | filename = "target/debug/binary" 35 | 36 | # set this to the place the threads should rendezvous before exploring 37 | entrypoint = "src/main.rs:8" 38 | 39 | # set this to after the threads are done 40 | exitpoint = "src/main.rs:12" 41 | 42 | # invariant unreachable points that should never be accessed 43 | unreachable = [ 44 | "panic_unwind::imp::panic" 45 | ] 46 | 47 | # set this to the locations you want to test interleavings for 48 | interesting = [ 49 | "src/main.rs:8", 50 | "src/main.rs:9" 51 | ] 52 | 53 | # uncomment this to output the specific commands issued to gdb 54 | gdb.execute("set trace-commands on") 55 | 56 | ############################################################################### 57 | ############################################################################### 58 | 59 | 60 | class UnreachableBreakpoint(gdb.Breakpoint): 61 | pass 62 | 63 | 64 | class DoneBreakpoint(gdb.Breakpoint): 65 | pass 66 | 67 | 68 | class InterestingBreakpoint(gdb.Breakpoint): 69 | pass 70 | 71 | 72 | class DeterministicExecutor: 73 | def __init__(self, seed=None): 74 | if seed: 75 | print("seeding with", seed) 76 | self.seed = seed 77 | random.seed(seed) 78 | else: 79 | # pick a random new seed if not provided with one 80 | self.reseed() 81 | 82 | gdb.execute("file " + filename) 83 | 84 | # non-stop is necessary to provide thread-specific 85 | # information when breakpoints are hit. 86 | gdb.execute("set non-stop on") 87 | gdb.execute("set confirm off") 88 | 89 | self.ready = set() 90 | self.finished = set() 91 | 92 | def reseed(self): 93 | random.seed() 94 | self.seed = random.randrange(1e12) 95 | print("reseeding with", self.seed) 96 | random.seed(self.seed) 97 | 98 | def restart(self): 99 | # reset inner state 100 | self.ready = set() 101 | self.finished = set() 102 | 103 | # disconnect callbacks 104 | gdb.events.stop.disconnect(self.scheduler_callback) 105 | gdb.events.exited.disconnect(self.exit_callback) 106 | 107 | # nuke all breakpoints 108 | gdb.execute("d") 109 | 110 | # end execution 111 | gdb.execute("k") 112 | 113 | # pick new seed 114 | self.reseed() 115 | 116 | self.run() 117 | 118 | def rendezvous_callback(self, event): 119 | try: 120 | self.ready.add(event.inferior_thread.num) 121 | if len(self.ready) == len(threads_whitelist): 122 | self.run_schedule() 123 | except Exception as e: 124 | # this will be thrown if breakpoint is not a part of event, 125 | # like when the event was stopped for another reason. 
126 | print(e) 127 | 128 | def run(self): 129 | gdb.execute("b " + entrypoint) 130 | 131 | gdb.events.stop.connect(self.rendezvous_callback) 132 | gdb.events.exited.connect(self.exit_callback) 133 | 134 | gdb.execute("r") 135 | 136 | def run_schedule(self): 137 | print("running schedule") 138 | gdb.execute("d") 139 | gdb.events.stop.disconnect(self.rendezvous_callback) 140 | gdb.events.stop.connect(self.scheduler_callback) 141 | 142 | for bp in interesting: 143 | InterestingBreakpoint(bp) 144 | 145 | for bp in unreachable: 146 | UnreachableBreakpoint(bp) 147 | 148 | DoneBreakpoint(exitpoint) 149 | 150 | self.pick() 151 | 152 | def pick(self): 153 | threads = self.runnable_threads() 154 | if not threads: 155 | print("restarting execution after running out of valid threads") 156 | self.restart() 157 | return 158 | 159 | thread = random.choice(threads) 160 | 161 | gdb.execute("t " + str(thread.num)) 162 | gdb.execute("c") 163 | 164 | def scheduler_callback(self, event): 165 | if not isinstance(event, gdb.BreakpointEvent): 166 | print("WTF sched callback got", event.__dict__) 167 | return 168 | 169 | if isinstance(event.breakpoint, DoneBreakpoint): 170 | self.finished.add(event.inferior_thread.num) 171 | elif isinstance(event.breakpoint, UnreachableBreakpoint): 172 | print("!" * 80) 173 | print("unreachable breakpoint triggered with seed", self.seed) 174 | print("!" * 80) 175 | gdb.events.exited.disconnect(self.exit_callback) 176 | gdb.execute("q") 177 | else: 178 | print("thread", event.inferior_thread.num, 179 | "hit breakpoint at", event.breakpoint.location) 180 | 181 | self.pick() 182 | 183 | def runnable_threads(self): 184 | threads = gdb.selected_inferior().threads() 185 | 186 | def f(it): 187 | return (it.is_valid() and not 188 | it.is_exited() and 189 | it.num in threads_whitelist and 190 | it.num not in self.finished) 191 | 192 | good_threads = [it for it in threads if f(it)] 193 | good_threads.sort(key=lambda it: it.num) 194 | 195 | return good_threads 196 | 197 | def exit_callback(self, event): 198 | try: 199 | if event.exit_code != 0: 200 | print("!" * 80) 201 | print("interesting exit with seed", self.seed) 202 | print("!" * 80) 203 | else: 204 | print("happy exit") 205 | self.restart() 206 | 207 | gdb.execute("q") 208 | except Exception as e: 209 | pass 210 | 211 | de = DeterministicExecutor(seed) 212 | de.run() 213 | -------------------------------------------------------------------------------- /src/pagecache/pagetable.rs: -------------------------------------------------------------------------------- 1 | //! A simple wait-free, grow-only pagetable, assumes a dense keyspace. 
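//!
//! An illustrative note (not part of the original module docs): a
//! `PageId` is split in two by `split_fanout` below. The low
//! `NODE2_FAN_FACTOR` (18) bits index into a second-level `Node2`,
//! and the remaining high bits select which `Node2` hangs off the root
//! `Node1`. For example, pid `0b111_1111_1111_1111_1111` (19 set bits)
//! splits into `(0b1, 0b11_1111_1111_1111_1111)`, matching the
//! `fanout_functionality` test at the end of this file.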
2 | #![allow(unsafe_code)] 3 | 4 | use std::{ 5 | alloc::{alloc_zeroed, Layout}, 6 | convert::TryFrom, 7 | mem::{align_of, size_of}, 8 | sync::atomic::Ordering::{Acquire, Relaxed, Release}, 9 | }; 10 | 11 | use crossbeam_epoch::{pin, Atomic, Guard, Owned, Shared}; 12 | 13 | use crate::{ 14 | debug_delay, 15 | pagecache::{constants::MAX_PID_BITS, Page, PageView}, 16 | }; 17 | 18 | #[cfg(feature = "metrics")] 19 | use crate::{Measure, M}; 20 | 21 | #[allow(unused)] 22 | #[doc(hidden)] 23 | pub const PAGETABLE_NODE_SZ: usize = size_of::(); 24 | 25 | const NODE2_FAN_FACTOR: usize = 18; 26 | const NODE1_FAN_OUT: usize = 1 << (MAX_PID_BITS - NODE2_FAN_FACTOR); 27 | const NODE2_FAN_OUT: usize = 1 << NODE2_FAN_FACTOR; 28 | const FAN_MASK: u64 = (NODE2_FAN_OUT - 1) as u64; 29 | 30 | pub type PageId = u64; 31 | 32 | struct Node1 { 33 | children: [Atomic; NODE1_FAN_OUT], 34 | } 35 | 36 | struct Node2 { 37 | children: [Atomic; NODE2_FAN_OUT], 38 | } 39 | 40 | impl Node1 { 41 | fn new() -> Owned { 42 | let size = size_of::(); 43 | let align = align_of::(); 44 | 45 | unsafe { 46 | let layout = Layout::from_size_align_unchecked(size, align); 47 | 48 | #[allow(clippy::cast_ptr_alignment)] 49 | let ptr = alloc_zeroed(layout) as *mut Self; 50 | 51 | Owned::from_raw(ptr) 52 | } 53 | } 54 | } 55 | 56 | impl Node2 { 57 | fn new() -> Owned { 58 | let size = size_of::(); 59 | let align = align_of::(); 60 | 61 | unsafe { 62 | let layout = Layout::from_size_align_unchecked(size, align); 63 | 64 | #[allow(clippy::cast_ptr_alignment)] 65 | let ptr = alloc_zeroed(layout) as *mut Self; 66 | 67 | Owned::from_raw(ptr) 68 | } 69 | } 70 | } 71 | 72 | impl Drop for Node1 { 73 | fn drop(&mut self) { 74 | drop_iter(self.children.iter()); 75 | } 76 | } 77 | 78 | impl Drop for Node2 { 79 | fn drop(&mut self) { 80 | drop_iter(self.children.iter()); 81 | } 82 | } 83 | 84 | fn drop_iter(iter: core::slice::Iter<'_, Atomic>) { 85 | let guard = pin(); 86 | for child in iter { 87 | let shared_child = child.load(Relaxed, &guard); 88 | if shared_child.is_null() { 89 | // this does not leak because the PageTable is 90 | // assumed to be dense. 91 | break; 92 | } 93 | unsafe { 94 | drop(shared_child.into_owned()); 95 | } 96 | } 97 | } 98 | 99 | /// A simple lock-free radix tree. 100 | pub struct PageTable { 101 | head: Atomic, 102 | } 103 | 104 | impl Default for PageTable { 105 | fn default() -> Self { 106 | let head = Node1::new(); 107 | Self { head: Atomic::from(head) } 108 | } 109 | } 110 | 111 | impl PageTable { 112 | /// # Panics 113 | /// 114 | /// will panic if the item is not null already, 115 | /// which represents a serious failure to 116 | /// properly handle lifecycles of pages in the 117 | /// using system. 118 | pub(crate) fn insert<'g>( 119 | &self, 120 | pid: PageId, 121 | item: Page, 122 | guard: &'g Guard, 123 | ) -> PageView<'g> { 124 | debug_delay(); 125 | let tip = self.traverse(pid, guard); 126 | 127 | let shared = Owned::new(item).into_shared(guard); 128 | let old = tip.swap(shared, Release, guard); 129 | assert!(old.is_null()); 130 | 131 | PageView { read: shared, entry: tip } 132 | } 133 | 134 | /// Try to get a value from the tree. 135 | /// 136 | /// # Panics 137 | /// 138 | /// Panics if the page has never been allocated. 
139 | pub(crate) fn get<'g>( 140 | &self, 141 | pid: PageId, 142 | guard: &'g Guard, 143 | ) -> PageView<'g> { 144 | #[cfg(feature = "metrics")] 145 | let _measure = Measure::new(&M.get_pagetable); 146 | debug_delay(); 147 | let tip = self.traverse(pid, guard); 148 | 149 | debug_delay(); 150 | let res = tip.load(Acquire, guard); 151 | 152 | assert!(!res.is_null()); 153 | 154 | PageView { read: res, entry: tip } 155 | } 156 | 157 | pub(crate) fn contains_pid(&self, pid: PageId, guard: &Guard) -> bool { 158 | #[cfg(feature = "metrics")] 159 | let _measure = Measure::new(&M.get_pagetable); 160 | debug_delay(); 161 | let tip = self.traverse(pid, guard); 162 | 163 | debug_delay(); 164 | let res = tip.load(Acquire, guard); 165 | 166 | !res.is_null() 167 | } 168 | 169 | fn traverse<'g>(&self, k: PageId, guard: &'g Guard) -> &'g Atomic { 170 | let (l1k, l2k) = split_fanout(k); 171 | 172 | debug_delay(); 173 | let head = self.head.load(Acquire, guard); 174 | 175 | debug_delay(); 176 | let l1 = unsafe { &head.deref().children }; 177 | 178 | debug_delay(); 179 | let mut l2_ptr = l1[l1k].load(Acquire, guard); 180 | 181 | if l2_ptr.is_null() { 182 | let next_child = Node2::new(); 183 | 184 | debug_delay(); 185 | let ret = l1[l1k].compare_and_set( 186 | Shared::null(), 187 | next_child, 188 | Release, 189 | guard, 190 | ); 191 | 192 | l2_ptr = match ret { 193 | Ok(next_child) => next_child, 194 | Err(returned) => { 195 | drop(returned.new); 196 | returned.current 197 | } 198 | }; 199 | } 200 | 201 | debug_delay(); 202 | let l2 = unsafe { &l2_ptr.deref().children }; 203 | 204 | &l2[l2k] 205 | } 206 | } 207 | 208 | #[inline] 209 | fn split_fanout(id: PageId) -> (usize, usize) { 210 | // right shift 32 on 32-bit pointer systems panics 211 | #[cfg(target_pointer_width = "64")] 212 | assert!( 213 | id <= 1 << MAX_PID_BITS, 214 | "trying to access key of {}, which is \ 215 | higher than 2 ^ {}", 216 | id, 217 | MAX_PID_BITS, 218 | ); 219 | 220 | let left = id >> NODE2_FAN_FACTOR; 221 | let right = id & FAN_MASK; 222 | 223 | (safe_usize(left), safe_usize(right)) 224 | } 225 | 226 | #[inline] 227 | fn safe_usize(value: PageId) -> usize { 228 | usize::try_from(value).unwrap() 229 | } 230 | 231 | impl Drop for PageTable { 232 | fn drop(&mut self) { 233 | let guard = pin(); 234 | let head = self.head.load(Relaxed, &guard); 235 | unsafe { 236 | drop(head.into_owned()); 237 | } 238 | } 239 | } 240 | 241 | #[test] 242 | fn fanout_functionality() { 243 | assert_eq!( 244 | split_fanout(0b11_1111_1111_1111_1111), 245 | (0, 0b11_1111_1111_1111_1111) 246 | ); 247 | assert_eq!( 248 | split_fanout(0b111_1111_1111_1111_1111), 249 | (0b1, 0b11_1111_1111_1111_1111) 250 | ); 251 | } 252 | -------------------------------------------------------------------------------- /src/result.rs: -------------------------------------------------------------------------------- 1 | use std::{ 2 | cmp::PartialEq, 3 | error::Error as StdError, 4 | fmt::{self, Display}, 5 | io, 6 | }; 7 | 8 | #[cfg(feature = "testing")] 9 | use backtrace::Backtrace; 10 | 11 | use crate::{ 12 | pagecache::{DiskPtr, PageView}, 13 | IVec, 14 | }; 15 | 16 | /// The top-level result type for dealing with 17 | /// fallible operations. The errors tend to 18 | /// be fail-stop, and nested results are used 19 | /// in cases where the outer fail-stop error can 20 | /// have try `?` used on it, exposing the inner 21 | /// operation that is expected to fail under 22 | /// normal operation. 
The philosophy behind this 23 | /// is detailed [on the sled blog](https://sled.rs/errors). 24 | pub type Result<T> = std::result::Result<T, Error>; 25 | 26 | /// A compare and swap result. If the CAS is successful, 27 | /// the new `PagePtr` will be returned as `Ok`. Otherwise, 28 | /// the `Err` will contain a tuple of the current `PagePtr` 29 | /// and the old value that could not be set atomically. 30 | pub(crate) type CasResult<'a, R> = 31 | std::result::Result<PageView<'a>, Option<(PageView<'a>, R)>>; 32 | 33 | /// An Error type encapsulating various issues that may come up 34 | /// in the operation of a `Db`. 35 | #[derive(Debug)] 36 | pub enum Error { 37 | /// The underlying collection no longer exists. 38 | CollectionNotFound(IVec), 39 | /// The system has been used in an unsupported way. 40 | Unsupported(String), 41 | /// An unexpected bug has happened. Please open an issue on github! 42 | ReportableBug(String), 43 | /// A read or write error has happened when interacting with the file 44 | /// system. 45 | Io(io::Error), 46 | /// Corruption has been detected in the storage file. 47 | Corruption { 48 | /// The file location that corrupted data was found at. 49 | at: Option<DiskPtr>, 50 | /// A backtrace for where the corruption was encountered. 51 | #[cfg(feature = "testing")] 52 | bt: Backtrace, 53 | /// A backtrace for where the corruption was encountered. 54 | #[cfg(not(feature = "testing"))] 55 | bt: (), 56 | }, 57 | // a failpoint has been triggered for testing purposes 58 | #[doc(hidden)] 59 | #[cfg(feature = "failpoints")] 60 | FailPoint, 61 | } 62 | 63 | impl Error { 64 | pub(crate) fn corruption(at: Option<DiskPtr>) -> Error { 65 | Error::Corruption { 66 | at, 67 | #[cfg(feature = "testing")] 68 | bt: Backtrace::new(), 69 | #[cfg(not(feature = "testing"))] 70 | bt: (), 71 | } 72 | } 73 | } 74 | 75 | impl Clone for Error { 76 | fn clone(&self) -> Self { 77 | use self::Error::*; 78 | 79 | match self { 80 | Io(ioe) => Io(io::Error::new(ioe.kind(), format!("{:?}", ioe))), 81 | CollectionNotFound(name) => CollectionNotFound(name.clone()), 82 | Unsupported(why) => Unsupported(why.clone()), 83 | ReportableBug(what) => ReportableBug(what.clone()), 84 | Corruption { at, bt } => Corruption { at: *at, bt: bt.clone() }, 85 | #[cfg(feature = "failpoints")] 86 | FailPoint => FailPoint, 87 | } 88 | } 89 | } 90 | 91 | impl Eq for Error {} 92 | 93 | impl PartialEq for Error { 94 | fn eq(&self, other: &Self) -> bool { 95 | use self::Error::*; 96 | 97 | match *self { 98 | CollectionNotFound(ref l) => { 99 | if let CollectionNotFound(ref r) = *other { 100 | l == r 101 | } else { 102 | false 103 | } 104 | } 105 | Unsupported(ref l) => { 106 | if let Unsupported(ref r) = *other { 107 | l == r 108 | } else { 109 | false 110 | } 111 | } 112 | ReportableBug(ref l) => { 113 | if let ReportableBug(ref r) = *other { 114 | l == r 115 | } else { 116 | false 117 | } 118 | } 119 | #[cfg(feature = "failpoints")] 120 | FailPoint => { 121 | if let FailPoint = *other { 122 | true 123 | } else { 124 | false 125 | } 126 | } 127 | Corruption { at: l, .. } => { 128 | if let Corruption { at: r, ..
} = *other { 129 | l == r 130 | } else { 131 | false 132 | } 133 | } 134 | Io(_) => false, 135 | } 136 | } 137 | } 138 | 139 | impl From for Error { 140 | #[inline] 141 | fn from(io_error: io::Error) -> Self { 142 | Error::Io(io_error) 143 | } 144 | } 145 | 146 | impl From for io::Error { 147 | fn from(error: Error) -> io::Error { 148 | use self::Error::*; 149 | use std::io::ErrorKind; 150 | match error { 151 | Io(ioe) => ioe, 152 | CollectionNotFound(name) => io::Error::new( 153 | ErrorKind::NotFound, 154 | format!("collection not found: {:?}", name), 155 | ), 156 | Unsupported(why) => io::Error::new( 157 | ErrorKind::InvalidInput, 158 | format!("operation not supported: {:?}", why), 159 | ), 160 | ReportableBug(what) => io::Error::new( 161 | ErrorKind::Other, 162 | format!( 163 | "unexpected bug! please report this bug at : {:?}", 164 | what 165 | ), 166 | ), 167 | Corruption { .. } => io::Error::new( 168 | ErrorKind::InvalidData, 169 | format!("corruption encountered: {:?}", error), 170 | ), 171 | #[cfg(feature = "failpoints")] 172 | FailPoint => io::Error::new(ErrorKind::Other, "failpoint"), 173 | } 174 | } 175 | } 176 | 177 | impl StdError for Error {} 178 | 179 | impl Display for Error { 180 | fn fmt( 181 | &self, 182 | f: &mut fmt::Formatter<'_>, 183 | ) -> std::result::Result<(), fmt::Error> { 184 | use self::Error::*; 185 | 186 | match *self { 187 | CollectionNotFound(ref name) => { 188 | write!(f, "Collection {:?} does not exist", name,) 189 | } 190 | Unsupported(ref e) => write!(f, "Unsupported: {}", e), 191 | ReportableBug(ref e) => write!( 192 | f, 193 | "Unexpected bug has happened: {}. \ 194 | PLEASE REPORT THIS BUG!", 195 | e 196 | ), 197 | #[cfg(feature = "failpoints")] 198 | FailPoint => write!(f, "Fail point has been triggered."), 199 | Io(ref e) => write!(f, "IO error: {}", e), 200 | Corruption { at, ref bt } => write!( 201 | f, 202 | "Read corrupted data at file offset {:?} backtrace {:?}", 203 | at, bt 204 | ), 205 | } 206 | } 207 | } 208 | -------------------------------------------------------------------------------- /experiments/epoch/src/main.rs: -------------------------------------------------------------------------------- 1 | /// A simple implementation of epoch-based reclamation. 2 | /// 3 | /// Using the `pin` method, a thread checks into an epoch 4 | /// before operating on a shared resource. If that thread 5 | /// makes a shared resource inaccessible, it can defer its 6 | /// destruction until all threads that may have already 7 | /// checked in have moved on. 8 | use std::{ 9 | cell::RefCell, 10 | sync::{ 11 | atomic::{AtomicPtr, AtomicUsize, Ordering::SeqCst}, 12 | Arc, 13 | }, 14 | }; 15 | 16 | const EPOCH_SZ: usize = 16; 17 | 18 | #[derive(Default)] 19 | struct Epoch { 20 | garbage: [AtomicPtr>; EPOCH_SZ], 21 | offset: AtomicUsize, 22 | next: AtomicPtr, 23 | id: u64, 24 | } 25 | 26 | impl Drop for Epoch { 27 | fn drop(&mut self) { 28 | let count = std::cmp::min(EPOCH_SZ, self.offset.load(SeqCst)); 29 | for offset in 0..count { 30 | let mut garbage_ptr: *mut Box = 31 | self.garbage[offset].load(SeqCst); 32 | while garbage_ptr.is_null() { 33 | // maybe this is impossible, but this is to 34 | // be defensive against race conditions. 
35 | garbage_ptr = self.garbage[offset].load(SeqCst); 36 | } 37 | 38 | let garbage: Box> = 39 | unsafe { Box::from_raw(garbage_ptr) }; 40 | 41 | drop(garbage); 42 | } 43 | 44 | let next = self.next.swap(std::ptr::null_mut(), SeqCst); 45 | if !next.is_null() { 46 | let arc = unsafe { Arc::from_raw(next) }; 47 | drop(arc); 48 | } 49 | } 50 | } 51 | 52 | struct Collector { 53 | head: AtomicPtr, 54 | } 55 | 56 | unsafe impl Send for Collector {} 57 | unsafe impl Sync for Collector {} 58 | 59 | impl Default for Collector { 60 | fn default() -> Collector { 61 | let ptr = Arc::into_raw(Arc::new(Epoch::default())) as *mut Epoch; 62 | Collector { head: AtomicPtr::new(ptr) } 63 | } 64 | } 65 | 66 | impl Collector { 67 | fn pin(&self) -> Guard { 68 | let head_ptr = self.head.load(SeqCst); 69 | assert!(!head_ptr.is_null()); 70 | let mut head = unsafe { Arc::from_raw(head_ptr) }; 71 | let mut next = head.next.load(SeqCst); 72 | let mut last_head = head_ptr; 73 | 74 | // forward head to current tip 75 | while !next.is_null() { 76 | std::mem::forget(head); 77 | 78 | let res = self.head.compare_and_swap(last_head, next, SeqCst); 79 | if res == last_head { 80 | head = unsafe { Arc::from_raw(next) }; 81 | last_head = next; 82 | } else { 83 | head = unsafe { Arc::from_raw(res) }; 84 | last_head = res; 85 | } 86 | 87 | next = head.next.load(SeqCst); 88 | } 89 | 90 | let (a1, a2) = (head.clone(), head.clone()); 91 | std::mem::forget(head); 92 | 93 | Guard { 94 | _entry_epoch: a1, 95 | current_epoch: a2, 96 | trash_sack: RefCell::new(vec![]), 97 | } 98 | } 99 | } 100 | 101 | impl Drop for Collector { 102 | fn drop(&mut self) { 103 | let head_ptr = self.head.load(SeqCst); 104 | assert!(!head_ptr.is_null()); 105 | unsafe { 106 | let head = Arc::from_raw(head_ptr); 107 | drop(head); 108 | } 109 | } 110 | } 111 | 112 | pub(crate) struct Guard { 113 | _entry_epoch: Arc, 114 | current_epoch: Arc, 115 | trash_sack: RefCell>>, 116 | } 117 | 118 | impl Guard { 119 | pub fn defer(&self, f: F) 120 | where 121 | F: FnOnce() + Send + 'static, 122 | { 123 | let garbage_ptr = 124 | Box::into_raw(Box::new(Box::new(f) as Box)); 125 | let mut trash_sack = self.trash_sack.borrow_mut(); 126 | trash_sack.push(garbage_ptr); 127 | } 128 | } 129 | 130 | impl Drop for Guard { 131 | fn drop(&mut self) { 132 | let trash_sack = self.trash_sack.replace(vec![]); 133 | 134 | for garbage_ptr in trash_sack.into_iter() { 135 | // try to reserve 136 | let mut offset = self.current_epoch.offset.fetch_add(1, SeqCst); 137 | while offset >= EPOCH_SZ { 138 | let next = self.current_epoch.next.load(SeqCst); 139 | if !next.is_null() { 140 | unsafe { 141 | let raced_arc = Arc::from_raw(next); 142 | self.current_epoch = raced_arc.clone(); 143 | std::mem::forget(raced_arc); 144 | } 145 | offset = self.current_epoch.offset.fetch_add(1, SeqCst); 146 | continue; 147 | } 148 | 149 | // push epoch forward if we're full 150 | let mut next_epoch = Epoch::default(); 151 | next_epoch.id = self.current_epoch.id + 1; 152 | 153 | let next_epoch_arc = Arc::new(next_epoch); 154 | let next_ptr = 155 | Arc::into_raw(next_epoch_arc.clone()) as *mut Epoch; 156 | let old = self.current_epoch.next.compare_and_swap( 157 | std::ptr::null_mut(), 158 | next_ptr, 159 | SeqCst, 160 | ); 161 | if old != std::ptr::null_mut() { 162 | // somebody else already installed a new segment 163 | unsafe { 164 | let unneeded = Arc::from_raw(next_ptr); 165 | drop(unneeded); 166 | 167 | let raced_arc = Arc::from_raw(old); 168 | self.current_epoch = raced_arc.clone(); 169 | 
std::mem::forget(raced_arc); 170 | } 171 | offset = self.current_epoch.offset.fetch_add(1, SeqCst); 172 | continue; 173 | } 174 | 175 | self.current_epoch = next_epoch_arc; 176 | offset = self.current_epoch.offset.fetch_add(1, SeqCst); 177 | } 178 | 179 | let old = 180 | self.current_epoch.garbage[offset].swap(garbage_ptr, SeqCst); 181 | assert!(old.is_null()); 182 | } 183 | } 184 | } 185 | 186 | #[derive(Debug)] 187 | struct S(usize); 188 | 189 | fn main() { 190 | let collector = Arc::new(Collector::default()); 191 | 192 | let mut threads = vec![]; 193 | 194 | for t in 0..100 { 195 | use std::thread::spawn; 196 | 197 | let collector = collector.clone(); 198 | let thread = spawn(move || { 199 | for _ in 0..1000000 { 200 | let guard = collector.pin(); 201 | guard.defer(move || { 202 | S(t as usize); 203 | }); 204 | 205 | let guard = crossbeam_epoch::pin(); 206 | guard.defer(move || { 207 | S(t as usize); 208 | }); 209 | } 210 | }); 211 | 212 | threads.push(thread); 213 | } 214 | 215 | for thread in threads.into_iter() { 216 | thread.join().unwrap(); 217 | } 218 | } 219 | -------------------------------------------------------------------------------- /src/event_log.rs: -------------------------------------------------------------------------------- 1 | //! The `EventLog` lets us cheaply record and query behavior 2 | //! in a concurrent system. It lets us reconstruct stories about 3 | //! what happened to our data. It lets us write tests like: 4 | //! 1. no keys are lost through tree structural modifications 5 | //! 2. no nodes are made inaccessible through structural modifications 6 | //! 3. no segments are zeroed and reused before all resident 7 | //! pages have been relocated and stabilized. 8 | //! 4. recovery does not skip active segments 9 | //! 5. no page is double-allocated or double-freed 10 | //! 6. pages before restart match pages after restart 11 | //! 12 | //! What does it mean for data to be accessible? 13 | //! 1. key -> page 14 | //! 2. page -> lid 15 | //! 3. lid ranges get stabiized over time 16 | //! 4. lid ranges get zeroed over time 17 | //! 5. segment trailers get written over time 18 | //! 6. if a page's old location is zeroed before 19 | //! `io_bufs` segment trailers have been written, 20 | //! we are vulnerable to data loss 21 | //! 3. segments have lifespans from fsync to zero 22 | //! 4. 23 | #![allow(missing_docs)] 24 | 25 | use crate::pagecache::DiskPtr; 26 | use crate::*; 27 | 28 | use crate::stack::{Iter as StackIter, Stack}; 29 | 30 | /// A thing that happens at a certain time. 31 | #[derive(Debug, Clone)] 32 | enum Event { 33 | PagesOnShutdown { pages: Map> }, 34 | PagesOnRecovery { pages: Map> }, 35 | MetaOnShutdown { meta: Meta }, 36 | MetaOnRecovery { meta: Meta }, 37 | RecoveredLsn(Lsn), 38 | Stabilized(Lsn), 39 | } 40 | 41 | /// A lock-free queue of Events. 
42 | #[derive(Default, Debug)] 43 | pub struct EventLog { 44 | inner: Stack, 45 | } 46 | 47 | impl EventLog { 48 | pub(crate) fn reset(&self) { 49 | self.verify(); 50 | let guard = pin(); 51 | while self.inner.pop(&guard).is_some() {} 52 | } 53 | 54 | fn iter<'a>(&self, guard: &'a Guard) -> StackIter<'a, Event> { 55 | let head = self.inner.head(guard); 56 | StackIter::from_ptr(head, guard) 57 | } 58 | 59 | pub(crate) fn verify(&self) { 60 | let guard = pin(); 61 | let iter = self.iter(&guard); 62 | 63 | // if we encounter a `PagesOnRecovery`, then we should 64 | // compare it to any subsequent `PagesOnShutdown` 65 | 66 | let mut recovered_pages = None; 67 | let mut recovered_meta = None; 68 | let mut minimum_lsn = None; 69 | 70 | for event in iter { 71 | match event { 72 | Event::Stabilized(lsn) | Event::RecoveredLsn(lsn) => { 73 | if let Some(later_lsn) = minimum_lsn { 74 | assert!( 75 | later_lsn >= lsn, 76 | "lsn must never go down between recoveries \ 77 | or stabilizations. It was {} but later became {}. history: {:?}", 78 | lsn, 79 | later_lsn, 80 | self.iter(&guard) 81 | .filter(|e| matches!(e, Event::Stabilized(_)) 82 | || matches!(e, Event::RecoveredLsn(_))) 83 | .collect::>(), 84 | ); 85 | } 86 | minimum_lsn = Some(lsn); 87 | } 88 | Event::PagesOnRecovery { pages } => { 89 | recovered_pages = Some(pages.clone()); 90 | } 91 | Event::PagesOnShutdown { pages } => { 92 | if let Some(ref par) = recovered_pages { 93 | let pids = par 94 | .iter() 95 | .map(|(pid, _frag_locations)| *pid) 96 | .chain( 97 | pages.iter().map(|(pid, _frag_locations)| *pid), 98 | ) 99 | .collect::>() 100 | .into_iter(); 101 | 102 | for pid in pids { 103 | // we filter out the blob pointer in the log 104 | // because it is expected that upon recovery, 105 | // any blob pointers will be forgotten from 106 | // the log now that they are present in the 107 | // snapshot. 
108 | let locations_before_restart: Vec<_> = pages 109 | .get(&pid) 110 | .unwrap() 111 | .iter() 112 | .map(|ptr| { 113 | let mut ptr = *ptr; 114 | ptr.forget_heap_log_coordinates(); 115 | ptr 116 | }) 117 | .collect(); 118 | let locations_after_restart: Vec<_> = par 119 | .get(&pid) 120 | .unwrap() 121 | .iter() 122 | .copied() 123 | .collect(); 124 | assert_eq!( 125 | locations_before_restart, 126 | locations_after_restart, 127 | "page {} had frag locations {:?} before \ 128 | restart, but {:?} after restart", 129 | pid, 130 | locations_before_restart, 131 | locations_after_restart 132 | ); 133 | } 134 | } 135 | } 136 | Event::MetaOnRecovery { meta } => { 137 | recovered_meta = Some(meta); 138 | } 139 | Event::MetaOnShutdown { meta } => { 140 | if let Some(rec_meta) = recovered_meta { 141 | assert_eq!(meta, rec_meta); 142 | } 143 | } 144 | } 145 | } 146 | 147 | debug!("event log verified \u{2713}"); 148 | } 149 | 150 | pub(crate) fn stabilized_lsn(&self, lsn: Lsn) { 151 | let guard = pin(); 152 | self.inner.push(Event::Stabilized(lsn), &guard); 153 | } 154 | 155 | pub(crate) fn recovered_lsn(&self, lsn: Lsn) { 156 | let guard = pin(); 157 | self.inner.push(Event::RecoveredLsn(lsn), &guard); 158 | } 159 | 160 | pub(crate) fn pages_before_restart( 161 | &self, 162 | pages: Map>, 163 | ) { 164 | let guard = pin(); 165 | self.inner.push(Event::PagesOnShutdown { pages }, &guard); 166 | } 167 | 168 | pub(crate) fn pages_after_restart(&self, pages: Map>) { 169 | let guard = pin(); 170 | self.inner.push(Event::PagesOnRecovery { pages }, &guard); 171 | } 172 | 173 | pub fn meta_before_restart(&self, meta: Meta) { 174 | let guard = pin(); 175 | self.inner.push(Event::MetaOnShutdown { meta }, &guard); 176 | } 177 | 178 | pub fn meta_after_restart(&self, meta: Meta) { 179 | let guard = pin(); 180 | self.inner.push(Event::MetaOnRecovery { meta }, &guard); 181 | } 182 | } 183 | -------------------------------------------------------------------------------- /bindings/sled-native/src/lib.rs: -------------------------------------------------------------------------------- 1 | use sled; 2 | 3 | use std::ffi::CString; 4 | use std::mem; 5 | use std::ptr; 6 | use std::slice; 7 | 8 | use libc::*; 9 | 10 | use sled::{Config, Db, IVec, Iter}; 11 | 12 | fn leak_buf(v: Vec, vallen: *mut size_t) -> *mut c_char { 13 | unsafe { 14 | *vallen = v.len(); 15 | } 16 | let mut bsv = v.into_boxed_slice(); 17 | let val = bsv.as_mut_ptr() as *mut _; 18 | mem::forget(bsv); 19 | val 20 | } 21 | 22 | /// Create a new configuration. 23 | #[no_mangle] 24 | pub unsafe extern "C" fn sled_create_config() -> *mut Config { 25 | Box::into_raw(Box::new(Config::new())) 26 | } 27 | 28 | /// Destroy a configuration. 29 | #[no_mangle] 30 | pub unsafe extern "C" fn sled_free_config(config: *mut Config) { 31 | drop(Box::from_raw(config)); 32 | } 33 | 34 | /// Set the configured file path. The caller is responsible for freeing the path 35 | /// string after calling this (it is copied in this function). 36 | #[no_mangle] 37 | pub unsafe extern "C" fn sled_config_set_path( 38 | config: *mut Config, 39 | path: *const c_char, 40 | ) -> *mut Config { 41 | let c_str = CString::from_raw(path as *mut _); 42 | let value = c_str.into_string().unwrap(); 43 | 44 | let config = Box::from_raw(config); 45 | Box::into_raw(Box::from(config.path(value))) 46 | } 47 | 48 | /// Set the configured cache capacity in bytes. 
49 | #[no_mangle] 50 | pub unsafe extern "C" fn sled_config_set_cache_capacity( 51 | config: *mut Config, 52 | capacity: size_t, 53 | ) -> *mut Config { 54 | let config = Box::from_raw(config); 55 | Box::into_raw(Box::from(config.cache_capacity(capacity as u64))) 56 | } 57 | 58 | /// Configure the use of the zstd compression library. 59 | #[no_mangle] 60 | pub unsafe extern "C" fn sled_config_use_compression( 61 | config: *mut Config, 62 | use_compression: c_uchar, 63 | ) -> *mut Config { 64 | let config = Box::from_raw(config); 65 | Box::into_raw(Box::from(config.use_compression(use_compression == 1))) 66 | } 67 | 68 | /// Set the configured IO buffer flush interval in milliseconds. 69 | #[no_mangle] 70 | pub unsafe extern "C" fn sled_config_flush_every_ms( 71 | config: *mut Config, 72 | flush_every: c_int, 73 | ) -> *mut Config { 74 | let val = if flush_every < 0 { None } else { Some(flush_every as u64) }; 75 | let config = Box::from_raw(config); 76 | Box::into_raw(Box::from(config.flush_every_ms(val))) 77 | } 78 | 79 | /// Open a sled lock-free log-structured tree. Consumes the passed-in config. 80 | #[no_mangle] 81 | pub unsafe extern "C" fn sled_open_db(config: *mut Config) -> *mut Db { 82 | let config = Box::from_raw(config); 83 | Box::into_raw(Box::new(config.open().unwrap())) 84 | } 85 | 86 | /// Close a sled lock-free log-structured tree. 87 | #[no_mangle] 88 | pub unsafe extern "C" fn sled_close(db: *mut Db) { 89 | drop(Box::from_raw(db)); 90 | } 91 | 92 | /// Free a buffer originally allocated by sled. 93 | #[no_mangle] 94 | pub unsafe extern "C" fn sled_free_buf(buf: *mut c_char, sz: size_t) { 95 | drop(Vec::from_raw_parts(buf, sz, sz)); 96 | } 97 | 98 | /// Free an iterator. 99 | #[no_mangle] 100 | pub unsafe extern "C" fn sled_free_iter(iter: *mut Iter) { 101 | drop(Box::from_raw(iter)); 102 | } 103 | 104 | /// Set a key to a value. 105 | #[no_mangle] 106 | pub unsafe extern "C" fn sled_set( 107 | db: *mut Db, 108 | key: *const c_uchar, 109 | keylen: size_t, 110 | val: *const c_uchar, 111 | vallen: size_t, 112 | ) { 113 | let k = IVec::from(slice::from_raw_parts(key, keylen)); 114 | let v = IVec::from(slice::from_raw_parts(val, vallen)); 115 | (*db).insert(k, v).unwrap(); 116 | } 117 | 118 | /// Get the value of a key. 119 | /// Caller is responsible for freeing the returned value with `sled_free_buf` if 120 | /// it's non-null. 121 | #[no_mangle] 122 | pub unsafe extern "C" fn sled_get( 123 | db: *mut Db, 124 | key: *const c_char, 125 | keylen: size_t, 126 | vallen: *mut size_t, 127 | ) -> *mut c_char { 128 | let k = slice::from_raw_parts(key as *const u8, keylen); 129 | let res = (*db).get(k); 130 | match res { 131 | Ok(Some(v)) => leak_buf(v.to_vec(), vallen), 132 | Ok(None) => ptr::null_mut(), 133 | // TODO proper error propagation 134 | Err(e) => panic!("{:?}", e), 135 | } 136 | } 137 | 138 | /// Delete the value of a key. 139 | #[no_mangle] 140 | pub unsafe extern "C" fn sled_del( 141 | db: *mut Db, 142 | key: *const c_char, 143 | keylen: size_t, 144 | ) { 145 | let k = slice::from_raw_parts(key as *const u8, keylen); 146 | (*db).remove(k).unwrap(); 147 | } 148 | 149 | /// Compare and swap. 150 | /// Returns 1 if successful, 0 if unsuccessful. 151 | /// Otherwise sets `actual_val` and `actual_vallen` to the current value, 152 | /// which must be freed using `sled_free_buf` by the caller if non-null. 153 | /// `actual_val` will be null and `actual_vallen` 0 if the current value is not 154 | /// set. 
155 | #[no_mangle] 156 | pub unsafe extern "C" fn sled_compare_and_swap( 157 | db: *mut Db, 158 | key: *const c_char, 159 | keylen: size_t, 160 | old_val: *const c_uchar, 161 | old_vallen: size_t, 162 | new_val: *const c_uchar, 163 | new_vallen: size_t, 164 | actual_val: *mut *const c_uchar, 165 | actual_vallen: *mut size_t, 166 | ) -> c_uchar { 167 | let k = IVec::from(slice::from_raw_parts(key as *const u8, keylen)); 168 | 169 | let old = if old_vallen == 0 { 170 | None 171 | } else { 172 | let copy = 173 | IVec::from(slice::from_raw_parts(old_val as *const u8, old_vallen)); 174 | Some(copy) 175 | }; 176 | 177 | let new = if new_vallen == 0 { 178 | None 179 | } else { 180 | let copy = 181 | IVec::from(slice::from_raw_parts(new_val as *const u8, new_vallen)); 182 | Some(copy) 183 | }; 184 | 185 | let res = (*db).compare_and_swap(k, old, new); 186 | 187 | match res { 188 | Ok(Ok(())) => 1, 189 | Ok(Err(sled::CompareAndSwapError { current: None, .. })) => { 190 | *actual_vallen = 0; 191 | 0 192 | } 193 | Ok(Err(sled::CompareAndSwapError { current: Some(v), .. })) => { 194 | *actual_val = leak_buf(v.to_vec(), actual_vallen) as *const u8; 195 | 0 196 | } 197 | // TODO proper error propagation 198 | Err(e) => panic!("{:?}", e), 199 | } 200 | } 201 | 202 | /// Iterate over tuples which have specified key prefix. 203 | /// Caller is responsible for freeing the returned iterator with 204 | /// `sled_free_iter`. 205 | #[no_mangle] 206 | pub unsafe extern "C" fn sled_scan_prefix( 207 | db: *mut Db, 208 | key: *const c_char, 209 | keylen: size_t, 210 | ) -> *mut Iter { 211 | let k = slice::from_raw_parts(key as *const u8, keylen); 212 | Box::into_raw(Box::new((*db).scan_prefix(k))) 213 | } 214 | 215 | /// Get they next kv pair from an iterator. 216 | /// Caller is responsible for freeing the key and value with `sled_free_buf`. 217 | /// Returns 0 when exhausted. 218 | #[no_mangle] 219 | pub unsafe extern "C" fn sled_iter_next( 220 | iter: *mut Iter, 221 | key: *mut *const c_char, 222 | keylen: *mut size_t, 223 | val: *mut *const c_char, 224 | vallen: *mut size_t, 225 | ) -> c_uchar { 226 | match (*iter).next() { 227 | Some(Ok((k, v))) => { 228 | *key = leak_buf(k.to_vec(), keylen); 229 | *val = leak_buf(v.to_vec(), vallen); 230 | 1 231 | } 232 | // TODO proper error propagation 233 | Some(Err(e)) => panic!("{:?}", e), 234 | None => 0, 235 | } 236 | } 237 | -------------------------------------------------------------------------------- /src/atomic_shim.rs: -------------------------------------------------------------------------------- 1 | ///! 
Inline of `https://github.com/bltavares/atomic-shim` 2 | 3 | #[cfg(not(any( 4 | target_arch = "mips", 5 | target_arch = "powerpc", 6 | feature = "mutex" 7 | )))] 8 | pub use std::sync::atomic::{AtomicI64, AtomicU64}; 9 | #[cfg(any(target_arch = "mips", target_arch = "powerpc", feature = "mutex"))] 10 | mod shim { 11 | use parking_lot::{const_rwlock, RwLock}; 12 | use std::sync::atomic::Ordering; 13 | 14 | #[derive(Debug, Default)] 15 | pub struct AtomicU64 { 16 | value: RwLock, 17 | } 18 | 19 | impl AtomicU64 { 20 | pub const fn new(v: u64) -> Self { 21 | Self { value: const_rwlock(v) } 22 | } 23 | 24 | #[allow(dead_code)] 25 | pub fn load(&self, _: Ordering) -> u64 { 26 | *self.value.read() 27 | } 28 | 29 | #[allow(dead_code)] 30 | pub fn store(&self, value: u64, _: Ordering) { 31 | let mut lock = self.value.write(); 32 | *lock = value; 33 | } 34 | 35 | #[allow(dead_code)] 36 | pub fn swap(&self, value: u64, _: Ordering) -> u64 { 37 | let mut lock = self.value.write(); 38 | let prev = *lock; 39 | *lock = value; 40 | prev 41 | } 42 | 43 | #[allow(dead_code)] 44 | pub fn compare_exchange( 45 | &self, 46 | current: u64, 47 | new: u64, 48 | _: Ordering, 49 | _: Ordering, 50 | ) -> Result { 51 | let mut lock = self.value.write(); 52 | let prev = *lock; 53 | if prev == current { 54 | *lock = new; 55 | Ok(current) 56 | } else { 57 | Err(prev) 58 | } 59 | } 60 | 61 | #[allow(dead_code)] 62 | pub fn compare_exchange_weak( 63 | &self, 64 | current: u64, 65 | new: u64, 66 | success: Ordering, 67 | failure: Ordering, 68 | ) -> Result { 69 | self.compare_exchange(current, new, success, failure) 70 | } 71 | 72 | #[allow(dead_code)] 73 | pub fn fetch_add(&self, val: u64, _: Ordering) -> u64 { 74 | let mut lock = self.value.write(); 75 | let prev = *lock; 76 | *lock = prev.wrapping_add(val); 77 | prev 78 | } 79 | 80 | #[allow(dead_code)] 81 | pub fn fetch_sub(&self, val: u64, _: Ordering) -> u64 { 82 | let mut lock = self.value.write(); 83 | let prev = *lock; 84 | *lock = prev.wrapping_sub(val); 85 | prev 86 | } 87 | 88 | #[allow(dead_code)] 89 | pub fn fetch_and(&self, val: u64, _: Ordering) -> u64 { 90 | let mut lock = self.value.write(); 91 | let prev = *lock; 92 | *lock = prev & val; 93 | prev 94 | } 95 | 96 | #[allow(dead_code)] 97 | pub fn fetch_nand(&self, val: u64, _: Ordering) -> u64 { 98 | let mut lock = self.value.write(); 99 | let prev = *lock; 100 | *lock = !(prev & val); 101 | prev 102 | } 103 | 104 | #[allow(dead_code)] 105 | pub fn fetch_or(&self, val: u64, _: Ordering) -> u64 { 106 | let mut lock = self.value.write(); 107 | let prev = *lock; 108 | *lock = prev | val; 109 | prev 110 | } 111 | 112 | #[allow(dead_code)] 113 | pub fn fetch_xor(&self, val: u64, _: Ordering) -> u64 { 114 | let mut lock = self.value.write(); 115 | let prev = *lock; 116 | *lock = prev ^ val; 117 | prev 118 | } 119 | } 120 | 121 | impl From for AtomicU64 { 122 | fn from(value: u64) -> Self { 123 | AtomicU64::new(value) 124 | } 125 | } 126 | 127 | #[derive(Debug, Default)] 128 | pub struct AtomicI64 { 129 | value: RwLock, 130 | } 131 | 132 | impl AtomicI64 { 133 | pub fn new(v: i64) -> Self { 134 | Self { value: const_rwlock(v) } 135 | } 136 | 137 | #[allow(dead_code)] 138 | pub fn load(&self, _: Ordering) -> i64 { 139 | *self.value.read() 140 | } 141 | 142 | #[allow(dead_code)] 143 | pub fn store(&self, value: i64, _: Ordering) { 144 | let mut lock = self.value.write(); 145 | *lock = value; 146 | } 147 | 148 | #[allow(dead_code)] 149 | pub fn swap(&self, value: i64, _: Ordering) -> i64 { 150 | let mut lock = 
self.value.write(); 151 | let prev = *lock; 152 | *lock = value; 153 | prev 154 | } 155 | 156 | #[allow(dead_code)] 157 | pub fn compare_exchange( 158 | &self, 159 | current: i64, 160 | new: i64, 161 | _: Ordering, 162 | _: Ordering, 163 | ) -> Result { 164 | let mut lock = self.value.write(); 165 | let prev = *lock; 166 | if prev == current { 167 | *lock = new; 168 | Ok(current) 169 | } else { 170 | Err(prev) 171 | } 172 | } 173 | 174 | #[allow(dead_code)] 175 | pub fn compare_exchange_weak( 176 | &self, 177 | current: i64, 178 | new: i64, 179 | success: Ordering, 180 | failure: Ordering, 181 | ) -> Result { 182 | self.compare_exchange(current, new, success, failure) 183 | } 184 | 185 | #[allow(dead_code)] 186 | pub fn fetch_add(&self, val: i64, _: Ordering) -> i64 { 187 | let mut lock = self.value.write(); 188 | let prev = *lock; 189 | *lock = prev.wrapping_add(val); 190 | prev 191 | } 192 | 193 | #[allow(dead_code)] 194 | pub fn fetch_sub(&self, val: i64, _: Ordering) -> i64 { 195 | let mut lock = self.value.write(); 196 | let prev = *lock; 197 | *lock = prev.wrapping_sub(val); 198 | prev 199 | } 200 | 201 | #[allow(dead_code)] 202 | pub fn fetch_and(&self, val: i64, _: Ordering) -> i64 { 203 | let mut lock = self.value.write(); 204 | let prev = *lock; 205 | *lock = prev & val; 206 | prev 207 | } 208 | 209 | #[allow(dead_code)] 210 | pub fn fetch_nand(&self, val: i64, _: Ordering) -> i64 { 211 | let mut lock = self.value.write(); 212 | let prev = *lock; 213 | *lock = !(prev & val); 214 | prev 215 | } 216 | 217 | #[allow(dead_code)] 218 | pub fn fetch_or(&self, val: i64, _: Ordering) -> i64 { 219 | let mut lock = self.value.write(); 220 | let prev = *lock; 221 | *lock = prev | val; 222 | prev 223 | } 224 | 225 | #[allow(dead_code)] 226 | pub fn fetch_xor(&self, val: i64, _: Ordering) -> i64 { 227 | let mut lock = self.value.write(); 228 | let prev = *lock; 229 | *lock = prev ^ val; 230 | prev 231 | } 232 | } 233 | 234 | impl From for AtomicI64 { 235 | fn from(value: i64) -> Self { 236 | AtomicI64::new(value) 237 | } 238 | } 239 | } 240 | 241 | #[cfg(any( 242 | target_arch = "mips", 243 | target_arch = "powerpc", 244 | feature = "mutex" 245 | ))] 246 | pub use shim::{AtomicI64, AtomicU64}; 247 | -------------------------------------------------------------------------------- /src/stack.rs: -------------------------------------------------------------------------------- 1 | #![allow(unsafe_code)] 2 | 3 | use std::{ 4 | fmt::{self, Debug}, 5 | ops::Deref, 6 | sync::atomic::Ordering::{Acquire, Release}, 7 | }; 8 | 9 | use crossbeam_epoch::{unprotected, Atomic, Guard, Owned, Shared}; 10 | 11 | use crate::debug_delay; 12 | 13 | /// A node in the lock-free `Stack`. 14 | #[derive(Debug)] 15 | pub struct Node { 16 | pub(crate) inner: T, 17 | pub(crate) next: Atomic>, 18 | } 19 | 20 | impl Drop for Node { 21 | fn drop(&mut self) { 22 | unsafe { 23 | let mut cursor = self.next.load(Acquire, unprotected()); 24 | 25 | while !cursor.is_null() { 26 | // we carefully unset the next pointer here to avoid 27 | // a stack overflow when freeing long lists. 28 | let node = cursor.into_owned(); 29 | cursor = node.next.swap(Shared::null(), Acquire, unprotected()); 30 | drop(node); 31 | } 32 | } 33 | } 34 | } 35 | 36 | /// A simple lock-free stack, with the ability to atomically 37 | /// append or entirely swap-out entries. 
38 | pub struct Stack { 39 | head: Atomic>, 40 | } 41 | 42 | impl Default for Stack { 43 | fn default() -> Self { 44 | Self { head: Atomic::null() } 45 | } 46 | } 47 | 48 | impl Drop for Stack { 49 | fn drop(&mut self) { 50 | unsafe { 51 | let curr = self.head.load(Acquire, unprotected()); 52 | if !curr.as_raw().is_null() { 53 | drop(curr.into_owned()); 54 | } 55 | } 56 | } 57 | } 58 | 59 | impl Debug for Stack 60 | where 61 | T: Clone + Debug + Send + 'static + Sync, 62 | { 63 | fn fmt( 64 | &self, 65 | formatter: &mut fmt::Formatter<'_>, 66 | ) -> Result<(), fmt::Error> { 67 | let guard = crossbeam_epoch::pin(); 68 | let head = self.head(&guard); 69 | let iter = Iter::from_ptr(head, &guard); 70 | 71 | formatter.write_str("Stack [")?; 72 | let mut written = false; 73 | for node in iter { 74 | if written { 75 | formatter.write_str(", ")?; 76 | } 77 | formatter.write_str(&*format!("({:?}) ", &node))?; 78 | node.fmt(formatter)?; 79 | written = true; 80 | } 81 | formatter.write_str("]")?; 82 | Ok(()) 83 | } 84 | } 85 | 86 | impl Deref for Node { 87 | type Target = T; 88 | fn deref(&self) -> &T { 89 | &self.inner 90 | } 91 | } 92 | 93 | impl Stack { 94 | /// Add an item to the stack, spinning until successful. 95 | pub(crate) fn push(&self, inner: T, guard: &Guard) { 96 | debug_delay(); 97 | let node = Owned::new(Node { inner, next: Atomic::null() }); 98 | 99 | unsafe { 100 | let node = node.into_shared(guard); 101 | 102 | loop { 103 | let head = self.head(guard); 104 | node.deref().next.store(head, Release); 105 | if self.head.compare_and_set(head, node, Release, guard).is_ok() 106 | { 107 | return; 108 | } 109 | } 110 | } 111 | } 112 | 113 | /// Clears the stack and returns all items 114 | pub(crate) fn take_iter<'a>( 115 | &self, 116 | guard: &'a Guard, 117 | ) -> impl Iterator { 118 | debug_delay(); 119 | let node = self.head.swap(Shared::null(), Release, guard); 120 | 121 | let iter = Iter { inner: node, guard }; 122 | 123 | if !node.is_null() { 124 | unsafe { 125 | guard.defer_destroy(node); 126 | } 127 | } 128 | 129 | iter 130 | } 131 | 132 | /// Pop the next item off the stack. Returns None if nothing is there. 133 | pub(crate) fn pop(&self, guard: &Guard) -> Option { 134 | use std::ptr; 135 | use std::sync::atomic::Ordering::SeqCst; 136 | debug_delay(); 137 | let mut head = self.head(guard); 138 | loop { 139 | match unsafe { head.as_ref() } { 140 | Some(h) => { 141 | let next = h.next.load(Acquire, guard); 142 | match self.head.compare_and_set(head, next, Release, guard) 143 | { 144 | Ok(_) => unsafe { 145 | // we unset the next pointer before destruction 146 | // to avoid double-frees. 147 | h.next.store(Shared::default(), SeqCst); 148 | guard.defer_destroy(head); 149 | return Some(ptr::read(&h.inner)); 150 | }, 151 | Err(h) => head = h.current, 152 | } 153 | } 154 | None => return None, 155 | } 156 | } 157 | } 158 | 159 | /// Returns the current head pointer of the stack, which can 160 | /// later be used as the key for cas and cap operations. 161 | pub(crate) fn head<'g>(&self, guard: &'g Guard) -> Shared<'g, Node> { 162 | self.head.load(Acquire, guard) 163 | } 164 | } 165 | 166 | /// An iterator over nodes in a lock-free stack. 167 | pub struct Iter<'a, T> 168 | where 169 | T: Send + 'static + Sync, 170 | { 171 | inner: Shared<'a, Node>, 172 | guard: &'a Guard, 173 | } 174 | 175 | impl<'a, T> Iter<'a, T> 176 | where 177 | T: 'a + Send + 'static + Sync, 178 | { 179 | /// Creates a `Iter` from a pointer to one. 
180 | pub(crate) fn from_ptr<'b>( 181 | ptr: Shared<'b, Node>, 182 | guard: &'b Guard, 183 | ) -> Iter<'b, T> { 184 | Iter { inner: ptr, guard } 185 | } 186 | } 187 | 188 | impl<'a, T> Iterator for Iter<'a, T> 189 | where 190 | T: Send + 'static + Sync, 191 | { 192 | type Item = &'a T; 193 | 194 | fn next(&mut self) -> Option { 195 | debug_delay(); 196 | if self.inner.is_null() { 197 | None 198 | } else { 199 | unsafe { 200 | let ret = &self.inner.deref().inner; 201 | self.inner = self.inner.deref().next.load(Acquire, self.guard); 202 | Some(ret) 203 | } 204 | } 205 | } 206 | 207 | fn size_hint(&self) -> (usize, Option) { 208 | let mut size = 0; 209 | let mut cursor = self.inner; 210 | 211 | while !cursor.is_null() { 212 | unsafe { 213 | cursor = cursor.deref().next.load(Acquire, self.guard); 214 | } 215 | size += 1; 216 | } 217 | 218 | (size, Some(size)) 219 | } 220 | } 221 | 222 | #[test] 223 | #[cfg(not(miri))] // can't create threads 224 | fn basic_functionality() { 225 | use crossbeam_epoch::pin; 226 | use crossbeam_utils::CachePadded; 227 | use std::sync::Arc; 228 | use std::thread; 229 | 230 | let guard = pin(); 231 | let ll = Arc::new(Stack::default()); 232 | assert_eq!(ll.pop(&guard), None); 233 | ll.push(CachePadded::new(1), &guard); 234 | let ll2 = Arc::clone(&ll); 235 | let t = thread::spawn(move || { 236 | let guard = pin(); 237 | ll2.push(CachePadded::new(2), &guard); 238 | ll2.push(CachePadded::new(3), &guard); 239 | ll2.push(CachePadded::new(4), &guard); 240 | guard.flush(); 241 | }); 242 | t.join().unwrap(); 243 | ll.push(CachePadded::new(5), &guard); 244 | assert_eq!(ll.pop(&guard), Some(CachePadded::new(5))); 245 | assert_eq!(ll.pop(&guard), Some(CachePadded::new(4))); 246 | let ll3 = Arc::clone(&ll); 247 | let t = thread::spawn(move || { 248 | let guard = pin(); 249 | assert_eq!(ll3.pop(&guard), Some(CachePadded::new(3))); 250 | assert_eq!(ll3.pop(&guard), Some(CachePadded::new(2))); 251 | guard.flush(); 252 | }); 253 | t.join().unwrap(); 254 | assert_eq!(ll.pop(&guard), Some(CachePadded::new(1))); 255 | let ll4 = Arc::clone(&ll); 256 | let t = thread::spawn(move || { 257 | let guard = pin(); 258 | assert_eq!(ll4.pop(&guard), None); 259 | guard.flush(); 260 | }); 261 | t.join().unwrap(); 262 | drop(ll); 263 | guard.flush(); 264 | drop(guard); 265 | } 266 | -------------------------------------------------------------------------------- /src/arc.rs: -------------------------------------------------------------------------------- 1 | #![allow(unsafe_code)] 2 | 3 | /// We create our own `Arc` because we never use the weak 4 | /// count on the std `Arc`, but we use a LOT of `Arc`'s, so 5 | /// the extra 8 bytes turn into a huge overhead. 6 | use std::{ 7 | alloc::{alloc, dealloc, Layout}, 8 | convert::TryFrom, 9 | fmt::{self, Debug}, 10 | mem, 11 | ops::Deref, 12 | ptr, 13 | sync::atomic::{AtomicUsize, Ordering}, 14 | }; 15 | 16 | // we make this repr(C) because we do a raw 17 | // write to the beginning where we expect 18 | // the rc to be. 
19 | #[repr(C)] 20 | struct ArcInner { 21 | rc: AtomicUsize, 22 | inner: T, 23 | } 24 | 25 | pub struct Arc { 26 | ptr: *mut ArcInner, 27 | } 28 | 29 | unsafe impl Send for Arc {} 30 | 31 | unsafe impl Sync for Arc {} 32 | 33 | impl Debug for Arc { 34 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> Result<(), fmt::Error> { 35 | Debug::fmt(&**self, f) 36 | } 37 | } 38 | 39 | impl Arc { 40 | pub fn new(inner: T) -> Arc { 41 | let bx = Box::new(ArcInner { inner, rc: AtomicUsize::new(1) }); 42 | let ptr = Box::into_raw(bx); 43 | Arc { ptr } 44 | } 45 | 46 | // See std::sync::arc::Arc::copy_from_slice, 47 | // "Unsafe because the caller must either take ownership or bind `T: Copy`" 48 | unsafe fn copy_from_slice(s: &[T]) -> Arc<[T]> { 49 | let align = 50 | std::cmp::max(mem::align_of::(), mem::align_of::()); 51 | 52 | let rc_width = std::cmp::max(align, mem::size_of::()); 53 | let data_width = mem::size_of::().checked_mul(s.len()).unwrap(); 54 | 55 | let size_unpadded = rc_width.checked_add(data_width).unwrap(); 56 | // Pad size out to alignment 57 | let size_padded = (size_unpadded + align - 1) & !(align - 1); 58 | 59 | let layout = Layout::from_size_align(size_padded, align).unwrap(); 60 | 61 | let ptr = alloc(layout); 62 | 63 | assert!(!ptr.is_null(), "failed to allocate Arc"); 64 | #[allow(clippy::cast_ptr_alignment)] 65 | ptr::write(ptr as _, AtomicUsize::new(1)); 66 | 67 | let data_ptr = ptr.add(rc_width); 68 | ptr::copy_nonoverlapping(s.as_ptr(), data_ptr as _, s.len()); 69 | 70 | let fat_ptr: *const ArcInner<[T]> = Arc::fatten(ptr, s.len()); 71 | 72 | Arc { ptr: fat_ptr as *mut _ } 73 | } 74 | 75 | /// 76 | #[allow(trivial_casts)] 77 | fn fatten(data: *const u8, len: usize) -> *const ArcInner<[T]> { 78 | // Requirements of slice::from_raw_parts. 79 | assert!(!data.is_null()); 80 | assert!(isize::try_from(len).is_ok()); 81 | 82 | let slice = 83 | unsafe { core::slice::from_raw_parts(data as *const (), len) }; 84 | slice as *const [()] as *const _ 85 | } 86 | 87 | pub fn into_raw(arc: Arc) -> *const T { 88 | let ptr = unsafe { &(*arc.ptr).inner }; 89 | #[allow(clippy::mem_forget)] 90 | mem::forget(arc); 91 | ptr 92 | } 93 | 94 | pub unsafe fn from_raw(ptr: *const T) -> Arc { 95 | let align = 96 | std::cmp::max(mem::align_of::(), mem::align_of::()); 97 | 98 | let rc_width = std::cmp::max(align, mem::size_of::()); 99 | 100 | let sub_ptr = (ptr as *const u8).sub(rc_width) as *mut ArcInner; 101 | 102 | Arc { ptr: sub_ptr } 103 | } 104 | } 105 | 106 | impl Arc { 107 | pub fn strong_count(arc: &Arc) -> usize { 108 | unsafe { (*arc.ptr).rc.load(Ordering::Acquire) } 109 | } 110 | 111 | pub fn get_mut(arc: &mut Arc) -> Option<&mut T> { 112 | if Arc::strong_count(arc) == 1 { 113 | Some(unsafe { &mut arc.ptr.as_mut().unwrap().inner }) 114 | } else { 115 | None 116 | } 117 | } 118 | } 119 | 120 | impl Arc { 121 | pub fn make_mut(arc: &mut Arc) -> &mut T { 122 | if Arc::strong_count(arc) != 1 { 123 | *arc = Arc::new((**arc).clone()); 124 | assert_eq!(Arc::strong_count(arc), 1); 125 | } 126 | Arc::get_mut(arc).unwrap() 127 | } 128 | } 129 | 130 | impl Default for Arc { 131 | fn default() -> Arc { 132 | Arc::new(T::default()) 133 | } 134 | } 135 | 136 | impl Clone for Arc { 137 | fn clone(&self) -> Arc { 138 | // safe to use Relaxed ordering below because 139 | // of the required synchronization for passing 140 | // any objects to another thread. 
141 | let last_count = 142 | unsafe { (*self.ptr).rc.fetch_add(1, Ordering::Relaxed) }; 143 | 144 | if last_count == usize::max_value() { 145 | std::process::abort(); 146 | } 147 | 148 | Arc { ptr: self.ptr } 149 | } 150 | } 151 | 152 | impl Drop for Arc { 153 | fn drop(&mut self) { 154 | unsafe { 155 | let rc = (*self.ptr).rc.fetch_sub(1, Ordering::Release) - 1; 156 | if rc == 0 { 157 | std::sync::atomic::fence(Ordering::Acquire); 158 | Box::from_raw(self.ptr); 159 | } 160 | } 161 | } 162 | } 163 | 164 | impl From<&[T]> for Arc<[T]> { 165 | #[inline] 166 | fn from(s: &[T]) -> Arc<[T]> { 167 | unsafe { Arc::copy_from_slice(s) } 168 | } 169 | } 170 | 171 | #[allow(clippy::fallible_impl_from)] 172 | impl From> for Arc<[T]> { 173 | #[inline] 174 | fn from(b: Box<[T]>) -> Arc<[T]> { 175 | let len = b.len(); 176 | unsafe { 177 | let src = Box::into_raw(b); 178 | let value_layout = Layout::for_value(&*src); 179 | let align = std::cmp::max( 180 | value_layout.align(), 181 | mem::align_of::(), 182 | ); 183 | let rc_width = std::cmp::max(align, mem::size_of::()); 184 | let unpadded_size = 185 | rc_width.checked_add(value_layout.size()).unwrap(); 186 | // pad the total `Arc` allocation size to the alignment of 187 | // `max(value, AtomicUsize)` 188 | let size = (unpadded_size + align - 1) & !(align - 1); 189 | let dst_layout = Layout::from_size_align(size, align).unwrap(); 190 | let dst = alloc(dst_layout); 191 | assert!(!dst.is_null(), "failed to allocate Arc"); 192 | 193 | #[allow(clippy::cast_ptr_alignment)] 194 | ptr::write(dst as _, AtomicUsize::new(1)); 195 | let data_ptr = dst.add(rc_width); 196 | ptr::copy_nonoverlapping( 197 | src as *const u8, 198 | data_ptr, 199 | value_layout.size(), 200 | ); 201 | 202 | // free the old box memory without running Drop 203 | if value_layout.size() != 0 { 204 | dealloc(src as *mut u8, value_layout); 205 | } 206 | 207 | let fat_ptr: *const ArcInner<[T]> = Arc::fatten(dst, len); 208 | 209 | Arc { ptr: fat_ptr as *mut _ } 210 | } 211 | } 212 | } 213 | 214 | #[test] 215 | fn boxed_slice_to_arc_slice() { 216 | let box1: Box<[u8]> = Box::new([1, 2, 3]); 217 | let arc1: Arc<[u8]> = box1.into(); 218 | assert_eq!(&*arc1, &*vec![1, 2, 3]); 219 | let box2: Box<[u8]> = Box::new([]); 220 | let arc2: Arc<[u8]> = box2.into(); 221 | assert_eq!(&*arc2, &*vec![]); 222 | let box3: Box<[u64]> = Box::new([1, 2, 3]); 223 | let arc3: Arc<[u64]> = box3.into(); 224 | assert_eq!(&*arc3, &*vec![1, 2, 3]); 225 | } 226 | 227 | impl From> for Arc<[T]> { 228 | #[inline] 229 | fn from(mut v: Vec) -> Arc<[T]> { 230 | unsafe { 231 | let arc = Arc::copy_from_slice(&v); 232 | 233 | // Allow the Vec to free its memory, but not destroy its contents 234 | v.set_len(0); 235 | 236 | arc 237 | } 238 | } 239 | } 240 | 241 | impl Deref for Arc { 242 | type Target = T; 243 | 244 | fn deref(&self) -> &T { 245 | unsafe { &(*self.ptr).inner } 246 | } 247 | } 248 | 249 | impl std::borrow::Borrow for Arc { 250 | fn borrow(&self) -> &T { 251 | &**self 252 | } 253 | } 254 | 255 | impl AsRef for Arc { 256 | fn as_ref(&self) -> &T { 257 | &**self 258 | } 259 | } 260 | --------------------------------------------------------------------------------
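Since src/arc.rs closes the listing, a minimal usage sketch of this weak-count-free `Arc` follows, in the spirit of the `boxed_slice_to_arc_slice` test above. It assumes only the APIs defined in that file (`new`, `clone`, `strong_count`, `get_mut`, `make_mut`, `Deref`, and the `From` conversions); the test name is made up.

#[test]
fn arc_usage_sketch() {
    // hypothetical test; exercises only APIs defined in src/arc.rs

    // plain values behave like std::sync::Arc, minus the weak count
    let a = Arc::new(5_u64);
    let b = a.clone();
    assert_eq!(Arc::strong_count(&a), 2);
    assert_eq!(*b, 5);
    drop(b);
    assert_eq!(Arc::strong_count(&a), 1);

    // get_mut only succeeds while the strong count is exactly 1
    let mut c = Arc::new(String::from("sled"));
    assert!(Arc::get_mut(&mut c).is_some());
    let d = c.clone();
    assert!(Arc::get_mut(&mut c).is_none());

    // make_mut falls back to clone-on-write when the Arc is shared
    Arc::make_mut(&mut c).push_str("-db");
    assert_eq!(&*c, "sled-db");
    assert_eq!(&*d, "sled");

    // Vec and slice conversions route through copy_from_slice
    let s: Arc<[u8]> = vec![1, 2, 3].into();
    assert_eq!(&*s, &[1u8, 2, 3][..]);
}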
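The epoch experiment in experiments/epoch/src/main.rs exposes just two entry points, `Collector::pin` and `Guard::defer`. Below is a minimal, hypothetical sketch of the intended call pattern, assuming only those APIs; the function name and the retired allocation are invented for illustration, and it uses `std::sync::Arc`, not the crate-local `Arc` above.

fn epoch_usage_sketch() {
    // share one Collector between threads (std Arc, not the crate's Arc)
    let collector = std::sync::Arc::new(Collector::default());

    let handles: Vec<_> = (0..4)
        .map(|_| {
            let collector = collector.clone();
            std::thread::spawn(move || {
                // check into the current epoch before touching shared state
                let guard = collector.pin();

                // pretend this allocation was just unlinked from a shared
                // structure; defer its destruction so that threads pinned
                // to earlier epochs can still read it safely
                let retired = Box::new(vec![0_u8; 64]);
                guard.defer(move || drop(retired));

                // the deferred closure is queued into the epoch when the
                // guard is dropped at the end of this scope
            })
        })
        .collect();

    for handle in handles {
        handle.join().unwrap();
    }
}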
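For the error plumbing in src/result.rs, the `From<Error> for io::Error` impl is what lets `?` carry sled's fail-stop errors into `std::io::Result` call sites. A small sketch, assuming only the variants and impls shown in that file; the `classify` and `open_as_io` helpers are hypothetical.

fn classify(err: &Error) -> &'static str {
    // map each fail-stop variant to a short description
    match err {
        Error::CollectionNotFound(_) => "collection no longer exists",
        Error::Unsupported(_) => "unsupported usage",
        Error::ReportableBug(_) => "unexpected bug, please file an issue",
        Error::Io(_) => "underlying I/O failure",
        Error::Corruption { .. } => "storage corruption detected",
        #[cfg(feature = "failpoints")]
        Error::FailPoint => "failpoint triggered (testing only)",
    }
}

fn open_as_io() -> std::io::Result<()> {
    // the crate-level alias: Result<T> = std::result::Result<T, Error>
    let res: Result<()> = Err(Error::Unsupported("demo".into()));
    // `?` converts Error into io::Error via the From impl above
    res?;
    Ok(())
}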