├── benchmarks ├── criterion │ ├── src │ │ └── lib.rs │ ├── Cargo.toml │ └── benches │ │ └── sled.rs └── stress2 │ ├── lsan.sh │ ├── tsan.sh │ └── Cargo.toml ├── bindings ├── neon-sled │ ├── .npmignore │ ├── native │ │ ├── index.node │ │ ├── build.rs │ │ ├── artifacts.json │ │ ├── Cargo.toml │ │ └── src │ │ │ └── lib.rs │ ├── README.md │ ├── package.json │ └── lib │ │ └── index.js ├── python │ ├── alice_populate.py │ ├── alice_check.py │ └── rsdb.py └── sled-native │ ├── README.md │ ├── cbindgen.toml │ ├── Cargo.toml │ └── src │ └── lib.rs ├── art ├── tree_face.png ├── tree_face_anti-transphobia.png └── CREDITS ├── .github ├── ISSUE_TEMPLATE │ ├── blank_issue.md │ ├── config.yml │ ├── feature_request.md │ └── bugs.md ├── FUNDING.yml └── workflows │ └── test.yml ├── .rustfmt.toml ├── experiments ├── new_segment_ownership │ ├── Cargo.lock │ ├── Cargo.toml │ └── src │ │ └── main.rs └── epoch │ ├── Cargo.toml │ ├── sanitizers.sh │ ├── Cargo.lock │ └── src │ └── main.rs ├── scripts ├── shufnice.sh ├── ubuntu_bench ├── cross_compile.sh ├── instructions ├── sanitizers.sh └── execution_explorer.py ├── .gitignore ├── tests ├── test_space_leaks.rs ├── common │ └── mod.rs └── test_quiescent.rs ├── src ├── measure_allocs.rs ├── fail.rs ├── pagecache │ ├── parallel_io_unix.rs │ ├── constants.rs │ ├── header.rs │ ├── disk_pointer.rs │ ├── parallel_io_windows.rs │ ├── parallel_io_polyfill.rs │ ├── reservation.rs │ └── pagetable.rs ├── doc │ ├── performance_guide │ │ └── mod.rs │ ├── testing_strategies │ │ └── mod.rs │ ├── reactive_semantics │ │ └── mod.rs │ ├── limits │ │ └── mod.rs │ ├── engineering_practices │ │ └── mod.rs │ ├── mod.rs │ ├── merge_operators │ │ └── mod.rs │ └── motivating_experiences │ │ └── mod.rs ├── fastcmp.rs ├── fastlock.rs ├── batch.rs ├── lazy.rs ├── concurrency_control.rs ├── varint.rs ├── context.rs ├── debug_delay.rs ├── meta.rs ├── oneshot.rs ├── sys_limits.rs ├── dll.rs ├── flusher.rs ├── threadpool.rs ├── result.rs ├── event_log.rs ├── atomic_shim.rs ├── stack.rs └── arc.rs ├── SECURITY.md ├── tsan_suppressions.txt ├── LICENSE-MIT ├── RELEASE_CHECKLIST.md ├── CONTRIBUTING.md ├── examples └── playground.rs ├── Cargo.toml ├── code-of-conduct.md └── SAFETY.md /benchmarks/criterion/src/lib.rs: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /bindings/neon-sled/.npmignore: -------------------------------------------------------------------------------- 1 | native/target 2 | native/index.node -------------------------------------------------------------------------------- /art/tree_face.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/timokoesters/sled/master/art/tree_face.png -------------------------------------------------------------------------------- /art/tree_face_anti-transphobia.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/timokoesters/sled/master/art/tree_face_anti-transphobia.png -------------------------------------------------------------------------------- /bindings/neon-sled/native/index.node: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/timokoesters/sled/master/bindings/neon-sled/native/index.node -------------------------------------------------------------------------------- /art/CREDITS: 
-------------------------------------------------------------------------------- 1 | original tree logo with face: 2 | https://twitter.com/daiyitastic 3 | 4 | anti-transphobia additions: 5 | spacejam 6 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/blank_issue.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Blank Issue (do not use this for bug reports or feature requests) 3 | about: Create an issue with a blank template. 4 | --- 5 | -------------------------------------------------------------------------------- /bindings/python/alice_populate.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | from sled import Conf 3 | 4 | c = Conf() 5 | c.path(b"ALICE.data") 6 | t = c.tree() 7 | t.set(b"k1", b"v1") 8 | t.close() 9 | -------------------------------------------------------------------------------- /.rustfmt.toml: -------------------------------------------------------------------------------- 1 | version = "Two" 2 | use_small_heuristics = "Max" 3 | reorder_imports = true 4 | max_width = 80 5 | wrap_comments = true 6 | combine_control_expr = true 7 | report_todo = "Always" 8 | -------------------------------------------------------------------------------- /bindings/neon-sled/native/build.rs: -------------------------------------------------------------------------------- 1 | extern crate neon_build; 2 | 3 | fn main() { 4 | neon_build::setup(); // must be called in build.rs 5 | 6 | // add project-specific build logic here... 7 | } 8 | -------------------------------------------------------------------------------- /bindings/neon-sled/README.md: -------------------------------------------------------------------------------- 1 | # neon-sled 2 | 3 | $ cargo check 4 | $ neon build 5 | $ 6 | 7 | ## in node 8 | 9 | $ node 10 | > let addon = require('.') 11 | 12 | 13 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/config.yml: -------------------------------------------------------------------------------- 1 | blank_issues_enabled: true 2 | contact_links: 3 | - name: sled discord 4 | url: https://discord.gg/Z6VsXds 5 | about: Please ask questions in the discord server here. 6 | -------------------------------------------------------------------------------- /experiments/new_segment_ownership/Cargo.lock: -------------------------------------------------------------------------------- 1 | # This file is automatically @generated by Cargo. 2 | # It is not intended for manual editing. 
3 | [[package]] 4 | name = "new_segment_ownership" 5 | version = "0.1.0" 6 | 7 | -------------------------------------------------------------------------------- /bindings/sled-native/README.md: -------------------------------------------------------------------------------- 1 | # Native C-API for sled 2 | 3 | ## Building 4 | 5 | ``` 6 | $ cargo install cargo-c 7 | $ cargo cinstall --prefix=/usr --destdir=/tmp/staging 8 | $ sudo cp -a /tmp/staging/* / 9 | ``` 10 | 11 | 12 | -------------------------------------------------------------------------------- /experiments/epoch/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "epoch" 3 | version = "0.1.0" 4 | authors = ["Tyler Neely "] 5 | edition = "2018" 6 | 7 | [profile.release] 8 | debug = true 9 | 10 | [dependencies] 11 | crossbeam-epoch = "0.8.0" 12 | -------------------------------------------------------------------------------- /scripts/shufnice.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | while true; do 4 | PID=`pgrep $1` 5 | TIDS=`ls /proc/$PID/task` 6 | TID=`echo $TIDS | tr " " "\n" | shuf -n1` 7 | NICE=$((`shuf -i 0-39 -n 1` - 20)) 8 | echo "renicing $TID to $NICE" 9 | renice -n $NICE -p $TID 10 | done 11 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature Request 3 | about: Request a feature for sled 4 | labels: feature 5 | --- 6 | 7 | #### Use Case: 8 | 9 | #### Proposed Change: 10 | 11 | #### Who Benefits From The Change(s)? 12 | 13 | #### Alternative Approaches 14 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *db 2 | *conf 3 | *snap.* 4 | *grind.out* 5 | vgcore* 6 | *.bk 7 | *orig 8 | tags 9 | perf* 10 | *folded 11 | *out 12 | *perf 13 | *svg 14 | *txt 15 | experiments 16 | target 17 | Cargo.lock 18 | *swp 19 | *swo 20 | *.proptest-regressions 21 | corpus 22 | artifacts 23 | .idea 24 | cargo-timing* 25 | -------------------------------------------------------------------------------- /experiments/new_segment_ownership/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "new_segment_ownership" 3 | version = "0.1.0" 4 | authors = ["Tyler Neely "] 5 | edition = "2018" 6 | 7 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html 8 | 9 | [dependencies] 10 | -------------------------------------------------------------------------------- /benchmarks/stress2/lsan.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -euxo pipefail 4 | 5 | echo "lsan" 6 | export RUSTFLAGS="-Z sanitizer=leak" 7 | cargo build --features=no_jemalloc --target x86_64-unknown-linux-gnu 8 | rm -rf default.sled 9 | target/x86_64-unknown-linux-gnu/debug/stress2 --duration=10 --set-prop=100000000 --val-len=100000 10 | -------------------------------------------------------------------------------- /bindings/neon-sled/native/artifacts.json: -------------------------------------------------------------------------------- 1 | {"active":"release","targets":{"release":{"rustc":"rustc 1.20.0 (f3d6973f4 
2017-08-27)","env":{"npm_config_target":"1.7.8","npm_config_arch":"x64","npm_config_target_arch":"x64","npm_config_disturl":"https://atom.io/download/electron","npm_config_runtime":"electron","npm_config_build_from_source":"true","npm_config_devdir":"/Users/mn/.electron-gyp"}}}} -------------------------------------------------------------------------------- /bindings/python/alice_check.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import os 3 | import sys 4 | 5 | from sled import Conf 6 | 7 | crashed_state_directory = sys.argv[1] 8 | os.chdir(crashed_state_directory) 9 | 10 | dirlist = os.listdir('.') 11 | 12 | assert("ALICE.data" in dirlist) 13 | 14 | c = Conf() 15 | c.path(b"ALICE.data") 16 | 17 | t = c.tree() 18 | 19 | assert(t.get(b"k1") == b"v1") 20 | -------------------------------------------------------------------------------- /bindings/neon-sled/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "neon-sled", 3 | "version": "0.1.3", 4 | "description": "", 5 | "main": "lib/index.js", 6 | "author": [ 7 | "Matthias Nehlsen ", 8 | "Tyler Neely " 9 | ], 10 | "license": "Apache 2", 11 | "dependencies": { 12 | "neon-cli": "^0.1.20" 13 | }, 14 | "scripts": { 15 | "install": "neon build" 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /bindings/neon-sled/native/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "neon-sled" 3 | version = "0.1.2" 4 | authors = ["Matthias Nehlsen ", 5 | "Tyler Neely "] 6 | license = "Apache 2" 7 | build = "build.rs" 8 | 9 | [lib] 10 | name = "neon_sled" 11 | crate-type = ["dylib"] 12 | 13 | [build-dependencies] 14 | neon-build = "0.1.20" 15 | 16 | [dependencies] 17 | neon = "0.1.20" 18 | sled = "0.14" -------------------------------------------------------------------------------- /benchmarks/criterion/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "critter" 3 | publish = false 4 | version = "0.1.0" 5 | authors = ["Tyler Neely "] 6 | edition = "2018" 7 | 8 | [[bench]] 9 | name = "sled" 10 | harness = false 11 | 12 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html 13 | 14 | [dependencies] 15 | criterion = "0.3.0" 16 | sled = { path = "../.." } 17 | jemallocator = "0.3.2" 18 | -------------------------------------------------------------------------------- /benchmarks/stress2/tsan.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -euxo pipefail 4 | 5 | echo "tsan" 6 | export RUSTFLAGS="-Z sanitizer=thread" 7 | export TSAN_OPTIONS="suppressions=/home/t/src/sled/tsan_suppressions.txt" 8 | sudo rm -rf default.sled 9 | cargo +nightly run --features=lock_free_delays,no_jemalloc --target x86_64-unknown-linux-gnu -- --duration=6 10 | cargo +nightly run --features=lock_free_delays,no_jemalloc --target x86_64-unknown-linux-gnu -- --duration=6 11 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bugs.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug Report 3 | about: Report a bug 4 | labels: bug 5 | --- 6 | 7 | Bug reports must include all following items: 8 | 9 | 1. expected result 10 | 1. actual result 11 | 1. 
sled version 12 | 1. rustc version 13 | 1. operating system 14 | 1. minimal code sample that helps to reproduce the issue 15 | 1. logs, panic messages, stack traces 16 | 17 | Incomplete bug reports will be closed. 18 | 19 | Thank you for understanding :) 20 | 21 | 22 | 23 | -------------------------------------------------------------------------------- /bindings/sled-native/cbindgen.toml: -------------------------------------------------------------------------------- 1 | header = "// SPDX-License-Identifier: Apache-2.0" 2 | sys_includes = ["stddef.h", "stdint.h", "stdlib.h"] 3 | no_includes = true 4 | include_guard = "SLED_H" 5 | tab_width = 4 6 | style = "Type" 7 | # language = "C" 8 | cpp_compat = true 9 | 10 | [parse] 11 | parse_deps = true 12 | include = ['sled'] 13 | 14 | [export] 15 | prefix = "Sled" 16 | item_types = ["enums", "structs", "unions", "typedefs", "opaque", "functions"] 17 | 18 | [enum] 19 | rename_variants = "ScreamingSnakeCase" 20 | prefix_with_name = true 21 | -------------------------------------------------------------------------------- /bindings/sled-native/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "sled-native" 3 | version = "0.34.6" 4 | authors = ["Tyler Neely "] 5 | description = "a C-compatible API for sled" 6 | license = "Apache-2.0" 7 | homepage = "https://github.com/spacejam/sled" 8 | repository = "https://github.com/spacejam/sled/sled-native" 9 | keywords = ["database", "embedded", "concurrent", "persistent", "c"] 10 | documentation = "https://docs.rs/sled-native/" 11 | edition = "2018" 12 | 13 | [lib] 14 | name = "sled" 15 | crate-type = ["cdylib", "staticlib"] 16 | 17 | [dependencies] 18 | libc = "0.2.62" 19 | sled = {version = "0.34.6", path = "../.."} 20 | -------------------------------------------------------------------------------- /experiments/epoch/sanitizers.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -eo pipefail 3 | 4 | echo "asan" 5 | cargo clean 6 | export RUSTFLAGS="-Z sanitizer=address" 7 | # export ASAN_OPTIONS="detect_odr_violation=0" 8 | cargo +nightly run --target x86_64-unknown-linux-gnu 9 | unset ASAN_OPTIONS 10 | 11 | echo "lsan" 12 | cargo clean 13 | export RUSTFLAGS="-Z sanitizer=leak" 14 | cargo +nightly run --target x86_64-unknown-linux-gnu 15 | 16 | echo "tsan" 17 | cargo clean 18 | export RUSTFLAGS="-Z sanitizer=thread" 19 | export TSAN_OPTIONS=suppressions=../../tsan_suppressions.txt 20 | cargo +nightly run --target x86_64-unknown-linux-gnu 21 | unset RUSTFLAGS 22 | unset TSAN_OPTIONS 23 | -------------------------------------------------------------------------------- /scripts/ubuntu_bench: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | sudo apt-get update 4 | sudo apt-get install htop dstat build-essential linux-tools-common linux-tools-generic linux-tools-`uname -r` 5 | curl https://sh.rustup.rs -sSf | sh 6 | source $HOME/.cargo/env 7 | 8 | cargo install flamegraph 9 | 10 | git clone https://github.com/spacejam/sled.git 11 | cd sled 12 | 13 | cores=$(grep -c ^processor /proc/cpuinfo) 14 | writers=(($cores / 5 + 1 )) 15 | readers=$(( ($cores / 5 + 1) * 4 )) 16 | 17 | cargo build --release --bin=stress2 --features=stress 18 | 19 | # we use sudo here to get access to symbols 20 | pushd benchmarks/stress2 21 | cargo flamegraph --release -- --get=$readers --set=$writers 22 | 
-------------------------------------------------------------------------------- /tests/test_space_leaks.rs: -------------------------------------------------------------------------------- 1 | mod common; 2 | 3 | #[test] 4 | #[cfg_attr(miri, ignore)] 5 | fn size_leak() -> sled::Result<()> { 6 | common::setup_logger(); 7 | 8 | let tree = sled::Config::new() 9 | .temporary(true) 10 | .segment_size(2048) 11 | .flush_every_ms(None) 12 | .open()?; 13 | 14 | for _ in 0..10_000 { 15 | tree.insert(b"", b"")?; 16 | } 17 | 18 | tree.flush()?; 19 | 20 | let sz = tree.size_on_disk()?; 21 | assert!( 22 | sz <= 16384, 23 | "expected system to use less than or equal to \ 24 | 16486 bytes, but actually used {}", 25 | sz 26 | ); 27 | 28 | Ok(()) 29 | } 30 | -------------------------------------------------------------------------------- /scripts/cross_compile.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | set -e 3 | 4 | # checks sled's compatibility using several targets 5 | 6 | targets="wasm32-wasi wasm32-unknown-unknown aarch64-fuchsia aarch64-linux-android \ 7 | i686-linux-android i686-unknown-linux-gnu \ 8 | x86_64-linux-android x86_64-fuchsia \ 9 | mips-unknown-linux-musl aarch64-apple-ios" 10 | 11 | rustup update --no-self-update 12 | 13 | RUSTFLAGS="--cfg miri" cargo check 14 | 15 | rustup toolchain install 1.39.0 --no-self-update 16 | cargo clean 17 | rm Cargo.lock 18 | cargo +1.39.0 check 19 | 20 | for target in $targets; do 21 | echo "setting up $target..." 22 | rustup target add $target 23 | echo "checking $target..." 24 | cargo check --target $target 25 | done 26 | 27 | -------------------------------------------------------------------------------- /.github/FUNDING.yml: -------------------------------------------------------------------------------- 1 | # These are supported funding model platforms 2 | 3 | github: spacejam # Replace with up to 4 GitHub Sponsors-enabled usernames e.g., [user1, user2] 4 | patreon: # Replace with a single Patreon username 5 | open_collective: # Replace with a single Open Collective username 6 | ko_fi: # Replace with a single Ko-fi username 7 | tidelift: # Replace with a single Tidelift platform-name/package-name e.g., npm/babel 8 | community_bridge: # Replace with a single Community Bridge project-name e.g., cloud-foundry 9 | liberapay: # Replace with a single Liberapay username 10 | issuehunt: # Replace with a single IssueHunt username 11 | otechie: # Replace with a single Otechie username 12 | custom: # Replace with up to 4 custom sponsorship URLs e.g., ['link1', 'link2'] 13 | -------------------------------------------------------------------------------- /src/measure_allocs.rs: -------------------------------------------------------------------------------- 1 | #![allow(unsafe_code)] 2 | 3 | use std::sync::atomic::{AtomicUsize, Ordering::Release}; 4 | 5 | // define a passthrough allocator that tracks alloc calls. 
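// Each `alloc` call bumps ALLOCATIONS and adds the layout size to ALLOCATED_BYTES
// before deferring to the system allocator; `dealloc` is passed straight through.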
6 | // adapted from the flatbuffer codebase 7 | use std::alloc::{GlobalAlloc, Layout, System}; 8 | 9 | pub(crate) struct TrackingAllocator; 10 | 11 | pub static ALLOCATIONS: AtomicUsize = AtomicUsize::new(0); 12 | pub static ALLOCATED_BYTES: AtomicUsize = AtomicUsize::new(0); 13 | 14 | unsafe impl GlobalAlloc for TrackingAllocator { 15 | unsafe fn alloc(&self, layout: Layout) -> *mut u8 { 16 | ALLOCATIONS.fetch_add(1, Release); 17 | ALLOCATED_BYTES.fetch_add(layout.size(), Release); 18 | System.alloc(layout) 19 | } 20 | unsafe fn dealloc(&self, ptr: *mut u8, layout: Layout) { 21 | System.dealloc(ptr, layout) 22 | } 23 | } 24 | -------------------------------------------------------------------------------- /SECURITY.md: -------------------------------------------------------------------------------- 1 | # Security Policy 2 | 3 | ## Reporting a Vulnerability 4 | 5 | sled uses some unsafe functionality in the core lock-free algorithms, and in a few places to more efficiently copy data. 6 | 7 | Please contact [Tyler Neely](mailto:tylerneely@gmail.com?subject=sled%20security%20issue) immediately if you find any vulnerability, and I will work with you to fix the issue rapidly and coordinate public disclosure with an expedited release including the fix. 8 | 9 | If you are a bug hunter or a person with a security interest, here is my mental model of memory corruption risk in the sled codebase: 10 | 11 | 1. memory issues relating to the lock-free data structures in their colder failure paths. these have been tested a bit by injecting delays into random places, but this is still an area with elevated risk 12 | 1. anywhere the `unsafe` keyword is used 13 | -------------------------------------------------------------------------------- /bindings/neon-sled/lib/index.js: -------------------------------------------------------------------------------- 1 | const sled = require('../native'); 2 | 3 | function open (path) { 4 | console.log("Creating a modern embedded database at", path); 5 | let ptr_str = sled.createDb(path); 6 | console.log("Sled at pointer", ptr_str); 7 | 8 | return { 9 | set: (k, v) => { 10 | //console.log("SET", ptr_str, k, v); 11 | return sled.set(ptr_str, k, v); 12 | }, 13 | get: (k) => { 14 | //console.log("GET", ptr_str, k); 15 | return sled.get(ptr_str, k); 16 | }, 17 | del: (k) => { 18 | return sled.del(ptr_str, k); 19 | }, 20 | syncAndClose: () => { 21 | console.log("Saving DB and closing", ptr_str); 22 | sled.syncAndClose(ptr_str); 23 | console.log("Saved DB and closing", ptr_str); 24 | } 25 | } 26 | } 27 | 28 | module.exports = open; 29 | -------------------------------------------------------------------------------- /src/fail.rs: -------------------------------------------------------------------------------- 1 | use parking_lot::Mutex; 2 | 3 | use crate::{Lazy, Map}; 4 | 5 | type HM = Map<&'static str, u64>; 6 | 7 | static ACTIVE: Lazy, fn() -> Mutex> = Lazy::new(init); 8 | 9 | fn init() -> Mutex { 10 | Mutex::new(HM::default()) 11 | } 12 | 13 | /// Returns `true` if the given failpoint is active. 14 | pub fn is_active(name: &'static str) -> bool { 15 | let mut active = ACTIVE.lock(); 16 | if let Some(bitset) = active.get_mut(&name) { 17 | let bit = *bitset & 1; 18 | *bitset >>= 1; 19 | if *bitset == 0 { 20 | active.remove(&name); 21 | } 22 | bit != 0 23 | } else { 24 | false 25 | } 26 | } 27 | 28 | /// Enable a particular failpoint 29 | pub fn set(name: &'static str, bitset: u64) { 30 | ACTIVE.lock().insert(name, bitset); 31 | } 32 | 33 | /// Clear all active failpoints. 
34 | pub fn reset() { 35 | ACTIVE.lock().clear(); 36 | } 37 | -------------------------------------------------------------------------------- /benchmarks/stress2/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "stress2" 3 | version = "0.1.0" 4 | authors = ["Tyler Neely "] 5 | publish = false 6 | edition = "2018" 7 | 8 | [profile.release] 9 | panic = 'abort' 10 | codegen-units = 1 11 | lto = "fat" 12 | debug = true 13 | 14 | [features] 15 | default = [] 16 | lock_free_delays = ["sled/lock_free_delays"] 17 | io_uring = ["sled/io_uring"] 18 | event_log = ["sled/event_log"] 19 | compression = ["sled/compression"] 20 | no_logs = ["sled/no_logs"] 21 | metrics = ["sled/metrics"] 22 | measure_allocs = ["sled/measure_allocs"] 23 | jemalloc = ["jemallocator"] 24 | logging = ["env_logger", "log", "color-backtrace"] 25 | 26 | [dependencies] 27 | rand = "0.7.3" 28 | env_logger = { version = "0.7.1", optional = true } 29 | log = { version = "0.4.8", optional = true } 30 | color-backtrace = { version = "0.3.0", optional = true } 31 | jemallocator = { version = "0.3.2", optional = true } 32 | num-format = "0.4.0" 33 | 34 | [dependencies.sled] 35 | path = "../.." 36 | -------------------------------------------------------------------------------- /scripts/instructions: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | # counts instructions for a standard workload 3 | set -e 4 | 5 | OUTFILE="cachegrind.stress2.`git describe --always --dirty`-`date +%s`" 6 | 7 | rm -rf default.sled || true 8 | 9 | cargo build \ 10 | --bin=stress2 \ 11 | --release 12 | 13 | 14 | # --tool=callgrind --dump-instr=yes --collect-jumps=yes --simulate-cache=yes \ 15 | # --callgrind-out-file="$OUTFILE" \ 16 | 17 | valgrind \ 18 | --tool=cachegrind \ 19 | --cachegrind-out-file="$OUTFILE" \ 20 | ./target/release/stress2 --total-ops=50000 --set-prop=1000000000000 --threads=1 21 | 22 | LAST=`ls -t cachegrind.stress2.* | sed -n 2p` 23 | 24 | echo "comparing $LAST with new $OUTFILE" 25 | 26 | echo "--------------------------------------------------------------------------------" 27 | echo "change since last run:" 28 | echo " Ir I1mr ILmr Dr D1mr DLmr Dw D1mw DLmw" 29 | echo "--------------------------------------------------------------------------------" 30 | cg_diff $LAST $OUTFILE | tail -1 31 | -------------------------------------------------------------------------------- /tsan_suppressions.txt: -------------------------------------------------------------------------------- 1 | # This suppressions file should really only be used for things 2 | # that TSAN can not correctly reason about, like raw memory 3 | # fences or implicit equivalents created by performing atomic 4 | # operations on variables. 5 | 6 | # Read more about how to use this file at: 7 | # https://github.com/google/sanitizers/wiki/ThreadSanitizerSuppressions 8 | 9 | # We ignore this because collect() calls functionality that relies 10 | # on atomic::fence for correctness, which doesn't get picked up by TSAN 11 | # as of Feb 1 2018 / rust 1.23. 12 | race:crossbeam_epoch::internal::Global::collect 13 | 14 | # Arc::drop is not properly detected by TSAN due to the use 15 | # of a raw atomic Acquire fence after the strong-count 16 | # atomic subtraction with a Release fence in the Drop impl. 
17 | race:Arc*drop 18 | 19 | # lazy_static and thread_local rely on implicit barriers not 20 | # picked-up by TSAN 21 | race:lazy_static 22 | race:std::thread::local 23 | 24 | # tsan doesn't seem to pick up parking_lot RwLock-protected accesses 25 | # that sometimes use lock elision 26 | race:current_iobuf 27 | -------------------------------------------------------------------------------- /src/pagecache/parallel_io_unix.rs: -------------------------------------------------------------------------------- 1 | use std::convert::TryFrom; 2 | use std::fs::File; 3 | use std::io; 4 | use std::os::unix::fs::FileExt; 5 | 6 | use super::LogOffset; 7 | 8 | pub(crate) fn pread_exact_or_eof( 9 | file: &File, 10 | mut buf: &mut [u8], 11 | offset: LogOffset, 12 | ) -> io::Result { 13 | let mut total = 0_usize; 14 | while !buf.is_empty() { 15 | match file.read_at(buf, offset + u64::try_from(total).unwrap()) { 16 | Ok(0) => break, 17 | Ok(n) => { 18 | total += n; 19 | let tmp = buf; 20 | buf = &mut tmp[n..]; 21 | } 22 | Err(ref e) if e.kind() == io::ErrorKind::Interrupted => {} 23 | Err(e) => return Err(e), 24 | } 25 | } 26 | Ok(total) 27 | } 28 | 29 | pub(crate) fn pread_exact( 30 | file: &File, 31 | buf: &mut [u8], 32 | offset: LogOffset, 33 | ) -> io::Result<()> { 34 | file.read_exact_at(buf, offset) 35 | } 36 | 37 | pub(crate) fn pwrite_all( 38 | file: &File, 39 | buf: &[u8], 40 | offset: LogOffset, 41 | ) -> io::Result<()> { 42 | file.write_all_at(buf, offset) 43 | } 44 | -------------------------------------------------------------------------------- /LICENSE-MIT: -------------------------------------------------------------------------------- 1 | Copyright (c) 2018 Tyler Neely 2 | Copyright (c) 2019 Tyler Neely 3 | 4 | Permission is hereby granted, free of charge, to any 5 | person obtaining a copy of this software and associated 6 | documentation files (the "Software"), to deal in the 7 | Software without restriction, including without 8 | limitation the rights to use, copy, modify, merge, 9 | publish, distribute, sublicense, and/or sell copies of 10 | the Software, and to permit persons to whom the Software 11 | is furnished to do so, subject to the following 12 | conditions: 13 | 14 | The above copyright notice and this permission notice 15 | shall be included in all copies or substantial portions 16 | of the Software. 17 | 18 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF 19 | ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED 20 | TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A 21 | PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT 22 | SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 23 | CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 24 | OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR 25 | IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 26 | DEALINGS IN THE SOFTWARE. 27 | -------------------------------------------------------------------------------- /src/doc/performance_guide/mod.rs: -------------------------------------------------------------------------------- 1 | //! ## Built-In Profiler 2 | //! 3 | //! To get a summary of latency histograms relating to different operations 4 | //! you've used on a sled database, sled can print a nice table when the Db is 5 | //! dropped by disabling the `no_metrics` default feature and setting 6 | //! `print_profile_on_drop(true)` on a `ConfigBuilder`: 7 | //! 8 | //! ```rust 9 | //! let config = sled::ConfigBuilder::new() 10 | //! .print_profile_on_drop(true) 11 | //! .build(); 12 | //! 13 | //! 
let db = sled::Db::start(config).unwrap(); 14 | //! ``` 15 | //! 16 | //! This is useful for finding outliers, general percentiles about usage, and 17 | //! especially for debugging performance issues if you create an issue on 18 | //! github. 19 | //! 20 | //! ## Use jemalloc 21 | //! 22 | //! jemalloc can dramatically improve performance in some situations, but you 23 | //! should always measure performance before and after using it, because maybe 24 | //! for some use cases it can cause regressions. 25 | //! 26 | //! Cargo.toml: 27 | //! ```toml 28 | //! [dependencies] 29 | //! jemallocator = "0.1" 30 | //! ``` 31 | //! 32 | //! `your_code.rs`: 33 | //! ```rust 34 | //! #[global_allocator] 35 | //! static ALLOC: jemallocator::Jemalloc = jemallocator::Jemalloc; 36 | //! ``` 37 | -------------------------------------------------------------------------------- /scripts/sanitizers.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -eo pipefail 3 | 4 | pushd benchmarks/stress2 5 | 6 | rustup toolchain install nightly --no-self-update 7 | rustup update --no-self-update 8 | 9 | export SLED_LOCK_FREE_DELAY_INTENSITY=2000 10 | 11 | echo "asan" 12 | cargo clean 13 | export RUSTFLAGS="-Z sanitizer=address" 14 | export ASAN_OPTIONS="detect_odr_violation=0" 15 | cargo +nightly build --features=lock_free_delays --target x86_64-unknown-linux-gnu 16 | sudo rm -rf default.sled 17 | sudo target/x86_64-unknown-linux-gnu/debug/stress2 --duration=10 18 | sudo target/x86_64-unknown-linux-gnu/debug/stress2 --duration=6 19 | unset ASAN_OPTIONS 20 | 21 | echo "lsan" 22 | cargo clean 23 | export RUSTFLAGS="-Z sanitizer=leak" 24 | cargo +nightly build --features=lock_free_delays --target x86_64-unknown-linux-gnu 25 | sudo rm -rf default.sled 26 | sudo target/x86_64-unknown-linux-gnu/debug/stress2 --duration=10 27 | sudo target/x86_64-unknown-linux-gnu/debug/stress2 --duration=6 28 | 29 | echo "tsan" 30 | cargo clean 31 | export RUSTFLAGS="-Z sanitizer=thread" 32 | export TSAN_OPTIONS=suppressions=../../tsan_suppressions.txt 33 | sudo rm -rf default.sled 34 | cargo +nightly run --features=lock_free_delays --target x86_64-unknown-linux-gnu -- --duration=10 35 | cargo +nightly run --features=lock_free_delays --target x86_64-unknown-linux-gnu -- --duration=6 36 | unset RUSTFLAGS 37 | unset TSAN_OPTIONS 38 | -------------------------------------------------------------------------------- /tests/common/mod.rs: -------------------------------------------------------------------------------- 1 | #[cfg(not(feature = "testing"))] 2 | compile_error!( 3 | "please run tests using the \"testing\" feature, \ 4 | which enables additional checks at runtime and \ 5 | causes more race conditions to jump out by \ 6 | inserting delays in concurrent code." 
7 | ); 8 | 9 | pub fn setup_logger() { 10 | use std::io::Write; 11 | 12 | fn tn() -> String { 13 | std::thread::current().name().unwrap_or("unknown").to_owned() 14 | } 15 | 16 | #[cfg(feature = "pretty_backtrace")] 17 | color_backtrace::install(); 18 | 19 | let mut builder = env_logger::Builder::new(); 20 | builder 21 | .format(|buf, record| { 22 | writeln!( 23 | buf, 24 | "{:05} {:20} {:10} {}", 25 | record.level(), 26 | tn(), 27 | record.module_path().unwrap().split("::").last().unwrap(), 28 | record.args() 29 | ) 30 | }) 31 | .filter(None, log::LevelFilter::Info); 32 | 33 | if let Ok(env) = std::env::var("RUST_LOG") { 34 | builder.parse_filters(&env); 35 | } 36 | 37 | let _r = builder.try_init(); 38 | } 39 | 40 | #[allow(dead_code)] 41 | pub fn cleanup(dir: &str) { 42 | let dir = std::path::Path::new(dir); 43 | if dir.exists() { 44 | std::fs::remove_dir_all(dir).unwrap(); 45 | } 46 | } 47 | -------------------------------------------------------------------------------- /src/pagecache/constants.rs: -------------------------------------------------------------------------------- 1 | use super::*; 2 | 3 | // crc: u32 4 4 | // kind: u8 1 5 | // seg num: u64 9 (varint) 6 | // pid: u64 9 (varint) 7 | // len: u64 9 (varint) 8 | /// Log messages have a header that might eb up to this length. 9 | pub const MAX_MSG_HEADER_LEN: usize = 32; 10 | 11 | /// Log segments have a header of this length. 12 | pub const SEG_HEADER_LEN: usize = 20; 13 | 14 | /// During testing, this should never be exceeded. 15 | // TODO drop this to 3 over time 16 | #[allow(unused)] 17 | pub const MAX_SPACE_AMPLIFICATION: f64 = 10.; 18 | 19 | pub(crate) const META_PID: PageId = 0; 20 | pub(crate) const COUNTER_PID: PageId = 1; 21 | pub(crate) const BATCH_MANIFEST_PID: PageId = PageId::max_value() - 666; 22 | 23 | pub(crate) const PAGE_CONSOLIDATION_THRESHOLD: usize = 10; 24 | pub(crate) const SEGMENT_CLEANUP_THRESHOLD: usize = 50; 25 | 26 | // Allows for around 1 trillion items to be stored 27 | // 2^37 * (assuming 50% node fill, 8 items per leaf) 28 | // and well below 1% of nodes being non-leaf nodes. 29 | #[cfg(target_pointer_width = "64")] 30 | pub(crate) const MAX_PID_BITS: usize = 37; 31 | 32 | // Allows for around 32 billion items to be stored 33 | // 2^32 * (assuming 50% node fill of 8 items per leaf) 34 | // and well below 1% of nodes being non-leaf nodes. 35 | // Assumed to be enough for a 32-bit system. 36 | #[cfg(target_pointer_width = "32")] 37 | pub(crate) const MAX_PID_BITS: usize = 32; 38 | -------------------------------------------------------------------------------- /RELEASE_CHECKLIST.md: -------------------------------------------------------------------------------- 1 | # Release Checklist 2 | 3 | This checklist must be completed before publishing a release of any kind. 4 | 5 | Over time, anything in this list that can be turned into an automated test should be, but 6 | there are still some big blind spots. 7 | 8 | ## API stability 9 | 10 | - [ ] rust-flavored semver respected 11 | 12 | ## Performance 13 | 14 | - [ ] micro-benchmark regressions should not happen unless newly discovered correctness criteria demands them 15 | - [ ] mixed point operation latency distribution should narrow over time 16 | - [ ] sequential operation average throughput should increase over time 17 | - [ ] workloads should pass TSAN and ASAN on macOS. Linux should additionally pass LSAN & MSAN. 
18 | - [ ] workload write and space amplification thresholds should see no regressions 19 | 20 | ## Concurrency Audit 21 | 22 | - [ ] any new `Guard` objects are dropped inside the rayon threadpool 23 | - [ ] no new EBR `Collector`s, as they destroy causality. These will be optimized in-bulk in the future. 24 | - [ ] no code assumes a recently read page pointer will remain unchanged (transactions may change this if reads are inline) 25 | - [ ] no calls to `rand::thread_rng` from a droppable function (anything in the SegmentAccountant) 26 | 27 | ## Burn-In 28 | 29 | - [ ] fuzz tests should run at least 24 hours each with zero crashes 30 | - [ ] sequential and point workloads run at least 24 hours in constrained docker container without OOM / out of disk 31 | -------------------------------------------------------------------------------- /src/doc/testing_strategies/mod.rs: -------------------------------------------------------------------------------- 1 | //! We believe operators of stateful systems should get as much sleep as they 2 | //! want. We take testing seriously, and we take pains to avoid the pesticide 3 | //! paradox wherever possible. 4 | //! 5 | //! sled uses the following testing strategies, and is eager to expand their 6 | //! use: 7 | //! 8 | //! * quickcheck-based model testing on the Tree, `PageCache`, and Log 9 | //! * proptest-based model testing on the `PageTable` using the [model](https://docs.rs/model) 10 | //! testing library 11 | //! * linearizability testing on the `PageTable` using the [model](https://docs.rs/model) 12 | //! testing library 13 | //! * deterministic concurrent model testing using linux realtime priorities, 14 | //! approaching the utility of the PULSE system available for the Erlang 15 | //! ecosystem 16 | //! * `ThreadSanitizer` on a concurrent workload 17 | //! * `LeakSanitizer` on a concurrent workload 18 | //! * failpoints with model testing: at every IO operation, a test can cause the 19 | //! system to simulate a crash 20 | //! * crash testing: processes are quickly spun up and then `kill -9`'d while 21 | //! recovering and writing. the recovered data is verified to recover the log 22 | //! in-order, stopping at the first torn log message or incomplete segment 23 | //! * fuzzing: libfuzzer is used to generate sequences of operations on the Tree 24 | //! * TLA+ has been used to model some of the concurrent algorithms, but much 25 | //! more is necessary 26 | -------------------------------------------------------------------------------- /src/doc/reactive_semantics/mod.rs: -------------------------------------------------------------------------------- 1 | //! As of sled `0.16.8` we support the [`watch_prefix` feature](https://docs.rs/sled/latest/sled/struct.Tree.html#method.watch_prefix) which allows a caller to create an iterator over all events that happen to keys that begin with a specified prefix. Supplying an empty vector allows you to subscribe to all updates on the `Tree`. 2 | //! 3 | //! #### reactive architectures 4 | //! 5 | //! Subscription to keys prefixed with "topic names" can allow you to treat sled 6 | //! as a durable message bus. 7 | //! 8 | //! #### replicated systems 9 | //! 10 | //! Watching the empty prefix will subscribe to all updates on the entire 11 | //! database. You can feed this into a replication system 12 | //! 13 | //! #### analysis tools and auditing 14 | //! 15 | //! #### ordering guarantees 16 | //! 17 | //! Updates are received in-order for particular keys, but updates for different 18 | //! 
keys may be observed in different orders by different `Subscriber`s. As an 19 | //! example, consider updating the keys `k1` and `k2` twice, adding 1 to the 20 | //! current value. Different `Subscriber`s may observe the following histories: 21 | //! 22 | //! ``` 23 | //! Set(k1, 100), Set(k1, 101), Set(k2, 200), Set(k2, 201) 24 | //! or 25 | //! Set(k1, 100), Set(k2, 200), Set(k1, 101), Set(k2, 201) 26 | //! or 27 | //! Set(k1, 100), Set(k2, 200), Set(k2, 201), Set(k1, 101) 28 | //! or 29 | //! Set(k2, 200), Set(k1, 100), Set(k1, 101), Set(k2, 201) 30 | //! or 31 | //! Set(k2, 200), Set(k1, 100), Set(k2, 201), Set(k1, 101) 32 | //! or 33 | //! Set(k2, 200), Set(k2, 201), Set(k1, 100), Set(k1, 101) 34 | //! ``` 35 | -------------------------------------------------------------------------------- /src/fastcmp.rs: -------------------------------------------------------------------------------- 1 | use std::cmp::Ordering; 2 | 3 | #[cfg(any(unix, windows))] 4 | #[allow(unsafe_code)] 5 | pub(crate) fn fastcmp(l: &[u8], r: &[u8]) -> Ordering { 6 | let len = std::cmp::min(l.len(), r.len()); 7 | let cmp = unsafe { libc::memcmp(l.as_ptr() as _, r.as_ptr() as _, len) }; 8 | match cmp { 9 | a if a > 0 => Ordering::Greater, 10 | a if a < 0 => Ordering::Less, 11 | _ => l.len().cmp(&r.len()), 12 | } 13 | } 14 | 15 | #[cfg(not(any(unix, windows)))] 16 | #[allow(unsafe_code)] 17 | pub(crate) fn fastcmp(l: &[u8], r: &[u8]) -> Ordering { 18 | l.cmp(r) 19 | } 20 | 21 | #[cfg(test)] 22 | mod qc { 23 | use super::fastcmp; 24 | 25 | fn prop_cmp_matches(l: &[u8], r: &[u8]) -> bool { 26 | assert_eq!(fastcmp(l, r), l.cmp(r)); 27 | assert_eq!(fastcmp(r, l), r.cmp(l)); 28 | assert_eq!(fastcmp(l, l), l.cmp(l)); 29 | assert_eq!(fastcmp(r, r), r.cmp(r)); 30 | true 31 | } 32 | 33 | #[test] 34 | fn basic_functionality() { 35 | let cases: [&[u8]; 8] = [ 36 | &[], 37 | &[0], 38 | &[1], 39 | &[1], 40 | &[255], 41 | &[1, 2, 3], 42 | &[1, 2, 3, 0], 43 | &[1, 2, 3, 55], 44 | ]; 45 | for pair in cases.windows(2) { 46 | prop_cmp_matches(pair[0], pair[1]); 47 | } 48 | } 49 | 50 | quickcheck::quickcheck! { 51 | #[cfg_attr(miri, ignore)] 52 | fn qc_fastcmp(l: Vec, r: Vec) -> bool { 53 | prop_cmp_matches(&l, &r) 54 | } 55 | } 56 | } 57 | -------------------------------------------------------------------------------- /src/doc/limits/mod.rs: -------------------------------------------------------------------------------- 1 | //! This page documents some limitations that sled imposes on users. 2 | //! 3 | //! * The underlying pagecache can currently store 2^36 pages. Leaf nodes in the 4 | //! `Tree` tend to split when they have more than 16 keys and values. This 5 | //! means that sled can hold a little less than **4,294,967,296 total items** 6 | //! (index nodes in the tree will also consume pages, but ideally far fewer 7 | //! than 1%). This is easy to increase without requiring migration, as it is 8 | //! entirely a runtime concern, but nobody has expressed any interest in this 9 | //! being larger yet. Note to future folks who need to increase this: increase 10 | //! the width of the Node1 type in the pagetable module, and correspondingly 11 | //! increase the number of bits that are used to index into it. It's just a 12 | //! simple wait-free grow-only 2-level pagetable. 13 | //! * keys and values use `usize` for the length fields due to the way that Rust 14 | //! uses `usize` for slice lengths, and will be limited to the target 15 | //! platform's pointer width. On 64-bit machines, this will be 64 bits. On 16 | //! 
32-bit machines, it will be limited to `u32::max_value()`. 17 | //! * Due to the 32-bit limitation on slice sizes on 32-bit architectures, we 18 | //! currently do not support systems large enough for the snapshot file to 19 | //! reach over 4gb. The snapshot file tends to be a small fraction of the 20 | //! total db size, and it's likely we'll be able to implement a streaming 21 | //! deserializer if this ever becomes an issue, but it seems unclear if anyone 22 | //! will encounter this limitation. 23 | -------------------------------------------------------------------------------- /src/fastlock.rs: -------------------------------------------------------------------------------- 1 | use std::{ 2 | cell::UnsafeCell, 3 | ops::{Deref, DerefMut}, 4 | sync::atomic::{ 5 | AtomicBool, 6 | Ordering::{Acquire, Release}, 7 | }, 8 | }; 9 | 10 | pub struct FastLockGuard<'a, T> { 11 | mu: &'a FastLock, 12 | } 13 | 14 | impl<'a, T> Drop for FastLockGuard<'a, T> { 15 | fn drop(&mut self) { 16 | assert!(self.mu.lock.swap(false, Release)); 17 | } 18 | } 19 | 20 | impl<'a, T> Deref for FastLockGuard<'a, T> { 21 | type Target = T; 22 | 23 | fn deref(&self) -> &T { 24 | #[allow(unsafe_code)] 25 | unsafe { 26 | &*self.mu.inner.get() 27 | } 28 | } 29 | } 30 | 31 | impl<'a, T> DerefMut for FastLockGuard<'a, T> { 32 | fn deref_mut(&mut self) -> &mut T { 33 | #[allow(unsafe_code)] 34 | unsafe { 35 | &mut *self.mu.inner.get() 36 | } 37 | } 38 | } 39 | 40 | #[repr(C)] 41 | pub struct FastLock { 42 | inner: UnsafeCell, 43 | lock: AtomicBool, 44 | } 45 | 46 | #[allow(unsafe_code)] 47 | unsafe impl Sync for FastLock {} 48 | 49 | #[allow(unsafe_code)] 50 | unsafe impl Send for FastLock {} 51 | 52 | impl FastLock { 53 | pub fn new(inner: T) -> FastLock { 54 | FastLock { lock: AtomicBool::new(false), inner: UnsafeCell::new(inner) } 55 | } 56 | 57 | pub fn try_lock(&self) -> Option> { 58 | let lock_result = 59 | self.lock.compare_exchange_weak(false, true, Acquire, Acquire); 60 | 61 | let success = lock_result.is_ok(); 62 | 63 | if success { Some(FastLockGuard { mu: self }) } else { None } 64 | } 65 | } 66 | -------------------------------------------------------------------------------- /src/pagecache/header.rs: -------------------------------------------------------------------------------- 1 | use super::*; 2 | 3 | // This is the most writers in a single IO buffer 4 | // that we have space to accommodate in the counter 5 | // for writers in the IO buffer header. 
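// The counter occupies 7 bits of the packed header laid out below,
// so at most 127 concurrent writers can be represented at once.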
6 | pub(in crate::pagecache) const MAX_WRITERS: Header = 127; 7 | 8 | pub(in crate::pagecache) type Header = u64; 9 | 10 | // salt: 31 bits 11 | // maxed: 1 bit 12 | // seal: 1 bit 13 | // n_writers: 7 bits 14 | // offset: 24 bits 15 | 16 | pub(crate) const fn is_maxed(v: Header) -> bool { 17 | v & (1 << 32) == 1 << 32 18 | } 19 | 20 | pub(crate) const fn mk_maxed(v: Header) -> Header { 21 | v | (1 << 32) 22 | } 23 | 24 | pub(crate) const fn is_sealed(v: Header) -> bool { 25 | v & (1 << 31) == 1 << 31 26 | } 27 | 28 | pub(crate) const fn mk_sealed(v: Header) -> Header { 29 | v | (1 << 31) 30 | } 31 | 32 | pub(crate) const fn n_writers(v: Header) -> Header { 33 | (v << 33) >> 57 34 | } 35 | 36 | #[inline] 37 | pub(crate) fn incr_writers(v: Header) -> Header { 38 | assert_ne!(n_writers(v), MAX_WRITERS); 39 | v + (1 << 24) 40 | } 41 | 42 | #[inline] 43 | pub(crate) fn decr_writers(v: Header) -> Header { 44 | assert_ne!(n_writers(v), 0); 45 | v - (1 << 24) 46 | } 47 | 48 | #[inline] 49 | pub(crate) fn offset(v: Header) -> usize { 50 | let ret = (v << 40) >> 40; 51 | usize::try_from(ret).unwrap() 52 | } 53 | 54 | #[inline] 55 | pub(crate) fn bump_offset(v: Header, by: usize) -> Header { 56 | assert_eq!(by >> 24, 0); 57 | v + (by as Header) 58 | } 59 | 60 | pub(crate) const fn bump_salt(v: Header) -> Header { 61 | (v + (1 << 33)) & 0xFFFF_FFFD_0000_0000 62 | } 63 | 64 | pub(crate) const fn salt(v: Header) -> Header { 65 | (v >> 33) << 33 66 | } 67 | -------------------------------------------------------------------------------- /src/batch.rs: -------------------------------------------------------------------------------- 1 | #![allow(unused_results)] 2 | 3 | use super::*; 4 | 5 | /// A batch of updates that will 6 | /// be applied atomically to the 7 | /// Tree. 8 | /// 9 | /// # Examples 10 | /// 11 | /// ``` 12 | /// # fn main() -> Result<(), Box> { 13 | /// use sled::{Batch, open}; 14 | /// 15 | /// # let _ = std::fs::remove_dir_all("batch_db_2"); 16 | /// let db = open("batch_db_2")?; 17 | /// db.insert("key_0", "val_0")?; 18 | /// 19 | /// let mut batch = Batch::default(); 20 | /// batch.insert("key_a", "val_a"); 21 | /// batch.insert("key_b", "val_b"); 22 | /// batch.insert("key_c", "val_c"); 23 | /// batch.remove("key_0"); 24 | /// 25 | /// db.apply_batch(batch)?; 26 | /// // key_0 no longer exists, and key_a, key_b, and key_c 27 | /// // now do exist. 28 | /// # let _ = std::fs::remove_dir_all("batch_db_2"); 29 | /// # Ok(()) } 30 | /// ``` 31 | #[derive(Debug, Default, Clone, PartialEq, Eq)] 32 | pub struct Batch { 33 | pub(crate) writes: Map>, 34 | } 35 | 36 | impl Batch { 37 | /// Set a key to a new value 38 | pub fn insert(&mut self, key: K, value: V) 39 | where 40 | K: Into, 41 | V: Into, 42 | { 43 | self.writes.insert(key.into(), Some(value.into())); 44 | } 45 | 46 | /// Remove a key 47 | pub fn remove(&mut self, key: K) 48 | where 49 | K: Into, 50 | { 51 | self.writes.insert(key.into(), None); 52 | } 53 | 54 | /// Get a value if it is present in the `Batch`. 55 | /// `Some(None)` means it's present as a deletion. 56 | pub fn get>(&self, k: K) -> Option> { 57 | let inner = self.writes.get(k.as_ref())?; 58 | Some(inner.as_ref()) 59 | } 60 | } 61 | -------------------------------------------------------------------------------- /src/doc/engineering_practices/mod.rs: -------------------------------------------------------------------------------- 1 | //! Over the years that sled development has been active, some practices have 2 | //! 
been collected that have helped to reduce risks throughout the codebase. 3 | //! 4 | //! # high-level 5 | //! 6 | //! * Start with the correctness requirements, ignore the performance impact 7 | //! until the end. You'll usually write something faster by focusing on 8 | //! keeping things minimal anyway. 9 | //! * Throw away what can't be done in a day of coding. When you rewrite it 10 | //! tomorrow, it will be simpler. 11 | //! 12 | //! # testing 13 | //! 14 | //! * Don't do what can't be tested to be correct 15 | //! * For concurrent code, it must be delayable to induce strange histories when 16 | //! running under test 17 | //! * For IO code, it must have a failpoint so that IO errors can be injected 18 | //! during testing, as most bugs in cloud systems happen in the untested 19 | //! error-handling code 20 | //! * Lean heavily into model-based property testing. sled should act like a 21 | //! `BTreeMap`, even after crashes 22 | //! 23 | //! # when testing and performance collide 24 | //! 25 | //! * cold code is buggy code 26 | //! * if you see a significant optimization that will make correctness-critical 27 | //! codepaths harder to hit in tests, the optimization should only be created 28 | //! if it's possible to artificially increase the chances of hitting the 29 | //! codepath in test. For example, sled defaults to having an 8mb write 30 | //! buffer, but during tests we often turn it down to 512 bytes so that we can 31 | //! really abuse the correctness-critical aspects of its behavior. 32 | //! 33 | //! # numbers 34 | //! 35 | //! * No silent truncation should ever occur when converting numbers 36 | //! * No silent wrapping should occur 37 | //! * Crash or return a `ReportableBug` error in these cases 38 | //! * `as` is forbidden for anything that could lose information (a checked-conversion sketch follows at the end of this module) 39 | //! * Clippy's cast lints help us here, and it has been added to all pull 40 | //! requests 41 | 42 | //! # package 43 | //! 44 | //! * dependencies should be minimized to keep compilation simple 45 | //! 46 | //! # coding conventions 47 | //! 48 | //! * Self should be avoided. We have a lot of code, and it provides no context 49 | //! if people are jumping around a lot. Redundancy here improves orientation.
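//!
//! As an illustration of the numbers conventions above, here is a minimal
//! sketch of preferring a checked conversion over `as` (the `header_len`
//! helper and the `u16` field width are hypothetical, not part of sled's API):
//!
//! ```rust
//! use std::convert::TryFrom;
//!
//! // `len as u16` would silently truncate large values; `try_from` surfaces
//! // the problem so it can become a crash or a `ReportableBug`-style error.
//! fn header_len(len: usize) -> Result<u16, std::num::TryFromIntError> {
//!     u16::try_from(len)
//! }
//!
//! assert_eq!(header_len(512).unwrap(), 512);
//! assert!(header_len(70_000).is_err());
//! ```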
50 | -------------------------------------------------------------------------------- /tests/test_quiescent.rs: -------------------------------------------------------------------------------- 1 | #![cfg(all(target_os = "linux", not(miri)))] 2 | 3 | mod common; 4 | 5 | use std::time::{Duration, Instant}; 6 | 7 | use common::cleanup; 8 | 9 | #[test] 10 | fn quiescent_cpu_time() { 11 | const DB_DIR: &str = "sleeper"; 12 | cleanup(DB_DIR); 13 | 14 | fn run() { 15 | let start = Instant::now(); 16 | let db = sled::open(DB_DIR).unwrap(); 17 | std::thread::sleep(Duration::from_secs(10)); 18 | drop(db); 19 | let end = Instant::now(); 20 | 21 | let (user_cpu_time, system_cpu_time) = unsafe { 22 | let mut resource_usage: libc::rusage = std::mem::zeroed(); 23 | let return_value = libc::getrusage( 24 | libc::RUSAGE_SELF, 25 | (&mut resource_usage) as *mut libc::rusage, 26 | ); 27 | if return_value != 0 { 28 | panic!("error {} from getrusage()", *libc::__errno_location()); 29 | } 30 | (resource_usage.ru_utime, resource_usage.ru_stime) 31 | }; 32 | 33 | let user_cpu_seconds = 34 | user_cpu_time.tv_sec as f64 + user_cpu_time.tv_usec as f64 * 1e-6; 35 | let system_cpu_seconds = system_cpu_time.tv_sec as f64 36 | + system_cpu_time.tv_usec as f64 * 1e-6; 37 | let real_time_elapsed = end.duration_since(start); 38 | 39 | if user_cpu_seconds + system_cpu_seconds > 1.0 { 40 | panic!( 41 | "Database used too much CPU during a quiescent workload. User: {}s, system: {}s (wall clock: {}s)", 42 | user_cpu_seconds, 43 | system_cpu_seconds, 44 | real_time_elapsed.as_secs_f64(), 45 | ); 46 | } 47 | } 48 | 49 | let child = unsafe { libc::fork() }; 50 | if child == 0 { 51 | common::setup_logger(); 52 | if let Err(e) = std::thread::spawn(run).join() { 53 | println!("test failed: {:?}", e); 54 | std::process::exit(15); 55 | } else { 56 | std::process::exit(0); 57 | } 58 | } else { 59 | let mut status = 0; 60 | unsafe { 61 | libc::waitpid(child, &mut status as *mut libc::c_int, 0); 62 | } 63 | if status != 0 { 64 | cleanup(DB_DIR); 65 | panic!("child exited abnormally"); 66 | } 67 | } 68 | 69 | cleanup(DB_DIR); 70 | } 71 | -------------------------------------------------------------------------------- /src/pagecache/disk_pointer.rs: -------------------------------------------------------------------------------- 1 | use std::num::NonZeroU64; 2 | 3 | use super::{HeapId, LogOffset}; 4 | use crate::*; 5 | 6 | /// A pointer to a location on disk or an off-log heap item. 7 | #[derive(Debug, Clone, PartialOrd, Ord, Copy, Eq, PartialEq)] 8 | pub enum DiskPtr { 9 | /// Points to a value stored in the single-file log. 10 | Inline(LogOffset), 11 | /// Points to a value stored off-log in the heap. 
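/// The `Option<NonZeroU64>` holds the log offset at which this heap
/// pointer was written; `forget_heap_log_coordinates` clears it once the
/// pointer has been merged into a snapshot.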
12 | Heap(Option, HeapId), 13 | } 14 | 15 | impl DiskPtr { 16 | pub(crate) fn new_inline(l: LogOffset) -> Self { 17 | DiskPtr::Inline(l) 18 | } 19 | 20 | pub(crate) fn new_heap_item(lid: LogOffset, heap_id: HeapId) -> Self { 21 | DiskPtr::Heap(Some(NonZeroU64::new(lid).unwrap()), heap_id) 22 | } 23 | 24 | pub(crate) fn is_inline(&self) -> bool { 25 | match self { 26 | DiskPtr::Inline(_) => true, 27 | DiskPtr::Heap(_, _) => false, 28 | } 29 | } 30 | 31 | pub(crate) fn is_heap_item(&self) -> bool { 32 | match self { 33 | DiskPtr::Inline(_) => false, 34 | DiskPtr::Heap(_, _) => true, 35 | } 36 | } 37 | 38 | pub(crate) fn heap_id(&self) -> Option { 39 | if let DiskPtr::Heap(_, heap_id) = self { Some(*heap_id) } else { None } 40 | } 41 | 42 | #[doc(hidden)] 43 | pub fn lid(&self) -> Option { 44 | match self { 45 | DiskPtr::Inline(lid) => Some(*lid), 46 | DiskPtr::Heap(lid, _) => lid.map(NonZeroU64::get), 47 | } 48 | } 49 | 50 | pub(crate) fn forget_heap_log_coordinates(&mut self) { 51 | match self { 52 | DiskPtr::Inline(_) => {} 53 | DiskPtr::Heap(ref mut opt, _) => *opt = None, 54 | } 55 | } 56 | 57 | pub(crate) fn original_lsn(&self) -> Lsn { 58 | match self { 59 | DiskPtr::Heap(_, heap_id) => heap_id.original_lsn, 60 | DiskPtr::Inline(_) => panic!("called original_lsn on non-Heap"), 61 | } 62 | } 63 | 64 | pub(crate) fn heap_pointer_merged_into_snapshot(&self) -> bool { 65 | if let DiskPtr::Heap(None, _) = self { true } else { false } 66 | } 67 | } 68 | 69 | impl fmt::Display for DiskPtr { 70 | fn fmt( 71 | &self, 72 | f: &mut fmt::Formatter<'_>, 73 | ) -> std::result::Result<(), fmt::Error> { 74 | write!(f, "{:?}", self) 75 | } 76 | } 77 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Welcome to the Project :) 2 | 3 | * Don't be a jerk - here's our [code of conduct](./code-of-conduct.md). 4 | We have a track record of defending our community from harm. 5 | 6 | There are at least three great ways to contribute to sled: 7 | 8 | * [financial contribution](https://github.com/sponsors/spacejam) 9 | * coding 10 | * conversation 11 | 12 | #### Coding Considerations: 13 | 14 | Please don't waste your time or ours by implementing things that 15 | we do not want to introduce and maintain. Please discuss in an 16 | issue or on chat before submitting a PR with: 17 | 18 | * public API changes 19 | * new functionality of any sort 20 | * additional unsafe code 21 | * significant refactoring 22 | 23 | The above changes are unlikely to be merged or receive 24 | timely attention without prior discussion. 25 | 26 | PRs that generally require less coordination beforehand: 27 | 28 | * Anything addressing a correctness issue. 29 | * Better docs: whatever you find confusing! 30 | * Small code changes with big performance implications, substantiated with [responsibly-gathered metrics](https://sled.rs/perf#experiment-checklist). 31 | * FFI submodule changes: these are generally less well maintained than the Rust core, and benefit more from public assistance. 32 | * Generally any new kind of test that avoids biases inherent in the others. 33 | 34 | ####### All PRs block on failing tests! 
35 | 
36 | sled has intense testing, including crash tests, multi-threaded tests with
37 | delay injection, a variety of mechanically-generated tests that combine fault
38 | injection with concurrency in interesting ways, cross-compilation and minimum
39 | supported Rust version checks, LLVM sanitizers, and more. It can sometimes be
40 | challenging to understand why something is failing these intense tests.
41 | 
42 | To better understand test failures, please:
43 | 
44 | 1. read the failing test name and output log for clues
45 | 1. try to reproduce the failed test locally by running its associated command from the [test script](https://github.com/spacejam/sled/blob/master/.github/workflows/test.yml)
46 | 1. If it is still not clear why your test is failing, feel free to ask for help understanding it on Discord or on the PR, and we will do our best to help.
47 | 
48 | Want to help sled but don't have time for individual contributions? Contribute via [GitHub Sponsors](https://github.com/sponsors/spacejam) to support the people pushing the project forward!
49 | 
--------------------------------------------------------------------------------
/examples/playground.rs:
--------------------------------------------------------------------------------
 1 | extern crate sled;
 2 | 
 3 | use sled::{Config, Result};
 4 | 
 5 | fn basic() -> Result<()> {
 6 |     let config = Config::new().temporary(true);
 7 | 
 8 |     let db = config.open()?;
 9 | 
10 |     let k = b"k".to_vec();
11 |     let v1 = b"v1".to_vec();
12 |     let v2 = b"v2".to_vec();
13 | 
14 |     // set and get
15 |     db.insert(k.clone(), v1.clone())?;
16 |     assert_eq!(db.get(&k).unwrap().unwrap(), (v1));
17 | 
18 |     // compare and swap
19 |     match db.compare_and_swap(k.clone(), Some(&v1), Some(v2.clone()))? {
20 |         Ok(()) => println!("it worked!"),
21 |         Err(sled::CompareAndSwapError { current: cur, proposed: _ }) => {
22 |             println!("the actual current value is {:?}", cur)
23 |         }
24 |     }
25 | 
26 |     // scan forward
27 |     let mut iter = db.range(k.as_slice()..);
28 |     let (k1, v1) = iter.next().unwrap().unwrap();
29 |     assert_eq!(v1, v2);
30 |     assert_eq!(k1, k);
31 |     assert_eq!(iter.next(), None);
32 | 
33 |     // deletion
34 |     db.remove(&k)?;
35 | 
36 |     Ok(())
37 | }
38 | 
39 | fn merge_operator() -> Result<()> {
40 |     fn concatenate_merge(
41 |         _key: &[u8],              // the key being merged
42 |         old_value: Option<&[u8]>, // the previous value, if one existed
43 |         merged_bytes: &[u8],      // the new bytes being merged in
44 |     ) -> Option<Vec<u8>> {
45 |         // set the new value, return None to delete
46 |         let mut ret = old_value.map_or_else(Vec::new, |ov| ov.to_vec());
47 | 
48 |         ret.extend_from_slice(merged_bytes);
49 | 
50 |         Some(ret)
51 |     }
52 | 
53 |     let config = Config::new().temporary(true);
54 | 
55 |     let db = config.open()?;
56 |     db.set_merge_operator(concatenate_merge);
57 | 
58 |     let k = b"k".to_vec();
59 | 
60 |     db.insert(k.clone(), vec![0])?;
61 |     db.merge(k.clone(), vec![1])?;
62 |     db.merge(k.clone(), vec![2])?;
63 |     assert_eq!(db.get(&*k).unwrap().unwrap(), (vec![0, 1, 2]));
64 | 
65 |     // sets replace previously merged data,
66 |     // bypassing the merge function.
67 | db.insert(k.clone(), vec![3])?; 68 | assert_eq!(db.get(&*k).unwrap().unwrap(), (vec![3])); 69 | 70 | // merges on non-present values will add them 71 | db.remove(&*k)?; 72 | db.merge(k.clone(), vec![4])?; 73 | assert_eq!(db.get(&*k).unwrap().unwrap(), (vec![4])); 74 | 75 | Ok(()) 76 | } 77 | 78 | fn main() -> Result<()> { 79 | basic()?; 80 | merge_operator() 81 | } 82 | -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "sled" 3 | version = "0.34.6" 4 | authors = ["Tyler Neely "] 5 | description = "Lightweight high-performance pure-rust transactional embedded database." 6 | license = "MIT/Apache-2.0" 7 | homepage = "https://github.com/spacejam/sled" 8 | repository = "https://github.com/spacejam/sled" 9 | keywords = ["redis", "mongo", "sqlite", "lmdb", "rocksdb"] 10 | categories = ["database-implementations", "concurrency", "data-structures", "algorithms", "caching"] 11 | documentation = "https://docs.rs/sled/" 12 | readme = "README.md" 13 | edition = "2018" 14 | exclude = ["benchmarks", "examples", "bindings", "scripts", "experiments"] 15 | 16 | [package.metadata.docs.rs] 17 | features = ["docs"] 18 | 19 | [badges] 20 | maintenance = { status = "actively-developed" } 21 | 22 | [profile.release] 23 | debug = true 24 | opt-level = 3 25 | 26 | [features] 27 | default = [] 28 | # Do not use the "testing" feature in your own testing code, this is for 29 | # internal testing use only. It injects many delays and performs several 30 | # test-only configurations that cause performance to drop significantly. 31 | # It will cause your tests to take much more time, and possibly time out etc... 32 | testing = ["event_log", "lock_free_delays", "compression", "failpoints", "backtrace"] 33 | compression = ["zstd"] 34 | lock_free_delays = [] 35 | failpoints = [] 36 | event_log = [] 37 | metrics = [] 38 | no_logs = ["log/max_level_off"] 39 | no_inline = [] 40 | measure_allocs = [] 41 | pretty_backtrace = ["color-backtrace"] 42 | io_uring = ["rio"] 43 | docs = [] 44 | miri_optimizations = [] 45 | mutex = [] 46 | 47 | [dependencies] 48 | crossbeam-epoch = "0.9.1" 49 | crossbeam-utils = "0.8.1" 50 | fxhash = "0.2.1" 51 | libc = "0.2.81" 52 | zstd = { version = "0.6.0", optional = true } 53 | crc32fast = "1.2.1" 54 | log = "0.4.11" 55 | parking_lot = "0.11.1" 56 | color-backtrace = { version = "0.5.0", optional = true } 57 | rio = { version = "0.9.4", optional = true } 58 | backtrace = { version = "0.3.55", optional = true } 59 | 60 | [target.'cfg(any(target_os = "linux", target_os = "macos", target_os="windows"))'.dependencies] 61 | fs2 = "0.4.3" 62 | 63 | [dev-dependencies] 64 | rand = "0.7.0" 65 | rand_chacha = "0.3.0" 66 | rand_distr = "0.3.0" 67 | quickcheck = "0.9.2" 68 | log = "0.4.11" 69 | env_logger = "0.8.2" 70 | zerocopy = "0.3.0" 71 | byteorder = "1.3.4" 72 | 73 | [[test]] 74 | name = "test_crash_recovery" 75 | path = "tests/test_crash_recovery.rs" 76 | harness = false 77 | -------------------------------------------------------------------------------- /src/pagecache/parallel_io_windows.rs: -------------------------------------------------------------------------------- 1 | use std::convert::TryFrom; 2 | use std::fs::File; 3 | use std::io; 4 | use std::os::windows::fs::FileExt; 5 | 6 | use super::LogOffset; 7 | 8 | fn seek_read_exact( 9 | file: &mut F, 10 | mut buf: &mut [u8], 11 | mut offset: u64, 12 | ) -> io::Result<()> { 13 | while 
!buf.is_empty() { 14 | match file.seek_read(buf, offset) { 15 | Ok(0) => break, 16 | Ok(n) => { 17 | let tmp = buf; 18 | buf = &mut tmp[n..]; 19 | offset += n as u64; 20 | } 21 | Err(ref e) if e.kind() == io::ErrorKind::Interrupted => {} 22 | Err(e) => return Err(e), 23 | } 24 | } 25 | if !buf.is_empty() { 26 | Err(io::Error::new( 27 | io::ErrorKind::UnexpectedEof, 28 | "failed to fill whole buffer", 29 | )) 30 | } else { 31 | Ok(()) 32 | } 33 | } 34 | 35 | fn seek_write_all( 36 | file: &mut F, 37 | mut buf: &[u8], 38 | mut offset: u64, 39 | ) -> io::Result<()> { 40 | while !buf.is_empty() { 41 | match file.seek_write(buf, offset) { 42 | Ok(0) => { 43 | return Err(io::Error::new( 44 | io::ErrorKind::WriteZero, 45 | "failed to write whole buffer", 46 | )); 47 | } 48 | Ok(n) => { 49 | buf = &buf[n..]; 50 | offset += n as u64; 51 | } 52 | Err(ref e) if e.kind() == io::ErrorKind::Interrupted => {} 53 | Err(e) => return Err(e), 54 | } 55 | } 56 | Ok(()) 57 | } 58 | 59 | pub(crate) fn pread_exact_or_eof( 60 | file: &File, 61 | mut buf: &mut [u8], 62 | offset: LogOffset, 63 | ) -> io::Result { 64 | let mut total = 0_usize; 65 | while !buf.is_empty() { 66 | match file.seek_read(buf, offset + u64::try_from(total).unwrap()) { 67 | Ok(0) => break, 68 | Ok(n) => { 69 | total += n; 70 | let tmp = buf; 71 | buf = &mut tmp[n..]; 72 | } 73 | Err(ref e) if e.kind() == io::ErrorKind::Interrupted => {} 74 | Err(e) => return Err(e), 75 | } 76 | } 77 | Ok(total) 78 | } 79 | 80 | pub(crate) fn pread_exact( 81 | file: &File, 82 | buf: &mut [u8], 83 | offset: LogOffset, 84 | ) -> io::Result<()> { 85 | let mut f = file.try_clone()?; 86 | seek_read_exact(&mut f, buf, offset) 87 | } 88 | 89 | pub(crate) fn pwrite_all( 90 | file: &File, 91 | buf: &[u8], 92 | offset: LogOffset, 93 | ) -> io::Result<()> { 94 | let mut f = file.try_clone()?; 95 | seek_write_all(&mut f, buf, offset) 96 | } 97 | -------------------------------------------------------------------------------- /src/pagecache/parallel_io_polyfill.rs: -------------------------------------------------------------------------------- 1 | use std::fs::File; 2 | use std::io::{self, Read, Seek, Write}; 3 | 4 | use parking_lot::Mutex; 5 | 6 | use super::LogOffset; 7 | 8 | fn init_mu() -> Mutex<()> { 9 | Mutex::new(()) 10 | } 11 | 12 | type MutexInit = fn() -> Mutex<()>; 13 | 14 | static GLOBAL_FILE_LOCK: crate::Lazy, MutexInit> = 15 | crate::Lazy::new(init_mu); 16 | 17 | pub(crate) fn pread_exact_or_eof( 18 | file: &File, 19 | mut buf: &mut [u8], 20 | offset: LogOffset, 21 | ) -> io::Result { 22 | let _lock = GLOBAL_FILE_LOCK.lock(); 23 | 24 | let mut f = file.try_clone()?; 25 | 26 | let _ = f.seek(io::SeekFrom::Start(offset))?; 27 | 28 | let mut total = 0; 29 | while !buf.is_empty() { 30 | match f.read(buf) { 31 | Ok(0) => break, 32 | Ok(n) => { 33 | total += n; 34 | let tmp = buf; 35 | buf = &mut tmp[n..]; 36 | } 37 | Err(ref e) if e.kind() == io::ErrorKind::Interrupted => {} 38 | Err(e) => return Err(e), 39 | } 40 | } 41 | Ok(total) 42 | } 43 | 44 | pub(crate) fn pread_exact( 45 | file: &File, 46 | mut buf: &mut [u8], 47 | offset: LogOffset, 48 | ) -> io::Result<()> { 49 | let _lock = GLOBAL_FILE_LOCK.lock(); 50 | 51 | let mut f = file.try_clone()?; 52 | 53 | let _ = f.seek(io::SeekFrom::Start(offset))?; 54 | 55 | while !buf.is_empty() { 56 | match f.read(buf) { 57 | Ok(0) => break, 58 | Ok(n) => { 59 | let tmp = buf; 60 | buf = &mut tmp[n..]; 61 | } 62 | Err(ref e) if e.kind() == io::ErrorKind::Interrupted => {} 63 | Err(e) => return Err(e), 64 | } 65 | } 66 | 
if !buf.is_empty() { 67 | Err(io::Error::new( 68 | io::ErrorKind::UnexpectedEof, 69 | "failed to fill whole buffer", 70 | )) 71 | } else { 72 | Ok(()) 73 | } 74 | } 75 | 76 | pub(crate) fn pwrite_all( 77 | file: &File, 78 | mut buf: &[u8], 79 | offset: LogOffset, 80 | ) -> io::Result<()> { 81 | let _lock = GLOBAL_FILE_LOCK.lock(); 82 | 83 | let mut f = file.try_clone()?; 84 | 85 | let _ = f.seek(io::SeekFrom::Start(offset))?; 86 | 87 | while !buf.is_empty() { 88 | match f.write(buf) { 89 | Ok(0) => { 90 | return Err(io::Error::new( 91 | io::ErrorKind::WriteZero, 92 | "failed to write whole buffer", 93 | )); 94 | } 95 | Ok(n) => buf = &buf[n..], 96 | Err(ref e) if e.kind() == io::ErrorKind::Interrupted => {} 97 | Err(e) => return Err(e), 98 | } 99 | } 100 | Ok(()) 101 | } 102 | -------------------------------------------------------------------------------- /src/doc/mod.rs: -------------------------------------------------------------------------------- 1 | //! #### what is sled? 2 | //! 3 | //! * an embedded kv store 4 | //! * a construction kit for stateful systems 5 | //! * ordered map API similar to a Rust `BTreeMap, Vec>` 6 | //! * fully atomic single-key operations, supports CAS 7 | //! * zero-copy reads 8 | //! * merge operators 9 | //! * forward and reverse iterators 10 | //! * a monotonic ID generator capable of giving out 75-125+ million unique IDs 11 | //! per second, never double allocating even in the presence of crashes 12 | //! * [zstd](https://github.com/facebook/zstd) compression (use the zstd build 13 | //! feature) 14 | //! * cpu-scalable lock-free implementation 15 | //! * SSD-optimized log-structured storage 16 | //! 17 | //! #### why another kv store? 18 | //! 19 | //! People face unnecessary hardship when working with existing embedded 20 | //! databases. They tend to have sharp performance trade-offs, are difficult to 21 | //! tune, have unclear consistency guarantees, and are generally inflexible. 22 | //! Facebook uses distributed machine learning to find configurations that 23 | //! achieve great performance for specific workloads on rocksdb. Most engineers 24 | //! don't have access to that kind of infrastructure. We would like to build 25 | //! sled so that it can be optimized using simple local methods, with as little 26 | //! user input as possible, and in many cases exceed the performance of popular 27 | //! systems today. 28 | //! 29 | //! This is how we aim to improve the situation: 30 | //! 31 | //! 1. don't make the user think. the interface should be obvious. 32 | //! 1. don't surprise users with performance traps. 33 | //! 1. don't wake up operators. bring reliability techniques from academia into 34 | //! real-world practice. 1. don't use so much electricity. our data structures 35 | //! should play to modern hardware's strengths. 36 | //! 37 | //! sled is written by people with experience designing, building, testing, and 38 | //! operating databases at high scales. we think the situation can be improved. 39 | //! 40 | //! #### targeted toward our vision of the future 41 | //! Building a database takes years. Designers of databases make bets about 42 | //! target usage and hardware. Here are the trends that we see, which we want to 43 | //! optimize the experience around: 44 | //! 45 | //! 1. more cores on servers, spanning sockets and numa domains 46 | //! 1. the vast majority of content consumption and generation happening on 47 | //! phones 1. compute migrating to the edge, into CDNs 48 | //! 1. 
conflict-free and OT-based replication techniques at the edge 49 | //! 1. strongly-consistent replication techniques within and between datacenters 50 | //! 1. event-driven architectures which benefit heavily from subscriber/watch 51 | //! semantics 52 | 53 | pub mod engineering_practices; 54 | pub mod limits; 55 | pub mod merge_operators; 56 | pub mod performance_guide; 57 | pub mod reactive_semantics; 58 | pub mod sled_architectural_outlook; 59 | pub mod testing_strategies; 60 | -------------------------------------------------------------------------------- /src/lazy.rs: -------------------------------------------------------------------------------- 1 | //! This module exists because `lazy_static` causes TSAN to 2 | //! be very unhappy. We rely heavily on TSAN for finding 3 | //! races, so we don't use `lazy_static`. 4 | 5 | use std::sync::atomic::{ 6 | AtomicBool, AtomicPtr, 7 | Ordering::{Acquire, SeqCst}, 8 | }; 9 | 10 | /// A lazily initialized value 11 | pub struct Lazy { 12 | value: AtomicPtr, 13 | init_mu: AtomicBool, 14 | init: F, 15 | } 16 | 17 | impl Lazy { 18 | /// Create a new Lazy 19 | pub const fn new(init: F) -> Self 20 | where 21 | F: Sized, 22 | { 23 | Self { 24 | value: AtomicPtr::new(std::ptr::null_mut()), 25 | init_mu: AtomicBool::new(false), 26 | init, 27 | } 28 | } 29 | } 30 | 31 | impl Drop for Lazy { 32 | fn drop(&mut self) { 33 | let value_ptr = self.value.load(Acquire); 34 | if !value_ptr.is_null() { 35 | #[allow(unsafe_code)] 36 | unsafe { 37 | drop(Box::from_raw(value_ptr)) 38 | } 39 | } 40 | } 41 | } 42 | 43 | impl std::ops::Deref for Lazy 44 | where 45 | F: Fn() -> T, 46 | { 47 | type Target = T; 48 | 49 | fn deref(&self) -> &T { 50 | { 51 | let value_ptr = self.value.load(Acquire); 52 | if !value_ptr.is_null() { 53 | #[allow(unsafe_code)] 54 | unsafe { 55 | return &*value_ptr; 56 | } 57 | } 58 | } 59 | 60 | // We want to keep looping as long as it returns true, 61 | // so we don't need any explicit conversion here. 62 | while self 63 | .init_mu 64 | .compare_exchange(false, true, SeqCst, SeqCst) 65 | .is_err() 66 | { 67 | std::sync::atomic::spin_loop_hint(); 68 | } 69 | 70 | { 71 | let value_ptr = self.value.load(Acquire); 72 | // we need to check this again because 73 | // maybe some other thread completed 74 | // the initialization already. 
75 | if !value_ptr.is_null() { 76 | let unlock = self.init_mu.swap(false, SeqCst); 77 | assert!(unlock); 78 | #[allow(unsafe_code)] 79 | unsafe { 80 | return &*value_ptr; 81 | } 82 | } 83 | } 84 | 85 | { 86 | let value = (self.init)(); 87 | let value_ptr = Box::into_raw(Box::new(value)); 88 | 89 | let old = self.value.swap(value_ptr, SeqCst); 90 | assert!(old.is_null()); 91 | 92 | let unlock = self.init_mu.swap(false, SeqCst); 93 | assert!(unlock); 94 | 95 | #[allow(unsafe_code)] 96 | unsafe { 97 | &*value_ptr 98 | } 99 | } 100 | } 101 | } 102 | -------------------------------------------------------------------------------- /experiments/new_segment_ownership/src/main.rs: -------------------------------------------------------------------------------- 1 | use std::sync::{ 2 | atomic::{AtomicUsize, Ordering}, 3 | Arc, 4 | }; 5 | 6 | const SZ: usize = 128; 7 | 8 | #[derive(Default, Debug)] 9 | struct Log { 10 | segment_accountant: Arc, 11 | io_buf: Arc, 12 | } 13 | 14 | impl Log { 15 | fn new() -> Log { 16 | let io_buf = Arc::new(IoBuf::default()); 17 | let segment_accountant = io_buf.segment.segment_accountant.clone(); 18 | Log { io_buf, segment_accountant } 19 | } 20 | 21 | fn reserve(&mut self, size: usize) -> Reservation { 22 | assert!(size <= SZ); 23 | if self.io_buf.buf.load(Ordering::SeqCst) + size > SZ { 24 | let segment = self.segment_accountant.clone().next_segment(); 25 | let buf = AtomicUsize::new(0); 26 | self.io_buf = Arc::new(IoBuf { segment, buf }); 27 | } 28 | let io_buf = self.io_buf.clone(); 29 | io_buf.buf.fetch_add(size, Ordering::SeqCst); 30 | Reservation { io_buf } 31 | } 32 | } 33 | 34 | #[derive(Default, Debug)] 35 | struct Reservation { 36 | io_buf: Arc, 37 | } 38 | 39 | #[derive(Default, Debug)] 40 | struct IoBuf { 41 | segment: Arc, 42 | buf: AtomicUsize, 43 | } 44 | 45 | #[derive(Default, Debug)] 46 | struct Segment { 47 | offset: usize, 48 | segment_accountant: Arc, 49 | } 50 | 51 | #[derive(Default, Debug)] 52 | struct SegmentAccountant { 53 | tip: AtomicUsize, 54 | free: Vec, 55 | } 56 | 57 | impl SegmentAccountant { 58 | fn next_segment(self: Arc) -> Arc { 59 | let offset = SZ + self.tip.fetch_add(SZ, Ordering::SeqCst); 60 | println!("setting new segment {}", offset); 61 | Arc::new(Segment { segment_accountant: self, offset }) 62 | } 63 | } 64 | 65 | fn main() { 66 | let mut log = Log::new(); 67 | { 68 | let _ = log.reserve(64); 69 | let _ = log.reserve(64); 70 | } 71 | println!("src/main.rs:70"); 72 | { 73 | let _ = log.reserve(128); 74 | } 75 | println!("src/main.rs:74"); 76 | { 77 | let _ = log.reserve(128); 78 | } 79 | println!("src/main.rs:78"); 80 | { 81 | let _ = log.reserve(128); 82 | } 83 | println!("src/main.rs:77"); 84 | } 85 | 86 | mod dropz { 87 | use super::*; 88 | 89 | impl Drop for IoBuf { 90 | fn drop(&mut self) { 91 | println!("IoBuf::drop"); 92 | } 93 | } 94 | impl Drop for Segment { 95 | fn drop(&mut self) { 96 | println!("dropping Segment {:?}", self.offset); 97 | } 98 | } 99 | impl Drop for SegmentAccountant { 100 | fn drop(&mut self) { 101 | println!("SegmentAccountant::drop"); 102 | } 103 | } 104 | impl Drop for Reservation { 105 | fn drop(&mut self) { 106 | println!("Reservation::drop"); 107 | } 108 | } 109 | } 110 | -------------------------------------------------------------------------------- /src/concurrency_control.rs: -------------------------------------------------------------------------------- 1 | #[cfg(feature = "testing")] 2 | use std::cell::RefCell; 3 | use std::sync::atomic::AtomicBool; 4 | 5 | use 
parking_lot::{RwLockReadGuard, RwLockWriteGuard}; 6 | 7 | use super::*; 8 | 9 | #[cfg(feature = "testing")] 10 | thread_local! { 11 | pub static COUNT: RefCell = RefCell::new(0); 12 | } 13 | 14 | const RW_REQUIRED_BIT: usize = 1 << 31; 15 | 16 | #[derive(Default)] 17 | pub(crate) struct ConcurrencyControl { 18 | active: AtomicUsize, 19 | upgrade_complete: AtomicBool, 20 | rw: RwLock<()>, 21 | } 22 | 23 | static CONCURRENCY_CONTROL: Lazy< 24 | ConcurrencyControl, 25 | fn() -> ConcurrencyControl, 26 | > = Lazy::new(init_cc); 27 | 28 | fn init_cc() -> ConcurrencyControl { 29 | ConcurrencyControl::default() 30 | } 31 | 32 | #[derive(Debug)] 33 | #[must_use] 34 | pub(crate) enum Protector<'a> { 35 | Write(RwLockWriteGuard<'a, ()>), 36 | Read(RwLockReadGuard<'a, ()>), 37 | None(&'a AtomicUsize), 38 | } 39 | 40 | impl<'a> Drop for Protector<'a> { 41 | fn drop(&mut self) { 42 | if let Protector::None(active) = self { 43 | active.fetch_sub(1, Release); 44 | } 45 | #[cfg(feature = "testing")] 46 | COUNT.with(|c| { 47 | let mut c = c.borrow_mut(); 48 | *c -= 1; 49 | assert_eq!(*c, 0); 50 | }); 51 | } 52 | } 53 | 54 | pub(crate) fn read<'a>() -> Protector<'a> { 55 | CONCURRENCY_CONTROL.read() 56 | } 57 | 58 | pub(crate) fn write<'a>() -> Protector<'a> { 59 | CONCURRENCY_CONTROL.write() 60 | } 61 | 62 | impl ConcurrencyControl { 63 | fn enable(&self) { 64 | if self.active.fetch_or(RW_REQUIRED_BIT, SeqCst) < RW_REQUIRED_BIT { 65 | // we are the first to set this bit 66 | while self.active.load(Acquire) != RW_REQUIRED_BIT { 67 | std::sync::atomic::spin_loop_hint() 68 | } 69 | self.upgrade_complete.store(true, Release); 70 | } 71 | } 72 | 73 | fn read(&self) -> Protector<'_> { 74 | #[cfg(feature = "testing")] 75 | COUNT.with(|c| { 76 | let mut c = c.borrow_mut(); 77 | *c += 1; 78 | assert_eq!(*c, 1); 79 | }); 80 | 81 | let active = self.active.fetch_add(1, Release); 82 | 83 | if active >= RW_REQUIRED_BIT { 84 | self.active.fetch_sub(1, Release); 85 | Protector::Read(self.rw.read()) 86 | } else { 87 | Protector::None(&self.active) 88 | } 89 | } 90 | 91 | fn write(&self) -> Protector<'_> { 92 | #[cfg(feature = "testing")] 93 | COUNT.with(|c| { 94 | let mut c = c.borrow_mut(); 95 | *c += 1; 96 | assert_eq!(*c, 1); 97 | }); 98 | self.enable(); 99 | while !self.upgrade_complete.load(Acquire) { 100 | std::sync::atomic::spin_loop_hint() 101 | } 102 | Protector::Write(self.rw.write()) 103 | } 104 | } 105 | -------------------------------------------------------------------------------- /src/varint.rs: -------------------------------------------------------------------------------- 1 | use std::convert::TryFrom; 2 | 3 | /// Returns the number of bytes that this varint will need 4 | pub fn size(int: u64) -> usize { 5 | if int <= 240 { 6 | 1 7 | } else if int <= 2287 { 8 | 2 9 | } else if int <= 67823 { 10 | 3 11 | } else if int <= 0x00FF_FFFF { 12 | 4 13 | } else if int <= 0xFFFF_FFFF { 14 | 5 15 | } else if int <= 0x00FF_FFFF_FFFF { 16 | 6 17 | } else if int <= 0xFFFF_FFFF_FFFF { 18 | 7 19 | } else if int <= 0x00FF_FFFF_FFFF_FFFF { 20 | 8 21 | } else { 22 | 9 23 | } 24 | } 25 | 26 | /// Returns how many bytes the varint consumed while serializing 27 | pub fn serialize_into(int: u64, buf: &mut [u8]) -> usize { 28 | if int <= 240 { 29 | buf[0] = u8::try_from(int).unwrap(); 30 | 1 31 | } else if int <= 2287 { 32 | buf[0] = u8::try_from((int - 240) / 256 + 241).unwrap(); 33 | buf[1] = u8::try_from((int - 240) % 256).unwrap(); 34 | 2 35 | } else if int <= 67823 { 36 | buf[0] = 249; 37 | buf[1] = u8::try_from((int - 
2288) / 256).unwrap(); 38 | buf[2] = u8::try_from((int - 2288) % 256).unwrap(); 39 | 3 40 | } else if int <= 0x00FF_FFFF { 41 | buf[0] = 250; 42 | let bytes = int.to_le_bytes(); 43 | buf[1..4].copy_from_slice(&bytes[..3]); 44 | 4 45 | } else if int <= 0xFFFF_FFFF { 46 | buf[0] = 251; 47 | let bytes = int.to_le_bytes(); 48 | buf[1..5].copy_from_slice(&bytes[..4]); 49 | 5 50 | } else if int <= 0x00FF_FFFF_FFFF { 51 | buf[0] = 252; 52 | let bytes = int.to_le_bytes(); 53 | buf[1..6].copy_from_slice(&bytes[..5]); 54 | 6 55 | } else if int <= 0xFFFF_FFFF_FFFF { 56 | buf[0] = 253; 57 | let bytes = int.to_le_bytes(); 58 | buf[1..7].copy_from_slice(&bytes[..6]); 59 | 7 60 | } else if int <= 0x00FF_FFFF_FFFF_FFFF { 61 | buf[0] = 254; 62 | let bytes = int.to_le_bytes(); 63 | buf[1..8].copy_from_slice(&bytes[..7]); 64 | 8 65 | } else { 66 | buf[0] = 255; 67 | let bytes = int.to_le_bytes(); 68 | buf[1..9].copy_from_slice(&bytes[..8]); 69 | 9 70 | } 71 | } 72 | 73 | /// Returns the deserialized varint, along with how many bytes 74 | /// were taken up by the varint. 75 | pub fn deserialize(buf: &[u8]) -> crate::Result<(u64, usize)> { 76 | if buf.is_empty() { 77 | return Err(crate::Error::corruption(None)); 78 | } 79 | let res = match buf[0] { 80 | 0..=240 => (u64::from(buf[0]), 1), 81 | 241..=248 => { 82 | let varint = 83 | 240 + 256 * (u64::from(buf[0]) - 241) + u64::from(buf[1]); 84 | (varint, 2) 85 | } 86 | 249 => { 87 | let varint = 2288 + 256 * u64::from(buf[1]) + u64::from(buf[2]); 88 | (varint, 3) 89 | } 90 | other => { 91 | let sz = other as usize - 247; 92 | let mut aligned = [0; 8]; 93 | aligned[..sz].copy_from_slice(&buf[1..=sz]); 94 | let varint = u64::from_le_bytes(aligned); 95 | (varint, sz + 1) 96 | } 97 | }; 98 | Ok(res) 99 | } 100 | -------------------------------------------------------------------------------- /.github/workflows/test.yml: -------------------------------------------------------------------------------- 1 | name: Rust 2 | 3 | on: 4 | pull_request: 5 | branches: 6 | - master 7 | 8 | jobs: 9 | clippy_check: 10 | runs-on: ubuntu-latest 11 | steps: 12 | - uses: actions/checkout@v1 13 | - uses: actions-rs/toolchain@v1 14 | with: 15 | toolchain: nightly 16 | components: clippy 17 | override: true 18 | - run: rustup component add clippy 19 | - uses: actions-rs/clippy-check@v1 20 | with: 21 | token: ${{ secrets.GITHUB_TOKEN }} 22 | args: --all-features 23 | default: 24 | name: Cargo Test on ${{ matrix.os }} 25 | runs-on: ${{ matrix.os }} 26 | strategy: 27 | fail-fast: false 28 | matrix: 29 | os: [ubuntu-latest, macos-latest, windows-latest] 30 | steps: 31 | - uses: actions/checkout@v1 32 | - name: Cache target 33 | uses: actions/cache@v1 34 | env: 35 | cache-name: cache-target 36 | RUST_BACKTRACE: 1 37 | with: 38 | path: target 39 | key: ${{ runner.os }}-${{ env.cache-name }} 40 | restore-keys: | 41 | ${{ runner.os }}- 42 | - name: cargo test 43 | run: | 44 | rustup update --no-self-update 45 | cargo test --release --no-default-features --features=testing -- --nocapture 46 | examples: 47 | name: Example Tests 48 | runs-on: ubuntu-latest 49 | steps: 50 | - uses: actions/checkout@v1 51 | - name: Cache target 52 | uses: actions/cache@v1 53 | env: 54 | cache-name: cache-target 55 | with: 56 | path: target 57 | key: ${{ runner.os }}-${{ env.cache-name }} 58 | restore-keys: | 59 | ${{ runner.os }}- 60 | - name: example tests 61 | run: | 62 | rustup update --no-self-update 63 | cargo run --example playground 64 | cargo run --example structured 65 | cross-compile: 66 | name: Cross 
Compile 67 | runs-on: macos-latest 68 | steps: 69 | - uses: actions/checkout@v1 70 | - name: cross compile 71 | run: | 72 | set -eo pipefail 73 | echo "cross build" 74 | scripts/cross_compile.sh 75 | burn-in: 76 | name: Burn In 77 | runs-on: ubuntu-latest 78 | steps: 79 | - uses: actions/checkout@v1 80 | - name: Cache target 81 | uses: actions/cache@v1 82 | env: 83 | cache-name: cache-target 84 | with: 85 | path: target 86 | key: ${{ runner.os }}-${{ env.cache-name }} 87 | restore-keys: | 88 | ${{ runner.os }}- 89 | - name: burn in 90 | run: | 91 | set -eo pipefail 92 | pushd benchmarks/stress2 93 | cargo run --release -- --duration=60 94 | rm -rf default.sled 95 | sanitizers: 96 | name: Sanitizers 97 | runs-on: ubuntu-latest 98 | steps: 99 | - uses: actions/checkout@v1 100 | - name: Cache rustup 101 | uses: actions/cache@v1 102 | env: 103 | cache-name: cache-target 104 | with: 105 | path: ~/.rustup 106 | key: ${{ runner.os }}-${{ env.cache-name }} 107 | restore-keys: | 108 | ${{ runner.os }}- 109 | - name: sanitizers 110 | run: | 111 | set -eo pipefail 112 | scripts/sanitizers.sh 113 | -------------------------------------------------------------------------------- /src/doc/merge_operators/mod.rs: -------------------------------------------------------------------------------- 1 | //! Merge operators are an extremely powerful tool for use in embedded kv 2 | //! stores. They allow users to specify custom logic for combining multiple 3 | //! versions of a value into one. 4 | //! 5 | //! As a motivating example, imagine that you have a counter. In a traditional 6 | //! kv store, you would need to read the old value, modify it, then write it 7 | //! back (RMW). If you want to increment the counter from multiple threads, you 8 | //! would need to either use higher-level locking or you need to spin in a CAS 9 | //! loop until your increment is successful. Merge operators remove the need for 10 | //! all of this by allowing multiple threads to "merge" in the desired 11 | //! operation, rather than performing a read, then modification, then later 12 | //! writing. `+1 -> +1 -> +1` instead of `w(r(key) + 1) -> w(r(key)+ 1) -> 13 | //! w(r(key) + 1)`. 14 | //! 15 | //! Here's an example of using a merge operator to just concatenate merged bytes 16 | //! together. Note that calling `set` acts as a value replacement, bypassing the 17 | //! merging logic and replacing previously merged values. Calling `merge` is 18 | //! like `set` but when the key is fetched, it will use the merge operator to 19 | //! combine all `merge`'s since the last `set`. 20 | //! 21 | //! ```rust 22 | //! fn concatenate_merge( 23 | //! _key: &[u8], // the key being merged 24 | //! old_value: Option<&[u8]>, // the previous value, if one existed 25 | //! merged_bytes: &[u8] // the new bytes being merged in 26 | //! ) -> Option> { // set the new value, return None to delete 27 | //! let mut ret = old_value 28 | //! .map(|ov| ov.to_vec()) 29 | //! .unwrap_or_else(|| vec![]); 30 | //! 31 | //! ret.extend_from_slice(merged_bytes); 32 | //! 33 | //! Some(ret) 34 | //! } 35 | //! 36 | //! let config = ConfigBuilder::new() 37 | //! .temporary(true) 38 | //! .build(); 39 | //! 40 | //! let tree = Tree::start(config).unwrap(); 41 | //! tree.set_merge_operator(concatenate_merge); 42 | //! 43 | //! tree.set(k, vec![0]); 44 | //! tree.merge(k, vec![1]); 45 | //! tree.merge(k, vec![2]); 46 | //! assert_eq!(tree.get(&k), Ok(Some(vec![0, 1, 2]))); 47 | //! 48 | //! // sets replace previously merged data, 49 | //! 
// bypassing the merge function. 50 | //! tree.set(k, vec![3]); 51 | //! assert_eq!(tree.get(&k), Ok(Some(vec![3]))); 52 | //! 53 | //! // merges on non-present values will add them 54 | //! tree.del(&k); 55 | //! tree.merge(k, vec![4]); 56 | //! assert_eq!(tree.get(&k), Ok(Some(vec![4]))); 57 | //! ``` 58 | //! 59 | //! ### beyond the basics 60 | //! 61 | //! Merge operators can be used to express arbitrarily complex logic. You can 62 | //! use them to implement any sort of high-level data structure on top of sled, 63 | //! using merges of different values to represent your desired operations. 64 | //! Similar to the above example, you could implement a list that lets you push 65 | //! items. Bloom filters are particularly easy to implement, and merge operators 66 | //! also are quite handy for building persistent CRDTs. 67 | //! 68 | //! ### warnings 69 | //! 70 | //! If you call `merge` without setting a merge operator, an error will be 71 | //! returned. Merge operators may be changed over time, but make sure you do 72 | //! this carefully to avoid race conditions. If you need to push a one-time 73 | //! operation to a value, use `update_and_fetch` or `fetch_and_update` instead. 74 | -------------------------------------------------------------------------------- /src/context.rs: -------------------------------------------------------------------------------- 1 | use super::*; 2 | 3 | #[derive(Debug, Clone)] 4 | #[doc(hidden)] 5 | pub struct Context { 6 | // TODO file from config should be in here 7 | config: RunningConfig, 8 | /// Periodically flushes dirty data. We keep this in an 9 | /// Arc separate from the PageCache below to separate 10 | /// "high-level" references from Db, Tree etc... from 11 | /// "low-level" references like background threads. 12 | /// When the last high-level reference is dropped, it 13 | /// should trigger all background threads to clean 14 | /// up synchronously. 15 | #[cfg(all( 16 | not(miri), 17 | any( 18 | windows, 19 | target_os = "linux", 20 | target_os = "macos", 21 | target_os = "dragonfly", 22 | target_os = "freebsd", 23 | target_os = "openbsd", 24 | target_os = "netbsd", 25 | ) 26 | ))] 27 | pub(crate) flusher: Arc>>, 28 | #[doc(hidden)] 29 | pub pagecache: PageCache, 30 | } 31 | 32 | impl std::ops::Deref for Context { 33 | type Target = RunningConfig; 34 | 35 | fn deref(&self) -> &RunningConfig { 36 | &self.config 37 | } 38 | } 39 | 40 | impl Context { 41 | pub(crate) fn start(config: RunningConfig) -> Result { 42 | trace!("starting context"); 43 | 44 | let pagecache = PageCache::start(config.clone())?; 45 | 46 | Ok(Self { 47 | config, 48 | pagecache, 49 | #[cfg(all( 50 | not(miri), 51 | any( 52 | windows, 53 | target_os = "linux", 54 | target_os = "macos", 55 | target_os = "dragonfly", 56 | target_os = "freebsd", 57 | target_os = "openbsd", 58 | target_os = "netbsd", 59 | ) 60 | ))] 61 | flusher: Arc::new(parking_lot::Mutex::new(None)), 62 | }) 63 | } 64 | 65 | /// Returns `true` if the database was 66 | /// recovered from a previous process. 67 | /// Note that database state is only 68 | /// guaranteed to be present up to the 69 | /// last call to `flush`! Otherwise state 70 | /// is synced to disk periodically if the 71 | /// `sync_every_ms` configuration option 72 | /// is set to `Some(number_of_ms_between_syncs)` 73 | /// or if the IO buffer gets filled to 74 | /// capacity before being rotated. 75 | pub fn was_recovered(&self) -> bool { 76 | self.pagecache.was_recovered() 77 | } 78 | 79 | /// Generate a monotonic ID. 
Not guaranteed to be 80 | /// contiguous. Written to disk every `idgen_persist_interval` 81 | /// operations, followed by a blocking flush. During recovery, we 82 | /// take the last recovered generated ID and add 2x 83 | /// the `idgen_persist_interval` to it. While persisting, if the 84 | /// previous persisted counter wasn't synced to disk yet, we will do 85 | /// a blocking flush to fsync the latest counter, ensuring 86 | /// that we will never give out the same counter twice. 87 | pub fn generate_id(&self) -> Result { 88 | let _cc = concurrency_control::read(); 89 | self.pagecache.generate_id_inner() 90 | } 91 | 92 | pub(crate) fn pin_log(&self, guard: &Guard) -> Result> { 93 | self.pagecache.pin_log(guard) 94 | } 95 | } 96 | -------------------------------------------------------------------------------- /bindings/neon-sled/native/src/lib.rs: -------------------------------------------------------------------------------- 1 | #[macro_use] 2 | extern crate neon; 3 | extern crate sled; 4 | 5 | use neon::js::JsNull; 6 | use neon::js::JsString; 7 | use neon::js::Value; 8 | use neon::vm::{Call, JsResult}; 9 | 10 | fn extract_arg(call: &mut Call, idx: i32) -> Result { 11 | let args = &call.arguments; 12 | let handle = args.get(call.guard, idx).ok_or(())?; 13 | Ok((*handle).to_string(call.scope).map_err(|_| ())?.value()) 14 | } 15 | 16 | fn create_db(mut call: Call) -> JsResult { 17 | let path = extract_arg(&mut call, 0).unwrap(); 18 | let t = sled::Config::default().path(path).tree(); 19 | 20 | let ptr = Box::into_raw(Box::new(t)); 21 | let ptr_string = format!("{}", ptr as usize); 22 | Ok(JsString::new(call.guard, &*ptr_string).unwrap()) 23 | } 24 | 25 | fn cast_string_to_ptr<'a>(ptr_str: String) -> &'a sled::Tree { 26 | let ptr_from_str = ptr_str.parse::().unwrap(); 27 | //println!("ptr_from_str: {}", ptr_from_str); 28 | 29 | let ptr = ptr_from_str as *mut sled::Tree; 30 | unsafe { &*ptr } 31 | } 32 | 33 | fn set(mut call: Call) -> JsResult { 34 | let arg0 = extract_arg(&mut call, 0).unwrap(); 35 | let arg1 = extract_arg(&mut call, 1); 36 | let arg2 = extract_arg(&mut call, 2); 37 | 38 | //println!("SET args {:?} {:?}", arg0, arg1); 39 | 40 | let t = cast_string_to_ptr(arg0); 41 | 42 | let k = arg1.unwrap().into_bytes(); 43 | let v = arg2.unwrap().into_bytes(); 44 | 45 | t.set(k.clone(), v); 46 | 47 | let from_db = t 48 | .get(&*k) 49 | .and_then(|from_db| { 50 | let str = unsafe { std::str::from_utf8_unchecked(&*from_db) }; 51 | JsString::new(call.guard, str) 52 | }) 53 | .unwrap_or_else(|| JsString::new(call.guard, "").unwrap()); 54 | 55 | Ok(from_db) 56 | } 57 | 58 | fn get(mut call: Call) -> JsResult { 59 | let arg0 = extract_arg(&mut call, 0).unwrap(); 60 | let arg1 = extract_arg(&mut call, 1); 61 | 62 | //println!("GET args {:?}", arg0); 63 | 64 | let t = cast_string_to_ptr(arg0); 65 | let k = arg1.unwrap().into_bytes(); 66 | 67 | let from_db = t 68 | .get(&*k) 69 | .map(|from_db| { 70 | let str = unsafe { std::str::from_utf8_unchecked(&*from_db) }; 71 | JsString::new(call.guard, str).unwrap() 72 | }) 73 | .unwrap_or_else(|| JsString::new(call.guard, "").unwrap()); 74 | 75 | Ok(from_db) 76 | } 77 | 78 | fn del(mut call: Call) -> JsResult { 79 | let arg0 = extract_arg(&mut call, 0).unwrap(); 80 | let arg1 = extract_arg(&mut call, 1); 81 | 82 | let t = cast_string_to_ptr(arg0); 83 | let k = arg1.unwrap().into_bytes(); 84 | 85 | t.del(&*k); 86 | 87 | Ok(JsNull::new()) 88 | } 89 | 90 | fn sync_and_close(mut call: Call) -> JsResult { 91 | let arg0 = extract_arg(&mut call, 0).unwrap(); 
92 | let ptr_from_str = arg0.parse::().unwrap(); 93 | let ptr = ptr_from_str as *mut sled::Tree; 94 | 95 | unsafe { 96 | let t = Box::from_raw(ptr); 97 | drop(t); 98 | } 99 | Ok(JsNull::new()) 100 | } 101 | 102 | register_module!(m, { 103 | m.export("get", get)?; 104 | m.export("set", set)?; 105 | m.export("del", del)?; 106 | m.export("createDb", create_db)?; 107 | m.export("syncAndClose", sync_and_close) 108 | }); 109 | -------------------------------------------------------------------------------- /bindings/python/rsdb.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | from ctypes import * 4 | import os 5 | 6 | sled = CDLL("./libsled_native.so") 7 | 8 | sled.sled_create_config.argtypes = () 9 | sled.sled_create_config.restype = c_void_p 10 | 11 | sled.sled_config_set_path.argtypes = (c_void_p, c_char_p) 12 | 13 | sled.sled_free_config.argtypes = (c_void_p,) 14 | 15 | sled.sled_open_tree.argtypes = (c_void_p,) 16 | sled.sled_open_tree.restype = c_void_p 17 | 18 | sled.sled_free_tree.argtypes = (c_void_p,) 19 | 20 | sled.sled_get.argtypes = (c_void_p, c_char_p, c_size_t, POINTER(c_size_t)) 21 | sled.sled_get.restype = c_char_p 22 | 23 | sled.sled_scan.argtypes = (c_void_p, c_char_p, c_size_t, POINTER(c_size_t)) 24 | sled.sled_scan.restype = c_void_p 25 | 26 | sled.sled_set.argtypes = (c_void_p, c_char_p, c_size_t, c_char_p, c_size_t) 27 | sled.sled_set.restype = None 28 | 29 | sled.sled_del.argtypes = (c_void_p, c_char_p, c_size_t) 30 | sled.sled_del.restype = None 31 | 32 | sled.sled_cas.argtypes = (c_void_p, 33 | c_char_p, c_size_t, # key 34 | c_char_p, c_size_t, # old 35 | c_char_p, c_size_t, # new 36 | POINTER(c_char_p), POINTER(c_size_t), # actual ret 37 | ) 38 | sled.sled_cas.restype = c_ubyte 39 | 40 | 41 | class Conf: 42 | def __init__(self): 43 | self.ptr = c_void_p(sled.sled_create_config()) 44 | 45 | def tree(self): 46 | tree_ptr = sled.sled_open_tree(self.ptr) 47 | return Tree(c_void_p(tree_ptr)) 48 | 49 | def path(self, path): 50 | sled.sled_config_set_path(self.ptr, path) 51 | 52 | def __del__(self): 53 | sled.sled_free_config(self.ptr) 54 | 55 | 56 | class TreeIterator: 57 | def __init__(self, ptr): 58 | self.ptr = ptr 59 | 60 | def __del__(self): 61 | sled.sled_free_iter(self.ptr) 62 | 63 | 64 | class Tree: 65 | def __init__(self, ptr): 66 | self.ptr = ptr 67 | 68 | def __del__(self): 69 | if self.ptr: 70 | sled.sled_free_tree(self.ptr) 71 | 72 | def close(self): 73 | self.__del__() 74 | self.ptr = None 75 | 76 | def set(self, key, val): 77 | sled.sled_set(self.ptr, key, len(key), val, len(val)) 78 | 79 | def get(self, key): 80 | vallen = c_size_t(0) 81 | ptr = sled.sled_get(self.ptr, key, len(key), byref(vallen)) 82 | return ptr[:vallen.value] 83 | 84 | def delete(self, key): 85 | sled.sled_del(self.ptr, key, len(key)) 86 | 87 | def cas(self, key, old, new): 88 | actual_vallen = c_size_t(0) 89 | actual_val = c_char_p(0) 90 | 91 | if old is None: 92 | old = b"" 93 | 94 | if new is None: 95 | new = b"" 96 | 97 | success = sled.sled_compare_and_swap( 98 | self.ptr, key, 99 | len(key), 100 | old, len(old), 101 | new, len(new), 102 | byref(actual_val), byref(actual_vallen)) 103 | 104 | if actual_vallen.value == 0: 105 | return (None, success == 1) 106 | else: 107 | return (actual_val.value[:actual_vallen.value], success == 1) 108 | 109 | def scan(self, key): 110 | return sled.sled_scan(self.ptr, key, len(key)) 111 | -------------------------------------------------------------------------------- 
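
A minimal usage sketch for the ctypes bindings in rsdb.py above. This is illustrative only: it assumes rsdb.py is importable from the current directory, that `libsled_native.so` has already been built from bindings/sled-native and sits next to it (as the `CDLL("./libsled_native.so")` call requires), and the path `example.sled` is a hypothetical database directory. Keys, values, and paths are all passed as bytes, matching the `c_char_p` argument types declared above.

```python
from rsdb import Conf

conf = Conf()
conf.path(b"example.sled")  # hypothetical on-disk path for the database
tree = conf.tree()          # opens the native Tree handle

tree.set(b"k1", b"v1")      # insert a key/value pair
print(tree.get(b"k1"))      # read it back: b'v1'
tree.delete(b"k1")          # remove the key
tree.close()                # frees the native handle via sled_free_tree
```

--------------------------------------------------------------------------------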
/code-of-conduct.md: -------------------------------------------------------------------------------- 1 | # Contributor Covenant Code of Conduct 2 | 3 | ## Our Pledge 4 | 5 | In the interest of fostering an open and welcoming environment, we as 6 | contributors and maintainers pledge to making participation in our project and 7 | our community a harassment-free experience for everyone, regardless of age, body 8 | size, disability, ethnicity, sex characteristics, gender identity and expression, 9 | level of experience, education, socio-economic status, nationality, personal 10 | appearance, race, religion, or sexual identity and orientation. 11 | 12 | ## Our Standards 13 | 14 | Examples of behavior that contributes to creating a positive environment 15 | include: 16 | 17 | * Using welcoming and inclusive language 18 | * Being respectful of differing viewpoints and experiences 19 | * Gracefully accepting constructive criticism 20 | * Focusing on what is best for the community 21 | * Showing empathy towards other community members 22 | 23 | Examples of unacceptable behavior by participants include: 24 | 25 | * The use of sexualized language or imagery and unwelcome sexual attention or 26 | advances 27 | * Trolling, insulting/derogatory comments, and personal or political attacks 28 | * Public or private harassment 29 | * Publishing others' private information, such as a physical or electronic 30 | address, without explicit permission 31 | * Other conduct which could reasonably be considered inappropriate in a 32 | professional setting 33 | 34 | ## Our Responsibilities 35 | 36 | Project maintainers are responsible for clarifying the standards of acceptable 37 | behavior and are expected to take appropriate and fair corrective action in 38 | response to any instances of unacceptable behavior. 39 | 40 | Project maintainers have the right and responsibility to remove, edit, or 41 | reject comments, commits, code, wiki edits, issues, and other contributions 42 | that are not aligned to this Code of Conduct, or to ban temporarily or 43 | permanently any contributor for other behaviors that they deem inappropriate, 44 | threatening, offensive, or harmful. 45 | 46 | ## Scope 47 | 48 | This Code of Conduct applies both within project spaces and in public spaces 49 | when an individual is representing the project or its community. Examples of 50 | representing a project or community include using an official project e-mail 51 | address, posting via an official social media account, or acting as an appointed 52 | representative at an online or offline event. Representation of a project may be 53 | further defined and clarified by project maintainers. 54 | 55 | ## Enforcement 56 | 57 | Instances of abusive, harassing, or otherwise unacceptable behavior may be 58 | reported by contacting the project team at tylerneely@gmail.com. All 59 | complaints will be reviewed and investigated and will result in a response that 60 | is deemed necessary and appropriate to the circumstances. The project team is 61 | obligated to maintain confidentiality with regard to the reporter of an incident. 62 | Further details of specific enforcement policies may be posted separately. 63 | 64 | Project maintainers who do not follow or enforce the Code of Conduct in good 65 | faith may face temporary or permanent repercussions as determined by other 66 | members of the project's leadership. 
67 | 
68 | ## Attribution
69 | 
70 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4,
71 | available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html
72 | 
73 | [homepage]: https://www.contributor-covenant.org
74 | 
75 | 
--------------------------------------------------------------------------------
/SAFETY.md:
--------------------------------------------------------------------------------
 1 | # sled safety model
 2 | 
 3 | This document applies
 4 | [STPA](http://psas.scripts.mit.edu/home/get_file.php?name=STPA_handbook.pdf)-style
 5 | hazard analysis to the sled embedded database for the purpose of guiding
 6 | design and testing efforts to prevent unacceptable losses.
 7 | 
 8 | Outline
 9 | 
10 | * [purpose of analysis](#purpose-of-analysis)
11 | * [losses](#losses)
12 | * [system boundary](#system-boundary)
13 | * [hazards](#hazards)
14 | * [leading indicators](#leading-indicators)
15 | * [constraints](#constraints)
16 | * [model of control structure](#model-of-control-structure)
17 | * [identify unsafe control actions](#identify-unsafe-control-actions)
18 | * [identify loss scenarios](#identify-loss-scenarios)
19 | * [resources for learning more about STAMP, STPA, and CAST](#resources)
20 | 
21 | # Purpose of Analysis
22 | 
23 | ## Losses
24 | 
25 | We wish to prevent the following undesirable situations:
26 | 
27 | * data loss
28 | * inconsistent (non-linearizable) data access
29 | * process crash
30 | * resource exhaustion
31 | 
32 | ## System Boundary
33 | 
34 | We draw the line between system and environment where we can reasonably
35 | invest our efforts to prevent losses.
36 | 
37 | Inside the boundary:
38 | 
39 | * codebase
40 |   * put safe control actions into place that prevent losses
41 | * documentation
42 |   * show users how to use sled safely
43 |   * recommend hardware, kernels, user code
44 | 
45 | Outside the boundary:
46 | 
47 | * Direct changes to hardware, kernels, user code
48 | 
49 | ## Hazards
50 | 
51 | These hazards can result in the above losses:
52 | 
53 | * data may be lost if
54 |   * bugs in the logging system
55 |     * `Db::flush` fails to make previous writes durable
56 |   * bugs in the GC system
57 |     * the old location is overwritten before the defragmented location becomes durable
58 |   * bugs in the recovery system
59 |   * hardware failures
60 | * consistency violations may be caused by
61 |   * transaction concurrency control failure to enforce linearizability (strict serializability)
62 |   * non-linearizable lock-free single-key operations
63 | * panic
64 |   * of user threads
65 |   * IO threads
66 |   * flusher & GC thread
67 |   * indexing
68 |   * unwraps/expects
69 |   * failed TryInto/TryFrom + unwrap
70 | * persistent storage exceeding (2 + N concurrent writers) * logical data size
71 | * in-memory cache exceeding the configured cache size
72 |   * caused by incorrect calculation of cache
73 | * use-after-free
74 | * data race
75 | * memory leak
76 | * integer overflow
77 | * buffer overrun
78 | * uninitialized memory access
79 | 
80 | ## Constraints
81 | 
82 | # Models of Control Structures
83 | 
84 | for each control action we have, consider:
85 | 
86 | 1. what hazards happen when we fail to apply it / it does not exist?
87 | 2. what hazards happen when we do apply it
88 | 3. what hazards happen when we apply it too early or too late?
89 | 4. what hazards happen if we apply it for too long or not long enough?
90 | 91 | durability model 92 | 93 | * recovery 94 | * LogIter::max_lsn 95 | * return None if last_lsn_in_batch >= self.max_lsn 96 | * batch requirement set to last reservation base + inline len - 1 97 | * reserve bumps 98 | * bump_atomic_lsn(&self.iobufs.max_reserved_lsn, reservation_lsn + inline_buf_len as Lsn - 1); 99 | 100 | lock-free linearizability model 101 | 102 | transactional linearizability (strict serializability) model 103 | 104 | panic model 105 | 106 | memory usage model 107 | 108 | storage usage model 109 | 110 | -------------------------------------------------------------------------------- /src/debug_delay.rs: -------------------------------------------------------------------------------- 1 | #![allow(clippy::float_arithmetic)] 2 | 3 | use std::sync::atomic::{AtomicUsize, Ordering::Relaxed}; 4 | 5 | use crate::Lazy; 6 | 7 | /// This function is useful for inducing random jitter into our atomic 8 | /// operations, shaking out more possible interleavings quickly. It gets 9 | /// fully eliminated by the compiler in non-test code. 10 | pub fn debug_delay() { 11 | use std::thread; 12 | use std::time::Duration; 13 | 14 | static GLOBAL_DELAYS: AtomicUsize = AtomicUsize::new(0); 15 | 16 | static INTENSITY: Lazy u32> = Lazy::new(|| { 17 | std::env::var("SLED_LOCK_FREE_DELAY_INTENSITY") 18 | .unwrap_or_else(|_| "100".into()) 19 | .parse() 20 | .expect( 21 | "SLED_LOCK_FREE_DELAY_INTENSITY must be set to a \ 22 | non-negative integer (ideally below 1,000,000)", 23 | ) 24 | }); 25 | 26 | static CRASH_CHANCE: Lazy u32> = Lazy::new(|| { 27 | std::env::var("SLED_CRASH_CHANCE") 28 | .unwrap_or_else(|_| "0".into()) 29 | .parse() 30 | .expect( 31 | "SLED_CRASH_CHANCE must be set to a \ 32 | non-negative integer (ideally below 50,000)", 33 | ) 34 | }); 35 | 36 | thread_local!( 37 | static LOCAL_DELAYS: std::cell::RefCell = std::cell::RefCell::new(0) 38 | ); 39 | 40 | if cfg!(feature = "miri_optimizations") { 41 | // Each interaction with LOCAL_DELAYS adds more stacked borrows 42 | // tracking information, and Miri is single-threaded anyway. 43 | return; 44 | } 45 | 46 | let global_delays = GLOBAL_DELAYS.fetch_add(1, Relaxed); 47 | let local_delays = LOCAL_DELAYS.with(|ld| { 48 | let mut ld = ld.borrow_mut(); 49 | let old = *ld; 50 | *ld = std::cmp::max(global_delays + 1, *ld + 1); 51 | old 52 | }); 53 | 54 | if *CRASH_CHANCE > 0 && random(*CRASH_CHANCE) == 0 { 55 | std::process::exit(9) 56 | } 57 | 58 | if global_delays == local_delays { 59 | // no other threads seem to be 60 | // calling this, so we may as 61 | // well skip it 62 | return; 63 | } 64 | 65 | if random(1000) == 1 { 66 | let duration = random(*INTENSITY); 67 | 68 | #[allow(clippy::cast_possible_truncation)] 69 | #[allow(clippy::cast_sign_loss)] 70 | thread::sleep(Duration::from_micros(u64::from(duration))); 71 | } 72 | 73 | if random(2) == 0 { 74 | thread::yield_now(); 75 | } 76 | } 77 | 78 | /// Generates a random number in `0..n`. 79 | fn random(n: u32) -> u32 { 80 | use std::cell::Cell; 81 | use std::num::Wrapping; 82 | 83 | thread_local! { 84 | static RNG: Cell> = Cell::new(Wrapping(1_406_868_647)); 85 | } 86 | 87 | #[allow(clippy::cast_possible_truncation)] 88 | RNG.try_with(|rng| { 89 | // This is the 32-bit variant of Xorshift. 90 | // 91 | // Source: https://en.wikipedia.org/wiki/Xorshift 92 | let mut x = rng.get(); 93 | x ^= x << 13; 94 | x ^= x >> 17; 95 | x ^= x << 5; 96 | rng.set(x); 97 | 98 | // This is a fast alternative to `x % n`. 
99 | // 100 | // Author: Daniel Lemire 101 | // Source: https://lemire.me/blog/2016/06/27/a-fast-alternative-to-the-modulo-reduction/ 102 | (u64::from(x.0).wrapping_mul(u64::from(n)) >> 32) as u32 103 | }) 104 | .unwrap_or(0) 105 | } 106 | -------------------------------------------------------------------------------- /src/meta.rs: -------------------------------------------------------------------------------- 1 | use crate::*; 2 | 3 | /// A simple map that can be used to store metadata 4 | /// for the pagecache tenant. 5 | #[derive(Clone, Debug, Eq, PartialEq, Default)] 6 | pub struct Meta { 7 | pub(crate) inner: BTreeMap, 8 | } 9 | 10 | impl Meta { 11 | /// Retrieve the `PageId` associated with an identifier 12 | pub(crate) fn get_root(&self, table: &[u8]) -> Option { 13 | self.inner.get(table).cloned() 14 | } 15 | 16 | /// Set the `PageId` associated with an identifier 17 | pub(crate) fn set_root( 18 | &mut self, 19 | name: IVec, 20 | pid: PageId, 21 | ) -> Option { 22 | self.inner.insert(name, pid) 23 | } 24 | 25 | /// Remove the page mapping for a given identifier 26 | pub(crate) fn del_root(&mut self, name: &[u8]) -> Option { 27 | self.inner.remove(name) 28 | } 29 | 30 | /// Return the current rooted tenants in Meta 31 | pub(crate) fn tenants(&self) -> BTreeMap { 32 | self.inner.clone() 33 | } 34 | 35 | pub(crate) fn rss(&self) -> u64 { 36 | self.inner 37 | .iter() 38 | .map(|(k, _pid)| { 39 | k.len() as u64 + std::mem::size_of::() as u64 40 | }) 41 | .sum() 42 | } 43 | } 44 | 45 | /// Open or create a new disk-backed Tree with its own keyspace, 46 | /// accessible from the `Db` via the provided identifier. 47 | pub(crate) fn open_tree( 48 | context: &Context, 49 | raw_name: V, 50 | guard: &Guard, 51 | ) -> Result 52 | where 53 | V: Into, 54 | { 55 | let name = raw_name.into(); 56 | 57 | // we loop because creating this Tree may race with 58 | // concurrent attempts to open the same one. 59 | loop { 60 | match context.pagecache.meta_pid_for_name(&name, guard) { 61 | Ok(root_id) => { 62 | assert_ne!(root_id, 0); 63 | return Ok(Tree(Arc::new(TreeInner { 64 | tree_id: name, 65 | context: context.clone(), 66 | subscribers: Subscribers::default(), 67 | root: AtomicU64::new(root_id), 68 | merge_operator: RwLock::new(None), 69 | }))); 70 | } 71 | Err(Error::CollectionNotFound(_)) => {} 72 | Err(other) => return Err(other), 73 | } 74 | 75 | // set up empty leaf 76 | let mut leaf = Node::new_empty_leaf(); 77 | leaf.is_index = false; 78 | let (leaf_id, leaf_ptr) = context.pagecache.allocate(leaf, guard)?; 79 | 80 | trace!( 81 | "allocated pid {} for leaf in new_tree for namespace {:?}", 82 | leaf_id, 83 | name 84 | ); 85 | 86 | // set up root index 87 | 88 | // vec![0] represents a prefix-encoded empty prefix 89 | let root = Node::new_root(leaf_id); 90 | let (root_id, root_ptr) = context.pagecache.allocate(root, guard)?; 91 | 92 | debug!("allocated pid {} for root of new_tree {:?}", root_id, name); 93 | 94 | let res = context.pagecache.cas_root_in_meta( 95 | &name, 96 | None, 97 | Some(root_id), 98 | guard, 99 | )?; 100 | 101 | if res.is_err() { 102 | // clean up the tree we just created if we couldn't 103 | // install it. 104 | let _ = context 105 | .pagecache 106 | .free(root_id, root_ptr, guard)? 107 | .expect("could not free allocated page"); 108 | let _ = context 109 | .pagecache 110 | .free(leaf_id, leaf_ptr, guard)? 
111 | .expect("could not free allocated page"); 112 | continue; 113 | } 114 | 115 | return Ok(Tree(Arc::new(TreeInner { 116 | tree_id: name, 117 | subscribers: Subscribers::default(), 118 | context: context.clone(), 119 | root: AtomicU64::new(root_id), 120 | merge_operator: RwLock::new(None), 121 | }))); 122 | } 123 | } 124 | -------------------------------------------------------------------------------- /experiments/epoch/Cargo.lock: -------------------------------------------------------------------------------- 1 | # This file is automatically @generated by Cargo. 2 | # It is not intended for manual editing. 3 | [[package]] 4 | name = "autocfg" 5 | version = "0.1.7" 6 | source = "registry+https://github.com/rust-lang/crates.io-index" 7 | 8 | [[package]] 9 | name = "cfg-if" 10 | version = "0.1.10" 11 | source = "registry+https://github.com/rust-lang/crates.io-index" 12 | 13 | [[package]] 14 | name = "crossbeam-epoch" 15 | version = "0.8.0" 16 | source = "registry+https://github.com/rust-lang/crates.io-index" 17 | dependencies = [ 18 | "autocfg 0.1.7 (registry+https://github.com/rust-lang/crates.io-index)", 19 | "cfg-if 0.1.10 (registry+https://github.com/rust-lang/crates.io-index)", 20 | "crossbeam-utils 0.7.0 (registry+https://github.com/rust-lang/crates.io-index)", 21 | "lazy_static 1.4.0 (registry+https://github.com/rust-lang/crates.io-index)", 22 | "memoffset 0.5.3 (registry+https://github.com/rust-lang/crates.io-index)", 23 | "scopeguard 1.0.0 (registry+https://github.com/rust-lang/crates.io-index)", 24 | ] 25 | 26 | [[package]] 27 | name = "crossbeam-utils" 28 | version = "0.7.0" 29 | source = "registry+https://github.com/rust-lang/crates.io-index" 30 | dependencies = [ 31 | "autocfg 0.1.7 (registry+https://github.com/rust-lang/crates.io-index)", 32 | "cfg-if 0.1.10 (registry+https://github.com/rust-lang/crates.io-index)", 33 | "lazy_static 1.4.0 (registry+https://github.com/rust-lang/crates.io-index)", 34 | ] 35 | 36 | [[package]] 37 | name = "epoch" 38 | version = "0.1.0" 39 | dependencies = [ 40 | "crossbeam-epoch 0.8.0 (registry+https://github.com/rust-lang/crates.io-index)", 41 | ] 42 | 43 | [[package]] 44 | name = "lazy_static" 45 | version = "1.4.0" 46 | source = "registry+https://github.com/rust-lang/crates.io-index" 47 | 48 | [[package]] 49 | name = "memoffset" 50 | version = "0.5.3" 51 | source = "registry+https://github.com/rust-lang/crates.io-index" 52 | dependencies = [ 53 | "rustc_version 0.2.3 (registry+https://github.com/rust-lang/crates.io-index)", 54 | ] 55 | 56 | [[package]] 57 | name = "rustc_version" 58 | version = "0.2.3" 59 | source = "registry+https://github.com/rust-lang/crates.io-index" 60 | dependencies = [ 61 | "semver 0.9.0 (registry+https://github.com/rust-lang/crates.io-index)", 62 | ] 63 | 64 | [[package]] 65 | name = "scopeguard" 66 | version = "1.0.0" 67 | source = "registry+https://github.com/rust-lang/crates.io-index" 68 | 69 | [[package]] 70 | name = "semver" 71 | version = "0.9.0" 72 | source = "registry+https://github.com/rust-lang/crates.io-index" 73 | dependencies = [ 74 | "semver-parser 0.7.0 (registry+https://github.com/rust-lang/crates.io-index)", 75 | ] 76 | 77 | [[package]] 78 | name = "semver-parser" 79 | version = "0.7.0" 80 | source = "registry+https://github.com/rust-lang/crates.io-index" 81 | 82 | [metadata] 83 | "checksum autocfg 0.1.7 (registry+https://github.com/rust-lang/crates.io-index)" = "1d49d90015b3c36167a20fe2810c5cd875ad504b39cff3d4eae7977e6b7c1cb2" 84 | "checksum cfg-if 0.1.10 
(registry+https://github.com/rust-lang/crates.io-index)" = "4785bdd1c96b2a846b2bd7cc02e86b6b3dbf14e7e53446c4f54c92a361040822" 85 | "checksum crossbeam-epoch 0.8.0 (registry+https://github.com/rust-lang/crates.io-index)" = "5064ebdbf05ce3cb95e45c8b086f72263f4166b29b97f6baff7ef7fe047b55ac" 86 | "checksum crossbeam-utils 0.7.0 (registry+https://github.com/rust-lang/crates.io-index)" = "ce446db02cdc3165b94ae73111e570793400d0794e46125cc4056c81cbb039f4" 87 | "checksum lazy_static 1.4.0 (registry+https://github.com/rust-lang/crates.io-index)" = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646" 88 | "checksum memoffset 0.5.3 (registry+https://github.com/rust-lang/crates.io-index)" = "75189eb85871ea5c2e2c15abbdd541185f63b408415e5051f5cac122d8c774b9" 89 | "checksum rustc_version 0.2.3 (registry+https://github.com/rust-lang/crates.io-index)" = "138e3e0acb6c9fb258b19b67cb8abd63c00679d2851805ea151465464fe9030a" 90 | "checksum scopeguard 1.0.0 (registry+https://github.com/rust-lang/crates.io-index)" = "b42e15e59b18a828bbf5c58ea01debb36b9b096346de35d941dcb89009f24a0d" 91 | "checksum semver 0.9.0 (registry+https://github.com/rust-lang/crates.io-index)" = "1d7eb9ef2c18661902cc47e535f9bc51b78acd254da71d375c2f6720d9a40403" 92 | "checksum semver-parser 0.7.0 (registry+https://github.com/rust-lang/crates.io-index)" = "388a1df253eca08550bef6c72392cfe7c30914bf41df5269b68cbd6ff8f570a3" 93 | -------------------------------------------------------------------------------- /src/oneshot.rs: -------------------------------------------------------------------------------- 1 | use std::{ 2 | future::Future, 3 | pin::Pin, 4 | task::{Context, Poll, Waker}, 5 | time::{Duration, Instant}, 6 | }; 7 | 8 | use parking_lot::{Condvar, Mutex}; 9 | 10 | use crate::Arc; 11 | 12 | #[derive(Debug)] 13 | struct OneShotState { 14 | filled: bool, 15 | fused: bool, 16 | item: Option, 17 | waker: Option, 18 | } 19 | 20 | impl Default for OneShotState { 21 | fn default() -> OneShotState { 22 | OneShotState { filled: false, fused: false, item: None, waker: None } 23 | } 24 | } 25 | 26 | /// A Future value which may or may not be filled 27 | #[derive(Debug)] 28 | pub struct OneShot { 29 | mu: Arc>>, 30 | cv: Arc, 31 | } 32 | 33 | /// The completer side of the Future 34 | pub struct OneShotFiller { 35 | mu: Arc>>, 36 | cv: Arc, 37 | } 38 | 39 | impl OneShot { 40 | /// Create a new `OneShotFiller` and the `OneShot` 41 | /// that will be filled by its completion. 42 | pub fn pair() -> (OneShotFiller, Self) { 43 | let mu = Arc::new(Mutex::new(OneShotState::default())); 44 | let cv = Arc::new(Condvar::new()); 45 | let future = Self { mu: mu.clone(), cv: cv.clone() }; 46 | let filler = OneShotFiller { mu, cv }; 47 | 48 | (filler, future) 49 | } 50 | 51 | /// Block on the `OneShot`'s completion 52 | /// or dropping of the `OneShotFiller` 53 | pub fn wait(self) -> Option { 54 | let mut inner = self.mu.lock(); 55 | while !inner.filled { 56 | self.cv.wait(&mut inner); 57 | } 58 | inner.item.take() 59 | } 60 | 61 | /// Block on the `OneShot`'s completion 62 | /// or dropping of the `OneShotFiller`, 63 | /// returning an error if not filled 64 | /// before a given timeout or if the 65 | /// system shuts down before then. 
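    ///
    /// A minimal illustrative sketch (not part of the original source),
    /// assuming the crate-internal `OneShot::pair` constructor shown above:
    /// a filler thread completes the promise that this method waits on.
    ///
    /// ```ignore
    /// let (filler, oneshot) = OneShot::pair();
    /// std::thread::spawn(move || filler.fill(42));
    /// let res = oneshot.wait_timeout(std::time::Duration::from_secs(1));
    /// assert_eq!(res, Ok(42));
    /// ```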
66 | pub fn wait_timeout( 67 | self, 68 | mut timeout: Duration, 69 | ) -> Result { 70 | let mut inner = self.mu.lock(); 71 | while !inner.filled { 72 | let start = Instant::now(); 73 | let res = self.cv.wait_for(&mut inner, timeout); 74 | if res.timed_out() { 75 | return Err(std::sync::mpsc::RecvTimeoutError::Disconnected); 76 | } 77 | timeout = 78 | if let Some(timeout) = timeout.checked_sub(start.elapsed()) { 79 | timeout 80 | } else { 81 | Duration::from_nanos(0) 82 | }; 83 | } 84 | if let Some(item) = inner.item.take() { 85 | Ok(item) 86 | } else { 87 | Err(std::sync::mpsc::RecvTimeoutError::Disconnected) 88 | } 89 | } 90 | } 91 | 92 | impl Future for OneShot { 93 | type Output = Option; 94 | 95 | fn poll(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll { 96 | let mut state = self.mu.lock(); 97 | if state.fused { 98 | return Poll::Pending; 99 | } 100 | if state.filled { 101 | state.fused = true; 102 | Poll::Ready(state.item.take()) 103 | } else { 104 | state.waker = Some(cx.waker().clone()); 105 | Poll::Pending 106 | } 107 | } 108 | } 109 | 110 | impl OneShotFiller { 111 | /// Complete the `OneShot` 112 | pub fn fill(self, inner: T) { 113 | let mut state = self.mu.lock(); 114 | 115 | if let Some(waker) = state.waker.take() { 116 | waker.wake(); 117 | } 118 | 119 | state.filled = true; 120 | state.item = Some(inner); 121 | 122 | // having held the mutex makes this linearized 123 | // with the notify below. 124 | drop(state); 125 | 126 | let _notified = self.cv.notify_all(); 127 | } 128 | } 129 | 130 | impl Drop for OneShotFiller { 131 | fn drop(&mut self) { 132 | let mut state = self.mu.lock(); 133 | 134 | if state.filled { 135 | return; 136 | } 137 | 138 | if let Some(waker) = state.waker.take() { 139 | waker.wake(); 140 | } 141 | 142 | state.filled = true; 143 | 144 | // having held the mutex makes this linearized 145 | // with the notify below. 146 | drop(state); 147 | 148 | let _notified = self.cv.notify_all(); 149 | } 150 | } 151 | -------------------------------------------------------------------------------- /src/sys_limits.rs: -------------------------------------------------------------------------------- 1 | #![allow(unsafe_code)] 2 | 3 | #[cfg(any(target_os = "linux", target_os = "macos"))] 4 | use std::io; 5 | #[cfg(any(target_os = "linux"))] 6 | use {std::fs::File, std::io::Read}; 7 | 8 | /// See the Kernel's documentation for more information about this subsystem, 9 | /// found at: [Documentation/cgroup-v1/memory.txt](https://www.kernel.org/doc/Documentation/cgroup-v1/memory.txt) 10 | /// 11 | /// If there's no memory limit specified on the container this may return 12 | /// 0x7FFFFFFFFFFFF000 (2^63-1 rounded down to 4k which is a common page size). 13 | /// So we know we are not running in a memory restricted environment. 14 | #[cfg(target_os = "linux")] 15 | fn get_cgroup_memory_limit() -> io::Result { 16 | File::open("/sys/fs/cgroup/memory/memory.limit_in_bytes") 17 | .and_then(read_u64_from) 18 | } 19 | 20 | #[cfg(target_os = "linux")] 21 | fn read_u64_from(mut file: File) -> io::Result { 22 | let mut s = String::new(); 23 | file.read_to_string(&mut s).and_then(|_| { 24 | s.trim() 25 | .parse() 26 | .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e)) 27 | }) 28 | } 29 | 30 | /// Returns the maximum size of total available memory of the process, in bytes. 31 | /// If this limit is exceeded, the malloc() and mmap() functions shall fail with 32 | /// errno set to [ENOMEM]. 
33 | #[cfg(any(target_os = "linux", target_os = "macos"))] 34 | fn get_rlimit_as() -> io::Result<libc::rlimit> { 35 | let mut limit = std::mem::MaybeUninit::<libc::rlimit>::uninit(); 36 | 37 | let ret = unsafe { libc::getrlimit(libc::RLIMIT_AS, limit.as_mut_ptr()) }; 38 | 39 | if ret == 0 { 40 | Ok(unsafe { limit.assume_init() }) 41 | } else { 42 | Err(io::Error::last_os_error()) 43 | } 44 | } 45 | 46 | #[cfg(any(target_os = "linux", target_os = "macos"))] 47 | pub fn get_available_memory() -> io::Result<u64> { 48 | use std::convert::TryFrom; 49 | 50 | let pages = unsafe { libc::sysconf(libc::_SC_PHYS_PAGES) }; 51 | if pages == -1 { 52 | return Err(io::Error::last_os_error()); 53 | } 54 | 55 | let page_size = unsafe { libc::sysconf(libc::_SC_PAGE_SIZE) }; 56 | if page_size == -1 { 57 | return Err(io::Error::last_os_error()); 58 | } 59 | 60 | Ok(u64::try_from(pages).unwrap() * u64::try_from(page_size).unwrap()) 61 | } 62 | 63 | pub fn get_memory_limit() -> u64 { 64 | // Maximum addressable memory space limit in u64 65 | static MAX_USIZE: u64 = usize::max_value() as u64; 66 | 67 | let mut max: u64 = 0; 68 | 69 | #[cfg(target_os = "linux")] 70 | { 71 | if let Ok(mem) = get_cgroup_memory_limit() { 72 | max = mem; 73 | } 74 | 75 | // If there's no memory limit specified on the container this 76 | // actually returns 0x7FFFFFFFFFFFF000 (2^63-1 rounded down to 77 | // 4k which is a common page size). So we know we are not 78 | // running in a memory restricted environment. 79 | // src: https://github.com/dotnet/coreclr/blob/master/src/pal/src/misc/cgroup.cpp#L385-L428 80 | if max > 0x7FFF_FFFF_0000_0000 { 81 | return 0; 82 | } 83 | } 84 | 85 | #[cfg(any(target_os = "linux", target_os = "macos"))] 86 | { 87 | if let Ok(rlim) = get_rlimit_as() { 88 | let rlim_cur = Into::<u64>::into(rlim.rlim_cur); 89 | if rlim_cur < max || max == 0 { 90 | max = rlim_cur; 91 | } 92 | } 93 | 94 | if let Ok(available) = get_available_memory() { 95 | if available < max || max == 0 { 96 | max = available; 97 | } 98 | } 99 | } 100 | 101 | if max > MAX_USIZE { 102 | // It is observed in practice that when memory is unrestricted, the 103 | // Linux control group returns a physical limit that is bigger than 104 | // the address space 105 | max = MAX_USIZE; 106 | } 107 | 108 | #[cfg(miri)] 109 | { 110 | // Miri has a significant memory consumption overhead. During a small 111 | // test run, a memory amplification of ~35x was observed. Certain 112 | // memory overheads may increase asymptotically with longer test runs, 113 | // such as the interpreter's dead_alloc_map. Memory overhead is 114 | // dominated by stacked borrows tags; the asymptotic behavior of this 115 | // overhead needs further investigation. 116 | max /= 40; 117 | } 118 | 119 | max 120 | } 121 | -------------------------------------------------------------------------------- /src/doc/motivating_experiences/mod.rs: -------------------------------------------------------------------------------- 1 | //!

2 | //! 3 | //!

4 | //! 5 | //! # Experiences with Other Systems 6 | //! 7 | //! sled is motivated by the experiences gained while working with other 8 | //! stateful systems, outlined below. 9 | //! 10 | //! Most of the points below are learned from being burned, rather than 11 | //! delighted. 12 | //! 13 | //! #### MySQL 14 | //! 15 | //! * make it easy to tail the replication stream in flexible topologies 16 | //! * support merging shards a la MariaDB 17 | //! * support mechanisms for live, lock-free schema updates a la 18 | //! pt-online-schema-change 19 | //! * include GTID in all replication information 20 | //! * actively reduce tree fragmentation 21 | //! * give operators and distributed database creators first-class support for 22 | //! replication, sharding, backup, tuning, and diagnosis 23 | //! * O_DIRECT + real linux AIO is worth the effort 24 | //! 25 | //! #### Redis 26 | //! 27 | //! * provide high-level collections that let engineers get to their business 28 | //! logic as quickly as possible instead of forcing them to define a schema in 29 | //! a relational system (usually spending an hour+ googling how to even do it) 30 | //! * don't let single slow requests block all other requests to a shard 31 | //! * let operators peer into the sequence of operations that hit the database 32 | //! to track down bad usage 33 | //! * don't force replicas to retrieve the entire state of the leader when they 34 | //! begin replication 35 | //! 36 | //! #### HBase 37 | //! 38 | //! * don't split "the source of truth" across too many decoupled systems or you 39 | //! will always have downtime 40 | //! * give users first-class APIs to peer into their system state without 41 | //! forcing them to write scrapers 42 | //! * serve http pages for high-level overviews and possibly log access 43 | //! * coprocessors are awesome but people should have easy ways of doing 44 | //! secondary indexing 45 | //! 46 | //! #### RocksDB 47 | //! 48 | //! * give users tons of flexibility with different usage patterns 49 | //! * don't force users to use distributed machine learning to discover 50 | //! configurations that work for their use cases 51 | //! * merge operators are extremely powerful 52 | //! * merge operators should be usable from serial transactions across multiple 53 | //! keys 54 | //! 55 | //! #### etcd 56 | //! 57 | //! * raft makes operating replicated systems SO MUCH EASIER than popular 58 | //! relational systems / redis etc... 59 | //! * modify raft to use leader leases instead of using the paxos register, 60 | //! avoiding livelocks in the presence of simple partitions 61 | //! * give users flexible interfaces 62 | //! * reactive semantics are awesome, but access must be done through smart 63 | //! clients, because users will assume watches are reliable 64 | //! * if we have smart clients anyway, quorum reads can be cheap by 65 | //! lower-bounding future reads to the raft id last observed 66 | //! * expose the metrics and operational levers required to build a self-driving 67 | //! stateful system on top of k8s/mesos/cloud providers/etc... 68 | //! 69 | //! #### Tendermint 70 | //! 71 | //! * build things in a testable way from the beginning 72 | //! * don't seek gratuitous concurrency 73 | //! * allow replication streams to be used in flexible ways 74 | //! * instant finality (or interface finality, the thing should be done by the 75 | //! time the request successfully returns to the client) is mandatory for nice 76 | //! high-level interfaces that don't push optimism (and rollbacks) into 77 | //! 
interfacing systems 78 | //! 79 | //! #### LMDB 80 | //! 81 | //! * approach a wait-free tree traversal for reads 82 | //! * use modern tree structures that can support concurrent writers 83 | //! * multi-process is nice for browsers etc... 84 | //! * people value read performance and are often forgiving of terrible write 85 | //! performance for most workloads 86 | //! 87 | //! #### Zookeeper 88 | //! * reactive semantics are awesome, but access must be done through smart 89 | //! clients, because users will assume watches are reliable 90 | //! * the more important the system, the more you should keep old snapshots 91 | //! around for emergency recovery 92 | //! * never assume a hostname that was resolvable in the past will be resolvable 93 | //! in the future 94 | //! * if a critical thread dies, bring down the entire system 95 | //! * make replication configuration as simple as possible. people will mess up 96 | //! the order and cause split brains if this is not automated. 97 | -------------------------------------------------------------------------------- /src/pagecache/reservation.rs: -------------------------------------------------------------------------------- 1 | use crate::{pagecache::*, *}; 2 | 3 | /// A pending log reservation which can be aborted or completed. 4 | /// NB the holder should quickly call `complete` or `abort` as 5 | /// taking too long to decide will cause the underlying IO 6 | /// buffer to become blocked. 7 | #[derive(Debug)] 8 | pub struct Reservation<'a> { 9 | pub(super) log: &'a Log, 10 | pub(super) iobuf: Arc, 11 | pub(super) buf: &'a mut [u8], 12 | pub(super) flushed: bool, 13 | pub pointer: DiskPtr, 14 | pub lsn: Lsn, 15 | pub(super) is_heap_item_rewrite: bool, 16 | pub(super) header_len: usize, 17 | } 18 | 19 | impl<'a> Drop for Reservation<'a> { 20 | fn drop(&mut self) { 21 | // We auto-abort if the user never uses a reservation. 22 | if !self.flushed { 23 | if let Err(e) = self.flush(false) { 24 | self.log.config.set_global_error(e); 25 | } 26 | } 27 | } 28 | } 29 | 30 | impl<'a> Reservation<'a> { 31 | /// Cancel the reservation, placing a failed flush on disk, returning 32 | /// the (cancelled) log sequence number and file offset. 33 | pub fn abort(mut self) -> Result<(Lsn, DiskPtr)> { 34 | if self.pointer.is_heap_item() && !self.is_heap_item_rewrite { 35 | // we can instantly free this heap item because its pointer 36 | // is assumed to have failed to have been installed into 37 | // the pagetable, so we can assume nobody is operating 38 | // on it. 39 | 40 | trace!( 41 | "removing heap item for aborted reservation at lsn {}", 42 | self.pointer 43 | ); 44 | 45 | self.log.config.heap.free(self.pointer.heap_id().unwrap()); 46 | } 47 | 48 | self.flush(false) 49 | } 50 | 51 | /// Complete the reservation, placing the buffer on disk. returns 52 | /// the log sequence number of the write, and the file offset. 53 | pub fn complete(mut self) -> Result<(Lsn, DiskPtr)> { 54 | self.flush(true) 55 | } 56 | 57 | /// Returns the length of the on-log reservation. 58 | pub(crate) fn reservation_len(&self) -> usize { 59 | self.buf.len() 60 | } 61 | 62 | /// Refills the reservation buffer with new data. 63 | /// Must supply a buffer of an identical length 64 | /// as the one initially provided. Don't use this 65 | /// on messages subject to compression etc... 66 | /// 67 | /// # Panics 68 | /// 69 | /// Will panic if the reservation is not the correct 70 | /// size to hold a serialized Lsn. 
71 | #[doc(hidden)] 72 | pub fn mark_writebatch(self, peg_lsn: Lsn) -> Result<(Lsn, DiskPtr)> { 73 | trace!( 74 | "writing batch required stable lsn {} into \ 75 | BatchManifest at lid {:?} peg_lsn {}", 76 | peg_lsn, 77 | self.pointer.lid(), 78 | self.lsn 79 | ); 80 | 81 | if self.lsn == peg_lsn { 82 | // this can happen because high-level tree updates 83 | // may result in no work happening. 84 | self.abort() 85 | } else { 86 | self.buf[4] = MessageKind::BatchManifest.into(); 87 | 88 | let buf = lsn_to_arr(peg_lsn); 89 | 90 | let dst = &mut self.buf[self.header_len..]; 91 | 92 | dst.copy_from_slice(&buf); 93 | 94 | let mut intervals = self.log.iobufs.intervals.lock(); 95 | intervals.mark_batch((self.lsn, peg_lsn)); 96 | drop(intervals); 97 | 98 | self.complete() 99 | } 100 | } 101 | 102 | fn flush(&mut self, valid: bool) -> Result<(Lsn, DiskPtr)> { 103 | if self.flushed { 104 | panic!("flushing already-flushed reservation!"); 105 | } 106 | 107 | self.flushed = true; 108 | 109 | if !valid { 110 | // don't actually zero the message, still check its hash 111 | // on recovery to find corruption. 112 | self.buf[4] = MessageKind::Canceled.into(); 113 | } 114 | 115 | let crc32 = calculate_message_crc32( 116 | self.buf[..self.header_len].as_ref(), 117 | &self.buf[self.header_len..], 118 | ); 119 | let crc32_arr = u32_to_arr(crc32); 120 | 121 | #[allow(unsafe_code)] 122 | unsafe { 123 | std::ptr::copy_nonoverlapping( 124 | crc32_arr.as_ptr(), 125 | self.buf.as_mut_ptr(), 126 | std::mem::size_of::(), 127 | ); 128 | } 129 | self.log.exit_reservation(&self.iobuf)?; 130 | 131 | Ok((self.lsn, self.pointer)) 132 | } 133 | } 134 | -------------------------------------------------------------------------------- /benchmarks/criterion/benches/sled.rs: -------------------------------------------------------------------------------- 1 | use criterion::{criterion_group, criterion_main, Criterion}; 2 | 3 | use jemallocator::Jemalloc; 4 | 5 | use sled::Config; 6 | 7 | #[cfg_attr( 8 | // only enable jemalloc on linux and macos by default 9 | any(target_os = "linux", target_os = "macos"), 10 | global_allocator 11 | )] 12 | static ALLOC: Jemalloc = Jemalloc; 13 | 14 | fn counter() -> usize { 15 | use std::sync::atomic::{AtomicUsize, Ordering::Relaxed}; 16 | 17 | static C: AtomicUsize = AtomicUsize::new(0); 18 | 19 | C.fetch_add(1, Relaxed) 20 | } 21 | 22 | /// Generates a random number in `0..n`. 23 | fn random(n: u32) -> u32 { 24 | use std::cell::Cell; 25 | use std::num::Wrapping; 26 | 27 | thread_local! { 28 | static RNG: Cell> = Cell::new(Wrapping(1406868647)); 29 | } 30 | 31 | RNG.with(|rng| { 32 | // This is the 32-bit variant of Xorshift. 33 | // 34 | // Source: https://en.wikipedia.org/wiki/Xorshift 35 | let mut x = rng.get(); 36 | x ^= x << 13; 37 | x ^= x >> 17; 38 | x ^= x << 5; 39 | rng.set(x); 40 | 41 | // This is a fast alternative to `x % n`. 
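        //
        // (Illustrative note added for clarity, not in the original
        // comment: the multiply-shift treats x / 2^32 as a fraction in
        // [0, 1) and scales it by n. For example, with n = 10 and
        // x = 0x8000_0000, ((x as u64) * 10) >> 32 == 5, so the
        // midpoint of the u32 range lands in bucket 5 of 10.)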
42 | // 43 | // Author: Daniel Lemire 44 | // Source: https://lemire.me/blog/2016/06/27/a-fast-alternative-to-the-modulo-reduction/ 45 | ((x.0 as u64).wrapping_mul(n as u64) >> 32) as u32 46 | }) 47 | } 48 | 49 | fn sled_bulk_load(c: &mut Criterion) { 50 | let mut count = 0_u32; 51 | let mut bytes = |len| -> Vec { 52 | count += 1; 53 | count.to_be_bytes().into_iter().cycle().take(len).copied().collect() 54 | }; 55 | 56 | let mut bench = |key_len, val_len| { 57 | let db = Config::new() 58 | .path(format!("bulk_k{}_v{}", key_len, val_len)) 59 | .temporary(true) 60 | .flush_every_ms(None) 61 | .open() 62 | .unwrap(); 63 | 64 | c.bench_function( 65 | &format!("bulk load key/value lengths {}/{}", key_len, val_len), 66 | |b| { 67 | b.iter(|| { 68 | db.insert(bytes(key_len), bytes(val_len)).unwrap(); 69 | }) 70 | }, 71 | ); 72 | }; 73 | 74 | for key_len in &[10_usize, 128, 256, 512] { 75 | for val_len in &[0_usize, 10, 128, 256, 512, 1024, 2048, 4096, 8192] { 76 | bench(*key_len, *val_len) 77 | } 78 | } 79 | } 80 | 81 | fn sled_monotonic_crud(c: &mut Criterion) { 82 | let db = Config::new().temporary(true).flush_every_ms(None).open().unwrap(); 83 | 84 | c.bench_function("monotonic inserts", |b| { 85 | let mut count = 0_u32; 86 | b.iter(|| { 87 | count += 1; 88 | db.insert(count.to_be_bytes(), vec![]).unwrap(); 89 | }) 90 | }); 91 | 92 | c.bench_function("monotonic gets", |b| { 93 | let mut count = 0_u32; 94 | b.iter(|| { 95 | count += 1; 96 | db.get(count.to_be_bytes()).unwrap(); 97 | }) 98 | }); 99 | 100 | c.bench_function("monotonic removals", |b| { 101 | let mut count = 0_u32; 102 | b.iter(|| { 103 | count += 1; 104 | db.remove(count.to_be_bytes()).unwrap(); 105 | }) 106 | }); 107 | } 108 | 109 | fn sled_random_crud(c: &mut Criterion) { 110 | const SIZE: u32 = 65536; 111 | 112 | let db = Config::new().temporary(true).flush_every_ms(None).open().unwrap(); 113 | 114 | c.bench_function("random inserts", |b| { 115 | b.iter(|| { 116 | let k = random(SIZE).to_be_bytes(); 117 | db.insert(k, vec![]).unwrap(); 118 | }) 119 | }); 120 | 121 | c.bench_function("random gets", |b| { 122 | b.iter(|| { 123 | let k = random(SIZE).to_be_bytes(); 124 | db.get(k).unwrap(); 125 | }) 126 | }); 127 | 128 | c.bench_function("random removals", |b| { 129 | b.iter(|| { 130 | let k = random(SIZE).to_be_bytes(); 131 | db.remove(k).unwrap(); 132 | }) 133 | }); 134 | } 135 | 136 | fn sled_empty_opens(c: &mut Criterion) { 137 | let _ = std::fs::remove_dir_all("empty_opens"); 138 | c.bench_function("empty opens", |b| { 139 | b.iter(|| { 140 | Config::new() 141 | .path(format!("empty_opens/{}.db", counter())) 142 | .flush_every_ms(None) 143 | .open() 144 | .unwrap() 145 | }) 146 | }); 147 | let _ = std::fs::remove_dir_all("empty_opens"); 148 | } 149 | 150 | criterion_group!( 151 | benches, 152 | sled_bulk_load, 153 | sled_monotonic_crud, 154 | sled_random_crud, 155 | sled_empty_opens 156 | ); 157 | criterion_main!(benches); 158 | -------------------------------------------------------------------------------- /src/dll.rs: -------------------------------------------------------------------------------- 1 | #![allow(unsafe_code)] 2 | 3 | use std::ptr; 4 | 5 | use crate::PageId; 6 | 7 | /// A simple doubly linked list for use in the `Lru` 8 | #[derive(Debug)] 9 | pub(crate) struct Node { 10 | inner: PageId, 11 | next: *mut Node, 12 | prev: *mut Node, 13 | } 14 | 15 | impl Node { 16 | fn unwire(&mut self) { 17 | unsafe { 18 | if !self.prev.is_null() { 19 | (*self.prev).next = self.next; 20 | } 21 | 22 | if !self.next.is_null() { 
23 | (*self.next).prev = self.prev; 24 | } 25 | } 26 | 27 | self.next = ptr::null_mut(); 28 | self.prev = ptr::null_mut(); 29 | } 30 | } 31 | 32 | /// A simple non-cyclical doubly linked 33 | /// list where items can be efficiently 34 | /// removed from the middle, for the purposes 35 | /// of backing an LRU cache. 36 | pub struct DoublyLinkedList { 37 | head: *mut Node, 38 | tail: *mut Node, 39 | len: usize, 40 | } 41 | 42 | unsafe impl Send for DoublyLinkedList {} 43 | 44 | impl Drop for DoublyLinkedList { 45 | fn drop(&mut self) { 46 | let mut cursor = self.head; 47 | while !cursor.is_null() { 48 | unsafe { 49 | let node = Box::from_raw(cursor); 50 | 51 | // don't need to check for cycles 52 | // because this Dll is non-cyclical 53 | cursor = node.prev; 54 | 55 | // this happens without the manual drop, 56 | // but we keep it for explicitness 57 | drop(node); 58 | } 59 | } 60 | } 61 | } 62 | 63 | impl Default for DoublyLinkedList { 64 | fn default() -> Self { 65 | Self { head: ptr::null_mut(), tail: ptr::null_mut(), len: 0 } 66 | } 67 | } 68 | 69 | impl DoublyLinkedList { 70 | pub(crate) const fn len(&self) -> usize { 71 | self.len 72 | } 73 | 74 | pub(crate) fn push_head(&mut self, item: PageId) -> *mut Node { 75 | self.len += 1; 76 | 77 | let node = Node { inner: item, next: ptr::null_mut(), prev: self.head }; 78 | 79 | let ptr = Box::into_raw(Box::new(node)); 80 | 81 | self.push_head_ptr(ptr) 82 | } 83 | 84 | fn push_head_ptr(&mut self, ptr: *mut Node) -> *mut Node { 85 | if !self.head.is_null() { 86 | unsafe { 87 | (*self.head).next = ptr; 88 | (*ptr).prev = self.head; 89 | } 90 | } 91 | 92 | if self.tail.is_null() { 93 | self.tail = ptr; 94 | } 95 | 96 | self.head = ptr; 97 | 98 | ptr 99 | } 100 | 101 | #[cfg(test)] 102 | pub(crate) fn push_tail(&mut self, item: PageId) { 103 | self.len += 1; 104 | 105 | let node = Node { inner: item, next: self.tail, prev: ptr::null_mut() }; 106 | 107 | let ptr = Box::into_raw(Box::new(node)); 108 | 109 | if !self.tail.is_null() { 110 | unsafe { 111 | (*self.tail).prev = ptr; 112 | } 113 | } 114 | 115 | if self.head.is_null() { 116 | self.head = ptr; 117 | } 118 | 119 | self.tail = ptr; 120 | } 121 | 122 | pub(crate) fn promote(&mut self, ptr: *mut Node) -> *mut Node { 123 | if self.head == ptr { 124 | return ptr; 125 | } 126 | 127 | unsafe { 128 | if self.tail == ptr { 129 | self.tail = (*ptr).next; 130 | } 131 | 132 | if self.head == ptr { 133 | self.head = (*ptr).prev; 134 | } 135 | 136 | (*ptr).unwire(); 137 | 138 | self.push_head_ptr(ptr) 139 | } 140 | } 141 | 142 | #[cfg(test)] 143 | pub(crate) fn pop_head(&mut self) -> Option { 144 | if self.head.is_null() { 145 | return None; 146 | } 147 | 148 | self.len -= 1; 149 | 150 | unsafe { 151 | let mut head = Box::from_raw(self.head); 152 | 153 | if self.head == self.tail { 154 | self.tail = ptr::null_mut(); 155 | } 156 | 157 | self.head = head.prev; 158 | 159 | head.unwire(); 160 | 161 | Some(head.inner) 162 | } 163 | } 164 | 165 | pub(crate) fn pop_tail(&mut self) -> Option { 166 | if self.tail.is_null() { 167 | return None; 168 | } 169 | 170 | self.len -= 1; 171 | 172 | unsafe { 173 | let mut tail = Box::from_raw(self.tail); 174 | 175 | if self.head == self.tail { 176 | self.head = ptr::null_mut(); 177 | } 178 | 179 | self.tail = tail.next; 180 | 181 | tail.unwire(); 182 | 183 | Some(tail.inner) 184 | } 185 | } 186 | 187 | #[cfg(test)] 188 | pub(crate) fn into_vec(mut self) -> Vec { 189 | let mut res = vec![]; 190 | while let Some(val) = self.pop_head() { 191 | res.push(val); 192 | } 193 | res 
194 | } 195 | } 196 | 197 | #[allow(unused_results)] 198 | #[test] 199 | fn basic_functionality() { 200 | let mut dll = DoublyLinkedList::default(); 201 | dll.push_head(5); 202 | dll.push_tail(6); 203 | dll.push_head(4); 204 | dll.push_tail(7); 205 | dll.push_tail(8); 206 | dll.push_head(3); 207 | dll.push_tail(9); 208 | dll.push_head(2); 209 | dll.push_head(1); 210 | assert_eq!(dll.len(), 9); 211 | assert_eq!(dll.into_vec(), vec![1, 2, 3, 4, 5, 6, 7, 8, 9]); 212 | } 213 | -------------------------------------------------------------------------------- /src/flusher.rs: -------------------------------------------------------------------------------- 1 | use std::thread; 2 | use std::time::Duration; 3 | 4 | use parking_lot::{Condvar, Mutex}; 5 | 6 | use super::*; 7 | 8 | #[derive(Debug, Clone, Copy)] 9 | pub(crate) enum ShutdownState { 10 | Running, 11 | ShuttingDown, 12 | ShutDown, 13 | } 14 | 15 | impl ShutdownState { 16 | fn is_running(self) -> bool { 17 | if let ShutdownState::Running = self { true } else { false } 18 | } 19 | 20 | fn is_shutdown(self) -> bool { 21 | if let ShutdownState::ShutDown = self { true } else { false } 22 | } 23 | } 24 | 25 | #[derive(Debug)] 26 | pub(crate) struct Flusher { 27 | shutdown: Arc>, 28 | sc: Arc, 29 | join_handle: Mutex>>, 30 | } 31 | 32 | impl Flusher { 33 | /// Spawns a thread that periodically calls `callback` until dropped. 34 | pub(crate) fn new( 35 | name: String, 36 | pagecache: PageCache, 37 | flush_every_ms: u64, 38 | ) -> Self { 39 | #[allow(clippy::mutex_atomic)] // mutex used in CondVar below 40 | let shutdown = Arc::new(Mutex::new(ShutdownState::Running)); 41 | let sc = Arc::new(Condvar::new()); 42 | 43 | let join_handle = thread::Builder::new() 44 | .name(name) 45 | .spawn({ 46 | let shutdown = shutdown.clone(); 47 | let sc = sc.clone(); 48 | move || run(&shutdown, &sc, &pagecache, flush_every_ms) 49 | }) 50 | .unwrap(); 51 | 52 | Self { shutdown, sc, join_handle: Mutex::new(Some(join_handle)) } 53 | } 54 | } 55 | 56 | fn run( 57 | shutdown: &Arc>, 58 | sc: &Arc, 59 | pagecache: &PageCache, 60 | flush_every_ms: u64, 61 | ) { 62 | let flush_every = Duration::from_millis(flush_every_ms); 63 | let mut shutdown = shutdown.lock(); 64 | let mut wrote_data = false; 65 | while shutdown.is_running() || wrote_data { 66 | let before = std::time::Instant::now(); 67 | let cc = concurrency_control::read(); 68 | match pagecache.log.roll_iobuf() { 69 | Ok(0) => { 70 | wrote_data = false; 71 | if !shutdown.is_running() { 72 | break; 73 | } 74 | } 75 | Ok(_) => { 76 | wrote_data = true; 77 | if !shutdown.is_running() { 78 | // loop right away if we're in 79 | // shutdown mode, to flush data 80 | // more quickly. 81 | continue; 82 | } 83 | } 84 | Err(e) => { 85 | error!("failed to flush from periodic flush thread: {}", e); 86 | 87 | #[cfg(feature = "failpoints")] 88 | pagecache.set_failpoint(e); 89 | 90 | *shutdown = ShutdownState::ShutDown; 91 | 92 | // having held the mutex makes this linearized 93 | // with the notify below. 94 | drop(shutdown); 95 | 96 | let _notified = sc.notify_all(); 97 | return; 98 | } 99 | } 100 | drop(cc); 101 | 102 | // so we can spend a little effort 103 | // cleaning up the segments. try not to 104 | // spend more than half of our sleep 105 | // time rewriting pages though. 
106 | // 107 | // this looks weird because it's a rust-style do-while 108 | // where the conditional is the full body 109 | while { 110 | let made_progress = match pagecache.attempt_gc() { 111 | Err(e) => { 112 | error!( 113 | "failed to clean file from periodic flush thread: {}", 114 | e 115 | ); 116 | 117 | #[cfg(feature = "failpoints")] 118 | pagecache.set_failpoint(e); 119 | 120 | *shutdown = ShutdownState::ShutDown; 121 | 122 | // having held the mutex makes this linearized 123 | // with the notify below. 124 | drop(shutdown); 125 | 126 | let _notified = sc.notify_all(); 127 | return; 128 | } 129 | Ok(false) => false, 130 | Ok(true) => true, 131 | }; 132 | made_progress 133 | && shutdown.is_running() 134 | && before.elapsed() < flush_every / 2 135 | } {} 136 | 137 | if let Err(e) = pagecache.config.file.sync_all() { 138 | error!("failed to fsync from periodic flush thread: {}", e); 139 | } 140 | 141 | let sleep_duration = flush_every 142 | .checked_sub(before.elapsed()) 143 | .unwrap_or_else(|| Duration::from_millis(1)); 144 | 145 | if shutdown.is_running() { 146 | // only sleep before the next flush if we are 147 | // running normally. if we're shutting down, 148 | // flush faster. 149 | sc.wait_for(&mut shutdown, sleep_duration); 150 | } 151 | } 152 | 153 | *shutdown = ShutdownState::ShutDown; 154 | 155 | // having held the mutex makes this linearized 156 | // with the notify below. 157 | drop(shutdown); 158 | 159 | let _notified = sc.notify_all(); 160 | } 161 | 162 | impl Drop for Flusher { 163 | fn drop(&mut self) { 164 | let mut shutdown = self.shutdown.lock(); 165 | if shutdown.is_running() { 166 | *shutdown = ShutdownState::ShuttingDown; 167 | let _notified = self.sc.notify_all(); 168 | } 169 | 170 | #[allow(unused_variables)] 171 | let mut count = 0; 172 | while !shutdown.is_shutdown() { 173 | let _ = self.sc.wait_for(&mut shutdown, Duration::from_millis(100)); 174 | count += 1; 175 | 176 | testing_assert!(count < 5); 177 | } 178 | 179 | let mut join_handle_opt = self.join_handle.lock(); 180 | if let Some(join_handle) = join_handle_opt.take() { 181 | if let Err(e) = join_handle.join() { 182 | error!("error joining Periodic thread: {:?}", e); 183 | } 184 | } 185 | } 186 | } 187 | -------------------------------------------------------------------------------- /src/threadpool.rs: -------------------------------------------------------------------------------- 1 | //! A simple adaptive threadpool that returns a oneshot future. 2 | 3 | use std::{ 4 | collections::VecDeque, 5 | sync::atomic::{ 6 | AtomicBool, 7 | Ordering::{Acquire, Relaxed, Release, SeqCst}, 8 | }, 9 | thread, 10 | time::{Duration, Instant}, 11 | }; 12 | 13 | use parking_lot::{Condvar, Mutex}; 14 | 15 | use crate::{ 16 | debug_delay, warn, AtomicU64, AtomicUsize, Error, Lazy, OneShot, Result, 17 | }; 18 | 19 | // This is lower for CI reasons. 
20 | #[cfg(windows)] 21 | const MAX_THREADS: usize = 16; 22 | 23 | #[cfg(not(windows))] 24 | const MAX_THREADS: usize = 128; 25 | 26 | const DESIRED_WAITING_THREADS: usize = 7; 27 | 28 | static WAITING_THREAD_COUNT: AtomicUsize = AtomicUsize::new(0); 29 | static TOTAL_THREAD_COUNT: AtomicUsize = AtomicUsize::new(0); 30 | static SPAWNS: AtomicUsize = AtomicUsize::new(0); 31 | static SPAWNING: AtomicBool = AtomicBool::new(false); 32 | static SUBMITTED: AtomicU64 = AtomicU64::new(0); 33 | static COMPLETED: AtomicU64 = AtomicU64::new(0); 34 | static QUEUE: Lazy Queue> = Lazy::new(init_queue); 35 | static BROKEN: AtomicBool = AtomicBool::new(false); 36 | 37 | type Work = Box; 38 | 39 | fn init_queue() -> Queue { 40 | debug_delay(); 41 | for _ in 0..DESIRED_WAITING_THREADS { 42 | debug_delay(); 43 | if let Err(e) = spawn_new_thread(true) { 44 | log::error!("failed to initialize threadpool: {:?}", e); 45 | } 46 | } 47 | Queue { cv: Condvar::new(), mu: Mutex::new(VecDeque::new()) } 48 | } 49 | 50 | struct Queue { 51 | cv: Condvar, 52 | mu: Mutex>, 53 | } 54 | 55 | impl Queue { 56 | fn recv_timeout(&self, duration: Duration) -> Option { 57 | let mut queue = self.mu.lock(); 58 | 59 | let cutoff = Instant::now() + duration; 60 | 61 | while queue.is_empty() { 62 | WAITING_THREAD_COUNT.fetch_add(1, SeqCst); 63 | let res = self.cv.wait_until(&mut queue, cutoff); 64 | WAITING_THREAD_COUNT.fetch_sub(1, SeqCst); 65 | if res.timed_out() { 66 | break; 67 | } 68 | } 69 | 70 | queue.pop_front() 71 | } 72 | 73 | fn try_recv(&self) -> Option { 74 | let mut queue = self.mu.lock(); 75 | queue.pop_front() 76 | } 77 | 78 | fn send(&self, work: Work) -> usize { 79 | let mut queue = self.mu.lock(); 80 | queue.push_back(work); 81 | 82 | let len = queue.len(); 83 | 84 | // having held the mutex makes this linearized 85 | // with the notify below. 86 | drop(queue); 87 | 88 | self.cv.notify_all(); 89 | 90 | len 91 | } 92 | } 93 | 94 | fn perform_work(is_immortal: bool) { 95 | let wait_limit = Duration::from_secs(1); 96 | 97 | let mut performed = 0; 98 | let mut contiguous_overshoots = 0; 99 | 100 | while is_immortal || performed < 5 || contiguous_overshoots < 3 { 101 | debug_delay(); 102 | let task_res = QUEUE.recv_timeout(wait_limit); 103 | 104 | if let Some(task) = task_res { 105 | WAITING_THREAD_COUNT.fetch_sub(1, SeqCst); 106 | (task)(); 107 | COMPLETED.fetch_add(1, Release); 108 | WAITING_THREAD_COUNT.fetch_add(1, SeqCst); 109 | performed += 1; 110 | } 111 | 112 | while let Some(task) = QUEUE.try_recv() { 113 | debug_delay(); 114 | WAITING_THREAD_COUNT.fetch_sub(1, SeqCst); 115 | (task)(); 116 | COMPLETED.fetch_add(1, Release); 117 | WAITING_THREAD_COUNT.fetch_add(1, SeqCst); 118 | performed += 1; 119 | } 120 | 121 | debug_delay(); 122 | 123 | let waiting = WAITING_THREAD_COUNT.load(Acquire); 124 | 125 | if waiting > DESIRED_WAITING_THREADS { 126 | contiguous_overshoots += 1; 127 | } else { 128 | contiguous_overshoots = 0; 129 | } 130 | } 131 | } 132 | 133 | // Create up to MAX_THREADS dynamic blocking task worker threads. 134 | // Dynamic threads will terminate themselves if they don't 135 | // receive any work after one second. 
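//
// An illustrative sketch (not in the original source) of how the
// `spawn` function at the bottom of this file is meant to be used from
// inside the crate; `read_block_from_disk` is a hypothetical stand-in
// for whatever blocking work is being offloaded:
//
//     let promise = threadpool::spawn(|| read_block_from_disk())?;
//     let block = promise.wait();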
136 | fn maybe_spawn_new_thread() -> Result<()> { 137 | debug_delay(); 138 | let total_workers = TOTAL_THREAD_COUNT.load(Acquire); 139 | debug_delay(); 140 | let waiting_threads = WAITING_THREAD_COUNT.load(Acquire); 141 | 142 | if waiting_threads >= DESIRED_WAITING_THREADS 143 | || total_workers >= MAX_THREADS 144 | { 145 | return Ok(()); 146 | } 147 | 148 | if SPAWNING.compare_exchange_weak(false, true, Acquire, Acquire).is_ok() { 149 | spawn_new_thread(false)?; 150 | } 151 | 152 | Ok(()) 153 | } 154 | 155 | fn spawn_new_thread(is_immortal: bool) -> Result<()> { 156 | if BROKEN.load(Relaxed) { 157 | return Err(Error::ReportableBug( 158 | "IO thread unexpectedly panicked. please report \ 159 | this bug on the sled github repo." 160 | .to_string(), 161 | )); 162 | } 163 | 164 | let spawn_id = SPAWNS.fetch_add(1, SeqCst); 165 | 166 | TOTAL_THREAD_COUNT.fetch_add(1, SeqCst); 167 | let spawn_res = thread::Builder::new() 168 | .name(format!("sled-io-{}", spawn_id)) 169 | .spawn(move || { 170 | SPAWNING.store(false, SeqCst); 171 | debug_delay(); 172 | let res = std::panic::catch_unwind(|| perform_work(is_immortal)); 173 | TOTAL_THREAD_COUNT.fetch_sub(1, SeqCst); 174 | if is_immortal { 175 | // IO thread panicked, shut down the system 176 | BROKEN.store(true, SeqCst); 177 | panic!( 178 | "IO thread unexpectedly panicked. please report \ 179 | this bug on the sled github repo. error: {:?}", 180 | res 181 | ); 182 | } 183 | }); 184 | 185 | if let Err(e) = spawn_res { 186 | static E: AtomicBool = AtomicBool::new(false); 187 | 188 | SPAWNING.store(false, SeqCst); 189 | 190 | if E.compare_exchange(false, true, Relaxed, Relaxed).is_ok() { 191 | // only execute this once 192 | warn!( 193 | "Failed to dynamically increase the threadpool size: {:?}.", 194 | e, 195 | ) 196 | } 197 | } 198 | 199 | Ok(()) 200 | } 201 | 202 | /// Spawn a function on the threadpool. 203 | pub fn spawn(work: F) -> Result> 204 | where 205 | F: FnOnce() -> R + Send + 'static, 206 | R: Send + 'static + Sized, 207 | { 208 | SUBMITTED.fetch_add(1, Acquire); 209 | let (promise_filler, promise) = OneShot::pair(); 210 | let task = move || { 211 | promise_filler.fill((work)()); 212 | }; 213 | 214 | let depth = QUEUE.send(Box::new(task)); 215 | 216 | if depth > DESIRED_WAITING_THREADS { 217 | maybe_spawn_new_thread()?; 218 | } 219 | 220 | Ok(promise) 221 | } 222 | -------------------------------------------------------------------------------- /scripts/execution_explorer.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/gdb --command 2 | 3 | """ 4 | a simple python GDB script for running multithreaded 5 | programs in a way that is "deterministic enough" 6 | to tease out and replay interesting bugs. 
7 | 8 | Tyler Neely 25 Sept 2017 9 | t@jujit.su 10 | 11 | references: 12 | https://sourceware.org/gdb/onlinedocs/gdb/All_002dStop-Mode.html 13 | https://sourceware.org/gdb/onlinedocs/gdb/Non_002dStop-Mode.html 14 | https://sourceware.org/gdb/onlinedocs/gdb/Threads-In-Python.html 15 | https://sourceware.org/gdb/onlinedocs/gdb/Events-In-Python.html 16 | https://blog.0x972.info/index.php?tag=gdb.py 17 | """ 18 | 19 | import gdb 20 | import random 21 | 22 | ############################################################################### 23 | # config # 24 | ############################################################################### 25 | # set this to a number for reproducing results or None to explore randomly 26 | seed = 156112673742 # None # 951931004895 27 | 28 | # set this to the number of valid threads in the program 29 | # {2, 3} assumes a main thread that waits on 2 workers. 30 | # {1, ... N} assumes all of the first N threads are to be explored 31 | threads_whitelist = {2, 3} 32 | 33 | # set this to the file of the binary to explore 34 | filename = "target/debug/binary" 35 | 36 | # set this to the place the threads should rendezvous before exploring 37 | entrypoint = "src/main.rs:8" 38 | 39 | # set this to after the threads are done 40 | exitpoint = "src/main.rs:12" 41 | 42 | # invariant unreachable points that should never be accessed 43 | unreachable = [ 44 | "panic_unwind::imp::panic" 45 | ] 46 | 47 | # set this to the locations you want to test interleavings for 48 | interesting = [ 49 | "src/main.rs:8", 50 | "src/main.rs:9" 51 | ] 52 | 53 | # uncomment this to output the specific commands issued to gdb 54 | gdb.execute("set trace-commands on") 55 | 56 | ############################################################################### 57 | ############################################################################### 58 | 59 | 60 | class UnreachableBreakpoint(gdb.Breakpoint): 61 | pass 62 | 63 | 64 | class DoneBreakpoint(gdb.Breakpoint): 65 | pass 66 | 67 | 68 | class InterestingBreakpoint(gdb.Breakpoint): 69 | pass 70 | 71 | 72 | class DeterministicExecutor: 73 | def __init__(self, seed=None): 74 | if seed: 75 | print("seeding with", seed) 76 | self.seed = seed 77 | random.seed(seed) 78 | else: 79 | # pick a random new seed if not provided with one 80 | self.reseed() 81 | 82 | gdb.execute("file " + filename) 83 | 84 | # non-stop is necessary to provide thread-specific 85 | # information when breakpoints are hit. 86 | gdb.execute("set non-stop on") 87 | gdb.execute("set confirm off") 88 | 89 | self.ready = set() 90 | self.finished = set() 91 | 92 | def reseed(self): 93 | random.seed() 94 | self.seed = random.randrange(1e12) 95 | print("reseeding with", self.seed) 96 | random.seed(self.seed) 97 | 98 | def restart(self): 99 | # reset inner state 100 | self.ready = set() 101 | self.finished = set() 102 | 103 | # disconnect callbacks 104 | gdb.events.stop.disconnect(self.scheduler_callback) 105 | gdb.events.exited.disconnect(self.exit_callback) 106 | 107 | # nuke all breakpoints 108 | gdb.execute("d") 109 | 110 | # end execution 111 | gdb.execute("k") 112 | 113 | # pick new seed 114 | self.reseed() 115 | 116 | self.run() 117 | 118 | def rendezvous_callback(self, event): 119 | try: 120 | self.ready.add(event.inferior_thread.num) 121 | if len(self.ready) == len(threads_whitelist): 122 | self.run_schedule() 123 | except Exception as e: 124 | # this will be thrown if breakpoint is not a part of event, 125 | # like when the event was stopped for another reason. 
126 | print(e) 127 | 128 | def run(self): 129 | gdb.execute("b " + entrypoint) 130 | 131 | gdb.events.stop.connect(self.rendezvous_callback) 132 | gdb.events.exited.connect(self.exit_callback) 133 | 134 | gdb.execute("r") 135 | 136 | def run_schedule(self): 137 | print("running schedule") 138 | gdb.execute("d") 139 | gdb.events.stop.disconnect(self.rendezvous_callback) 140 | gdb.events.stop.connect(self.scheduler_callback) 141 | 142 | for bp in interesting: 143 | InterestingBreakpoint(bp) 144 | 145 | for bp in unreachable: 146 | UnreachableBreakpoint(bp) 147 | 148 | DoneBreakpoint(exitpoint) 149 | 150 | self.pick() 151 | 152 | def pick(self): 153 | threads = self.runnable_threads() 154 | if not threads: 155 | print("restarting execution after running out of valid threads") 156 | self.restart() 157 | return 158 | 159 | thread = random.choice(threads) 160 | 161 | gdb.execute("t " + str(thread.num)) 162 | gdb.execute("c") 163 | 164 | def scheduler_callback(self, event): 165 | if not isinstance(event, gdb.BreakpointEvent): 166 | print("WTF sched callback got", event.__dict__) 167 | return 168 | 169 | if isinstance(event.breakpoint, DoneBreakpoint): 170 | self.finished.add(event.inferior_thread.num) 171 | elif isinstance(event.breakpoint, UnreachableBreakpoint): 172 | print("!" * 80) 173 | print("unreachable breakpoint triggered with seed", self.seed) 174 | print("!" * 80) 175 | gdb.events.exited.disconnect(self.exit_callback) 176 | gdb.execute("q") 177 | else: 178 | print("thread", event.inferior_thread.num, 179 | "hit breakpoint at", event.breakpoint.location) 180 | 181 | self.pick() 182 | 183 | def runnable_threads(self): 184 | threads = gdb.selected_inferior().threads() 185 | 186 | def f(it): 187 | return (it.is_valid() and not 188 | it.is_exited() and 189 | it.num in threads_whitelist and 190 | it.num not in self.finished) 191 | 192 | good_threads = [it for it in threads if f(it)] 193 | good_threads.sort(key=lambda it: it.num) 194 | 195 | return good_threads 196 | 197 | def exit_callback(self, event): 198 | try: 199 | if event.exit_code != 0: 200 | print("!" * 80) 201 | print("interesting exit with seed", self.seed) 202 | print("!" * 80) 203 | else: 204 | print("happy exit") 205 | self.restart() 206 | 207 | gdb.execute("q") 208 | except Exception as e: 209 | pass 210 | 211 | de = DeterministicExecutor(seed) 212 | de.run() 213 | -------------------------------------------------------------------------------- /src/pagecache/pagetable.rs: -------------------------------------------------------------------------------- 1 | //! A simple wait-free, grow-only pagetable, assumes a dense keyspace. 
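//!
//! An illustrative note (not part of the original module docs): a
//! `PageId` is split in two by `split_fanout` below. The low
//! `NODE2_FAN_FACTOR` (18) bits index into a second-level `Node2`,
//! and the remaining high bits select which `Node2` hangs off the root
//! `Node1`. For example, pid `0b111_1111_1111_1111_1111` (19 set bits)
//! splits into `(0b1, 0b11_1111_1111_1111_1111)`, matching the
//! `fanout_functionality` test at the end of this file.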
2 | #![allow(unsafe_code)] 3 | 4 | use std::{ 5 | alloc::{alloc_zeroed, Layout}, 6 | convert::TryFrom, 7 | mem::{align_of, size_of}, 8 | sync::atomic::Ordering::{Acquire, Relaxed, Release}, 9 | }; 10 | 11 | use crossbeam_epoch::{pin, Atomic, Guard, Owned, Shared}; 12 | 13 | use crate::{ 14 | debug_delay, 15 | pagecache::{constants::MAX_PID_BITS, Page, PageView}, 16 | }; 17 | 18 | #[cfg(feature = "metrics")] 19 | use crate::{Measure, M}; 20 | 21 | #[allow(unused)] 22 | #[doc(hidden)] 23 | pub const PAGETABLE_NODE_SZ: usize = size_of::(); 24 | 25 | const NODE2_FAN_FACTOR: usize = 18; 26 | const NODE1_FAN_OUT: usize = 1 << (MAX_PID_BITS - NODE2_FAN_FACTOR); 27 | const NODE2_FAN_OUT: usize = 1 << NODE2_FAN_FACTOR; 28 | const FAN_MASK: u64 = (NODE2_FAN_OUT - 1) as u64; 29 | 30 | pub type PageId = u64; 31 | 32 | struct Node1 { 33 | children: [Atomic; NODE1_FAN_OUT], 34 | } 35 | 36 | struct Node2 { 37 | children: [Atomic; NODE2_FAN_OUT], 38 | } 39 | 40 | impl Node1 { 41 | fn new() -> Owned { 42 | let size = size_of::(); 43 | let align = align_of::(); 44 | 45 | unsafe { 46 | let layout = Layout::from_size_align_unchecked(size, align); 47 | 48 | #[allow(clippy::cast_ptr_alignment)] 49 | let ptr = alloc_zeroed(layout) as *mut Self; 50 | 51 | Owned::from_raw(ptr) 52 | } 53 | } 54 | } 55 | 56 | impl Node2 { 57 | fn new() -> Owned { 58 | let size = size_of::(); 59 | let align = align_of::(); 60 | 61 | unsafe { 62 | let layout = Layout::from_size_align_unchecked(size, align); 63 | 64 | #[allow(clippy::cast_ptr_alignment)] 65 | let ptr = alloc_zeroed(layout) as *mut Self; 66 | 67 | Owned::from_raw(ptr) 68 | } 69 | } 70 | } 71 | 72 | impl Drop for Node1 { 73 | fn drop(&mut self) { 74 | drop_iter(self.children.iter()); 75 | } 76 | } 77 | 78 | impl Drop for Node2 { 79 | fn drop(&mut self) { 80 | drop_iter(self.children.iter()); 81 | } 82 | } 83 | 84 | fn drop_iter(iter: core::slice::Iter<'_, Atomic>) { 85 | let guard = pin(); 86 | for child in iter { 87 | let shared_child = child.load(Relaxed, &guard); 88 | if shared_child.is_null() { 89 | // this does not leak because the PageTable is 90 | // assumed to be dense. 91 | break; 92 | } 93 | unsafe { 94 | drop(shared_child.into_owned()); 95 | } 96 | } 97 | } 98 | 99 | /// A simple lock-free radix tree. 100 | pub struct PageTable { 101 | head: Atomic, 102 | } 103 | 104 | impl Default for PageTable { 105 | fn default() -> Self { 106 | let head = Node1::new(); 107 | Self { head: Atomic::from(head) } 108 | } 109 | } 110 | 111 | impl PageTable { 112 | /// # Panics 113 | /// 114 | /// will panic if the item is not null already, 115 | /// which represents a serious failure to 116 | /// properly handle lifecycles of pages in the 117 | /// using system. 118 | pub(crate) fn insert<'g>( 119 | &self, 120 | pid: PageId, 121 | item: Page, 122 | guard: &'g Guard, 123 | ) -> PageView<'g> { 124 | debug_delay(); 125 | let tip = self.traverse(pid, guard); 126 | 127 | let shared = Owned::new(item).into_shared(guard); 128 | let old = tip.swap(shared, Release, guard); 129 | assert!(old.is_null()); 130 | 131 | PageView { read: shared, entry: tip } 132 | } 133 | 134 | /// Try to get a value from the tree. 135 | /// 136 | /// # Panics 137 | /// 138 | /// Panics if the page has never been allocated. 
139 | pub(crate) fn get<'g>( 140 | &self, 141 | pid: PageId, 142 | guard: &'g Guard, 143 | ) -> PageView<'g> { 144 | #[cfg(feature = "metrics")] 145 | let _measure = Measure::new(&M.get_pagetable); 146 | debug_delay(); 147 | let tip = self.traverse(pid, guard); 148 | 149 | debug_delay(); 150 | let res = tip.load(Acquire, guard); 151 | 152 | assert!(!res.is_null()); 153 | 154 | PageView { read: res, entry: tip } 155 | } 156 | 157 | pub(crate) fn contains_pid(&self, pid: PageId, guard: &Guard) -> bool { 158 | #[cfg(feature = "metrics")] 159 | let _measure = Measure::new(&M.get_pagetable); 160 | debug_delay(); 161 | let tip = self.traverse(pid, guard); 162 | 163 | debug_delay(); 164 | let res = tip.load(Acquire, guard); 165 | 166 | !res.is_null() 167 | } 168 | 169 | fn traverse<'g>(&self, k: PageId, guard: &'g Guard) -> &'g Atomic { 170 | let (l1k, l2k) = split_fanout(k); 171 | 172 | debug_delay(); 173 | let head = self.head.load(Acquire, guard); 174 | 175 | debug_delay(); 176 | let l1 = unsafe { &head.deref().children }; 177 | 178 | debug_delay(); 179 | let mut l2_ptr = l1[l1k].load(Acquire, guard); 180 | 181 | if l2_ptr.is_null() { 182 | let next_child = Node2::new(); 183 | 184 | debug_delay(); 185 | let ret = l1[l1k].compare_and_set( 186 | Shared::null(), 187 | next_child, 188 | Release, 189 | guard, 190 | ); 191 | 192 | l2_ptr = match ret { 193 | Ok(next_child) => next_child, 194 | Err(returned) => { 195 | drop(returned.new); 196 | returned.current 197 | } 198 | }; 199 | } 200 | 201 | debug_delay(); 202 | let l2 = unsafe { &l2_ptr.deref().children }; 203 | 204 | &l2[l2k] 205 | } 206 | } 207 | 208 | #[inline] 209 | fn split_fanout(id: PageId) -> (usize, usize) { 210 | // right shift 32 on 32-bit pointer systems panics 211 | #[cfg(target_pointer_width = "64")] 212 | assert!( 213 | id <= 1 << MAX_PID_BITS, 214 | "trying to access key of {}, which is \ 215 | higher than 2 ^ {}", 216 | id, 217 | MAX_PID_BITS, 218 | ); 219 | 220 | let left = id >> NODE2_FAN_FACTOR; 221 | let right = id & FAN_MASK; 222 | 223 | (safe_usize(left), safe_usize(right)) 224 | } 225 | 226 | #[inline] 227 | fn safe_usize(value: PageId) -> usize { 228 | usize::try_from(value).unwrap() 229 | } 230 | 231 | impl Drop for PageTable { 232 | fn drop(&mut self) { 233 | let guard = pin(); 234 | let head = self.head.load(Relaxed, &guard); 235 | unsafe { 236 | drop(head.into_owned()); 237 | } 238 | } 239 | } 240 | 241 | #[test] 242 | fn fanout_functionality() { 243 | assert_eq!( 244 | split_fanout(0b11_1111_1111_1111_1111), 245 | (0, 0b11_1111_1111_1111_1111) 246 | ); 247 | assert_eq!( 248 | split_fanout(0b111_1111_1111_1111_1111), 249 | (0b1, 0b11_1111_1111_1111_1111) 250 | ); 251 | } 252 | -------------------------------------------------------------------------------- /src/result.rs: -------------------------------------------------------------------------------- 1 | use std::{ 2 | cmp::PartialEq, 3 | error::Error as StdError, 4 | fmt::{self, Display}, 5 | io, 6 | }; 7 | 8 | #[cfg(feature = "testing")] 9 | use backtrace::Backtrace; 10 | 11 | use crate::{ 12 | pagecache::{DiskPtr, PageView}, 13 | IVec, 14 | }; 15 | 16 | /// The top-level result type for dealing with 17 | /// fallible operations. The errors tend to 18 | /// be fail-stop, and nested results are used 19 | /// in cases where the outer fail-stop error can 20 | /// have try `?` used on it, exposing the inner 21 | /// operation that is expected to fail under 22 | /// normal operation. 
The philosophy behind this 23 | /// is detailed [on the sled blog](https://sled.rs/errors). 24 | pub type Result<T> = std::result::Result<T, Error>; 25 | 26 | /// A compare and swap result. If the CAS is successful, 27 | /// the new `PagePtr` will be returned as `Ok`. Otherwise, 28 | /// the `Err` will contain a tuple of the current `PagePtr` 29 | /// and the old value that could not be set atomically. 30 | pub(crate) type CasResult<'a, R> = 31 | std::result::Result<PageView<'a>, Option<(PageView<'a>, R)>>; 32 | 33 | /// An Error type encapsulating various issues that may come up 34 | /// in the operation of a `Db`. 35 | #[derive(Debug)] 36 | pub enum Error { 37 | /// The underlying collection no longer exists. 38 | CollectionNotFound(IVec), 39 | /// The system has been used in an unsupported way. 40 | Unsupported(String), 41 | /// An unexpected bug has happened. Please open an issue on github! 42 | ReportableBug(String), 43 | /// A read or write error has happened when interacting with the file 44 | /// system. 45 | Io(io::Error), 46 | /// Corruption has been detected in the storage file. 47 | Corruption { 48 | /// The file location that corrupted data was found at. 49 | at: Option<DiskPtr>, 50 | /// A backtrace for where the corruption was encountered. 51 | #[cfg(feature = "testing")] 52 | bt: Backtrace, 53 | /// A backtrace for where the corruption was encountered. 54 | #[cfg(not(feature = "testing"))] 55 | bt: (), 56 | }, 57 | // a failpoint has been triggered for testing purposes 58 | #[doc(hidden)] 59 | #[cfg(feature = "failpoints")] 60 | FailPoint, 61 | } 62 | 63 | impl Error { 64 | pub(crate) fn corruption(at: Option<DiskPtr>) -> Error { 65 | Error::Corruption { 66 | at, 67 | #[cfg(feature = "testing")] 68 | bt: Backtrace::new(), 69 | #[cfg(not(feature = "testing"))] 70 | bt: (), 71 | } 72 | } 73 | } 74 | 75 | impl Clone for Error { 76 | fn clone(&self) -> Self { 77 | use self::Error::*; 78 | 79 | match self { 80 | Io(ioe) => Io(io::Error::new(ioe.kind(), format!("{:?}", ioe))), 81 | CollectionNotFound(name) => CollectionNotFound(name.clone()), 82 | Unsupported(why) => Unsupported(why.clone()), 83 | ReportableBug(what) => ReportableBug(what.clone()), 84 | Corruption { at, bt } => Corruption { at: *at, bt: bt.clone() }, 85 | #[cfg(feature = "failpoints")] 86 | FailPoint => FailPoint, 87 | } 88 | } 89 | } 90 | 91 | impl Eq for Error {} 92 | 93 | impl PartialEq for Error { 94 | fn eq(&self, other: &Self) -> bool { 95 | use self::Error::*; 96 | 97 | match *self { 98 | CollectionNotFound(ref l) => { 99 | if let CollectionNotFound(ref r) = *other { 100 | l == r 101 | } else { 102 | false 103 | } 104 | } 105 | Unsupported(ref l) => { 106 | if let Unsupported(ref r) = *other { 107 | l == r 108 | } else { 109 | false 110 | } 111 | } 112 | ReportableBug(ref l) => { 113 | if let ReportableBug(ref r) = *other { 114 | l == r 115 | } else { 116 | false 117 | } 118 | } 119 | #[cfg(feature = "failpoints")] 120 | FailPoint => { 121 | if let FailPoint = *other { 122 | true 123 | } else { 124 | false 125 | } 126 | } 127 | Corruption { at: l, .. } => { 128 | if let Corruption { at: r, ..
} = *other { 129 | l == r 130 | } else { 131 | false 132 | } 133 | } 134 | Io(_) => false, 135 | } 136 | } 137 | } 138 | 139 | impl From for Error { 140 | #[inline] 141 | fn from(io_error: io::Error) -> Self { 142 | Error::Io(io_error) 143 | } 144 | } 145 | 146 | impl From for io::Error { 147 | fn from(error: Error) -> io::Error { 148 | use self::Error::*; 149 | use std::io::ErrorKind; 150 | match error { 151 | Io(ioe) => ioe, 152 | CollectionNotFound(name) => io::Error::new( 153 | ErrorKind::NotFound, 154 | format!("collection not found: {:?}", name), 155 | ), 156 | Unsupported(why) => io::Error::new( 157 | ErrorKind::InvalidInput, 158 | format!("operation not supported: {:?}", why), 159 | ), 160 | ReportableBug(what) => io::Error::new( 161 | ErrorKind::Other, 162 | format!( 163 | "unexpected bug! please report this bug at : {:?}", 164 | what 165 | ), 166 | ), 167 | Corruption { .. } => io::Error::new( 168 | ErrorKind::InvalidData, 169 | format!("corruption encountered: {:?}", error), 170 | ), 171 | #[cfg(feature = "failpoints")] 172 | FailPoint => io::Error::new(ErrorKind::Other, "failpoint"), 173 | } 174 | } 175 | } 176 | 177 | impl StdError for Error {} 178 | 179 | impl Display for Error { 180 | fn fmt( 181 | &self, 182 | f: &mut fmt::Formatter<'_>, 183 | ) -> std::result::Result<(), fmt::Error> { 184 | use self::Error::*; 185 | 186 | match *self { 187 | CollectionNotFound(ref name) => { 188 | write!(f, "Collection {:?} does not exist", name,) 189 | } 190 | Unsupported(ref e) => write!(f, "Unsupported: {}", e), 191 | ReportableBug(ref e) => write!( 192 | f, 193 | "Unexpected bug has happened: {}. \ 194 | PLEASE REPORT THIS BUG!", 195 | e 196 | ), 197 | #[cfg(feature = "failpoints")] 198 | FailPoint => write!(f, "Fail point has been triggered."), 199 | Io(ref e) => write!(f, "IO error: {}", e), 200 | Corruption { at, ref bt } => write!( 201 | f, 202 | "Read corrupted data at file offset {:?} backtrace {:?}", 203 | at, bt 204 | ), 205 | } 206 | } 207 | } 208 | -------------------------------------------------------------------------------- /experiments/epoch/src/main.rs: -------------------------------------------------------------------------------- 1 | /// A simple implementation of epoch-based reclamation. 2 | /// 3 | /// Using the `pin` method, a thread checks into an epoch 4 | /// before operating on a shared resource. If that thread 5 | /// makes a shared resource inaccessible, it can defer its 6 | /// destruction until all threads that may have already 7 | /// checked in have moved on. 8 | use std::{ 9 | cell::RefCell, 10 | sync::{ 11 | atomic::{AtomicPtr, AtomicUsize, Ordering::SeqCst}, 12 | Arc, 13 | }, 14 | }; 15 | 16 | const EPOCH_SZ: usize = 16; 17 | 18 | #[derive(Default)] 19 | struct Epoch { 20 | garbage: [AtomicPtr>; EPOCH_SZ], 21 | offset: AtomicUsize, 22 | next: AtomicPtr, 23 | id: u64, 24 | } 25 | 26 | impl Drop for Epoch { 27 | fn drop(&mut self) { 28 | let count = std::cmp::min(EPOCH_SZ, self.offset.load(SeqCst)); 29 | for offset in 0..count { 30 | let mut garbage_ptr: *mut Box = 31 | self.garbage[offset].load(SeqCst); 32 | while garbage_ptr.is_null() { 33 | // maybe this is impossible, but this is to 34 | // be defensive against race conditions. 
35 | garbage_ptr = self.garbage[offset].load(SeqCst); 36 | } 37 | 38 | let garbage: Box> = 39 | unsafe { Box::from_raw(garbage_ptr) }; 40 | 41 | drop(garbage); 42 | } 43 | 44 | let next = self.next.swap(std::ptr::null_mut(), SeqCst); 45 | if !next.is_null() { 46 | let arc = unsafe { Arc::from_raw(next) }; 47 | drop(arc); 48 | } 49 | } 50 | } 51 | 52 | struct Collector { 53 | head: AtomicPtr, 54 | } 55 | 56 | unsafe impl Send for Collector {} 57 | unsafe impl Sync for Collector {} 58 | 59 | impl Default for Collector { 60 | fn default() -> Collector { 61 | let ptr = Arc::into_raw(Arc::new(Epoch::default())) as *mut Epoch; 62 | Collector { head: AtomicPtr::new(ptr) } 63 | } 64 | } 65 | 66 | impl Collector { 67 | fn pin(&self) -> Guard { 68 | let head_ptr = self.head.load(SeqCst); 69 | assert!(!head_ptr.is_null()); 70 | let mut head = unsafe { Arc::from_raw(head_ptr) }; 71 | let mut next = head.next.load(SeqCst); 72 | let mut last_head = head_ptr; 73 | 74 | // forward head to current tip 75 | while !next.is_null() { 76 | std::mem::forget(head); 77 | 78 | let res = self.head.compare_and_swap(last_head, next, SeqCst); 79 | if res == last_head { 80 | head = unsafe { Arc::from_raw(next) }; 81 | last_head = next; 82 | } else { 83 | head = unsafe { Arc::from_raw(res) }; 84 | last_head = res; 85 | } 86 | 87 | next = head.next.load(SeqCst); 88 | } 89 | 90 | let (a1, a2) = (head.clone(), head.clone()); 91 | std::mem::forget(head); 92 | 93 | Guard { 94 | _entry_epoch: a1, 95 | current_epoch: a2, 96 | trash_sack: RefCell::new(vec![]), 97 | } 98 | } 99 | } 100 | 101 | impl Drop for Collector { 102 | fn drop(&mut self) { 103 | let head_ptr = self.head.load(SeqCst); 104 | assert!(!head_ptr.is_null()); 105 | unsafe { 106 | let head = Arc::from_raw(head_ptr); 107 | drop(head); 108 | } 109 | } 110 | } 111 | 112 | pub(crate) struct Guard { 113 | _entry_epoch: Arc, 114 | current_epoch: Arc, 115 | trash_sack: RefCell>>, 116 | } 117 | 118 | impl Guard { 119 | pub fn defer(&self, f: F) 120 | where 121 | F: FnOnce() + Send + 'static, 122 | { 123 | let garbage_ptr = 124 | Box::into_raw(Box::new(Box::new(f) as Box)); 125 | let mut trash_sack = self.trash_sack.borrow_mut(); 126 | trash_sack.push(garbage_ptr); 127 | } 128 | } 129 | 130 | impl Drop for Guard { 131 | fn drop(&mut self) { 132 | let trash_sack = self.trash_sack.replace(vec![]); 133 | 134 | for garbage_ptr in trash_sack.into_iter() { 135 | // try to reserve 136 | let mut offset = self.current_epoch.offset.fetch_add(1, SeqCst); 137 | while offset >= EPOCH_SZ { 138 | let next = self.current_epoch.next.load(SeqCst); 139 | if !next.is_null() { 140 | unsafe { 141 | let raced_arc = Arc::from_raw(next); 142 | self.current_epoch = raced_arc.clone(); 143 | std::mem::forget(raced_arc); 144 | } 145 | offset = self.current_epoch.offset.fetch_add(1, SeqCst); 146 | continue; 147 | } 148 | 149 | // push epoch forward if we're full 150 | let mut next_epoch = Epoch::default(); 151 | next_epoch.id = self.current_epoch.id + 1; 152 | 153 | let next_epoch_arc = Arc::new(next_epoch); 154 | let next_ptr = 155 | Arc::into_raw(next_epoch_arc.clone()) as *mut Epoch; 156 | let old = self.current_epoch.next.compare_and_swap( 157 | std::ptr::null_mut(), 158 | next_ptr, 159 | SeqCst, 160 | ); 161 | if old != std::ptr::null_mut() { 162 | // somebody else already installed a new segment 163 | unsafe { 164 | let unneeded = Arc::from_raw(next_ptr); 165 | drop(unneeded); 166 | 167 | let raced_arc = Arc::from_raw(old); 168 | self.current_epoch = raced_arc.clone(); 169 | 
std::mem::forget(raced_arc); 170 | } 171 | offset = self.current_epoch.offset.fetch_add(1, SeqCst); 172 | continue; 173 | } 174 | 175 | self.current_epoch = next_epoch_arc; 176 | offset = self.current_epoch.offset.fetch_add(1, SeqCst); 177 | } 178 | 179 | let old = 180 | self.current_epoch.garbage[offset].swap(garbage_ptr, SeqCst); 181 | assert!(old.is_null()); 182 | } 183 | } 184 | } 185 | 186 | #[derive(Debug)] 187 | struct S(usize); 188 | 189 | fn main() { 190 | let collector = Arc::new(Collector::default()); 191 | 192 | let mut threads = vec![]; 193 | 194 | for t in 0..100 { 195 | use std::thread::spawn; 196 | 197 | let collector = collector.clone(); 198 | let thread = spawn(move || { 199 | for _ in 0..1000000 { 200 | let guard = collector.pin(); 201 | guard.defer(move || { 202 | S(t as usize); 203 | }); 204 | 205 | let guard = crossbeam_epoch::pin(); 206 | guard.defer(move || { 207 | S(t as usize); 208 | }); 209 | } 210 | }); 211 | 212 | threads.push(thread); 213 | } 214 | 215 | for thread in threads.into_iter() { 216 | thread.join().unwrap(); 217 | } 218 | } 219 | -------------------------------------------------------------------------------- /src/event_log.rs: -------------------------------------------------------------------------------- 1 | //! The `EventLog` lets us cheaply record and query behavior 2 | //! in a concurrent system. It lets us reconstruct stories about 3 | //! what happened to our data. It lets us write tests like: 4 | //! 1. no keys are lost through tree structural modifications 5 | //! 2. no nodes are made inaccessible through structural modifications 6 | //! 3. no segments are zeroed and reused before all resident 7 | //! pages have been relocated and stabilized. 8 | //! 4. recovery does not skip active segments 9 | //! 5. no page is double-allocated or double-freed 10 | //! 6. pages before restart match pages after restart 11 | //! 12 | //! What does it mean for data to be accessible? 13 | //! 1. key -> page 14 | //! 2. page -> lid 15 | //! 3. lid ranges get stabiized over time 16 | //! 4. lid ranges get zeroed over time 17 | //! 5. segment trailers get written over time 18 | //! 6. if a page's old location is zeroed before 19 | //! `io_bufs` segment trailers have been written, 20 | //! we are vulnerable to data loss 21 | //! 3. segments have lifespans from fsync to zero 22 | //! 4. 23 | #![allow(missing_docs)] 24 | 25 | use crate::pagecache::DiskPtr; 26 | use crate::*; 27 | 28 | use crate::stack::{Iter as StackIter, Stack}; 29 | 30 | /// A thing that happens at a certain time. 31 | #[derive(Debug, Clone)] 32 | enum Event { 33 | PagesOnShutdown { pages: Map> }, 34 | PagesOnRecovery { pages: Map> }, 35 | MetaOnShutdown { meta: Meta }, 36 | MetaOnRecovery { meta: Meta }, 37 | RecoveredLsn(Lsn), 38 | Stabilized(Lsn), 39 | } 40 | 41 | /// A lock-free queue of Events. 
42 | #[derive(Default, Debug)] 43 | pub struct EventLog { 44 | inner: Stack, 45 | } 46 | 47 | impl EventLog { 48 | pub(crate) fn reset(&self) { 49 | self.verify(); 50 | let guard = pin(); 51 | while self.inner.pop(&guard).is_some() {} 52 | } 53 | 54 | fn iter<'a>(&self, guard: &'a Guard) -> StackIter<'a, Event> { 55 | let head = self.inner.head(guard); 56 | StackIter::from_ptr(head, guard) 57 | } 58 | 59 | pub(crate) fn verify(&self) { 60 | let guard = pin(); 61 | let iter = self.iter(&guard); 62 | 63 | // if we encounter a `PagesOnRecovery`, then we should 64 | // compare it to any subsequent `PagesOnShutdown` 65 | 66 | let mut recovered_pages = None; 67 | let mut recovered_meta = None; 68 | let mut minimum_lsn = None; 69 | 70 | for event in iter { 71 | match event { 72 | Event::Stabilized(lsn) | Event::RecoveredLsn(lsn) => { 73 | if let Some(later_lsn) = minimum_lsn { 74 | assert!( 75 | later_lsn >= lsn, 76 | "lsn must never go down between recoveries \ 77 | or stabilizations. It was {} but later became {}. history: {:?}", 78 | lsn, 79 | later_lsn, 80 | self.iter(&guard) 81 | .filter(|e| matches!(e, Event::Stabilized(_)) 82 | || matches!(e, Event::RecoveredLsn(_))) 83 | .collect::>(), 84 | ); 85 | } 86 | minimum_lsn = Some(lsn); 87 | } 88 | Event::PagesOnRecovery { pages } => { 89 | recovered_pages = Some(pages.clone()); 90 | } 91 | Event::PagesOnShutdown { pages } => { 92 | if let Some(ref par) = recovered_pages { 93 | let pids = par 94 | .iter() 95 | .map(|(pid, _frag_locations)| *pid) 96 | .chain( 97 | pages.iter().map(|(pid, _frag_locations)| *pid), 98 | ) 99 | .collect::>() 100 | .into_iter(); 101 | 102 | for pid in pids { 103 | // we filter out the blob pointer in the log 104 | // because it is expected that upon recovery, 105 | // any blob pointers will be forgotten from 106 | // the log now that they are present in the 107 | // snapshot. 
108 | let locations_before_restart: Vec<_> = pages 109 | .get(&pid) 110 | .unwrap() 111 | .iter() 112 | .map(|ptr| { 113 | let mut ptr = *ptr; 114 | ptr.forget_heap_log_coordinates(); 115 | ptr 116 | }) 117 | .collect(); 118 | let locations_after_restart: Vec<_> = par 119 | .get(&pid) 120 | .unwrap() 121 | .iter() 122 | .copied() 123 | .collect(); 124 | assert_eq!( 125 | locations_before_restart, 126 | locations_after_restart, 127 | "page {} had frag locations {:?} before \ 128 | restart, but {:?} after restart", 129 | pid, 130 | locations_before_restart, 131 | locations_after_restart 132 | ); 133 | } 134 | } 135 | } 136 | Event::MetaOnRecovery { meta } => { 137 | recovered_meta = Some(meta); 138 | } 139 | Event::MetaOnShutdown { meta } => { 140 | if let Some(rec_meta) = recovered_meta { 141 | assert_eq!(meta, rec_meta); 142 | } 143 | } 144 | } 145 | } 146 | 147 | debug!("event log verified \u{2713}"); 148 | } 149 | 150 | pub(crate) fn stabilized_lsn(&self, lsn: Lsn) { 151 | let guard = pin(); 152 | self.inner.push(Event::Stabilized(lsn), &guard); 153 | } 154 | 155 | pub(crate) fn recovered_lsn(&self, lsn: Lsn) { 156 | let guard = pin(); 157 | self.inner.push(Event::RecoveredLsn(lsn), &guard); 158 | } 159 | 160 | pub(crate) fn pages_before_restart( 161 | &self, 162 | pages: Map>, 163 | ) { 164 | let guard = pin(); 165 | self.inner.push(Event::PagesOnShutdown { pages }, &guard); 166 | } 167 | 168 | pub(crate) fn pages_after_restart(&self, pages: Map>) { 169 | let guard = pin(); 170 | self.inner.push(Event::PagesOnRecovery { pages }, &guard); 171 | } 172 | 173 | pub fn meta_before_restart(&self, meta: Meta) { 174 | let guard = pin(); 175 | self.inner.push(Event::MetaOnShutdown { meta }, &guard); 176 | } 177 | 178 | pub fn meta_after_restart(&self, meta: Meta) { 179 | let guard = pin(); 180 | self.inner.push(Event::MetaOnRecovery { meta }, &guard); 181 | } 182 | } 183 | -------------------------------------------------------------------------------- /bindings/sled-native/src/lib.rs: -------------------------------------------------------------------------------- 1 | use sled; 2 | 3 | use std::ffi::CString; 4 | use std::mem; 5 | use std::ptr; 6 | use std::slice; 7 | 8 | use libc::*; 9 | 10 | use sled::{Config, Db, IVec, Iter}; 11 | 12 | fn leak_buf(v: Vec, vallen: *mut size_t) -> *mut c_char { 13 | unsafe { 14 | *vallen = v.len(); 15 | } 16 | let mut bsv = v.into_boxed_slice(); 17 | let val = bsv.as_mut_ptr() as *mut _; 18 | mem::forget(bsv); 19 | val 20 | } 21 | 22 | /// Create a new configuration. 23 | #[no_mangle] 24 | pub unsafe extern "C" fn sled_create_config() -> *mut Config { 25 | Box::into_raw(Box::new(Config::new())) 26 | } 27 | 28 | /// Destroy a configuration. 29 | #[no_mangle] 30 | pub unsafe extern "C" fn sled_free_config(config: *mut Config) { 31 | drop(Box::from_raw(config)); 32 | } 33 | 34 | /// Set the configured file path. The caller is responsible for freeing the path 35 | /// string after calling this (it is copied in this function). 36 | #[no_mangle] 37 | pub unsafe extern "C" fn sled_config_set_path( 38 | config: *mut Config, 39 | path: *const c_char, 40 | ) -> *mut Config { 41 | let c_str = CString::from_raw(path as *mut _); 42 | let value = c_str.into_string().unwrap(); 43 | 44 | let config = Box::from_raw(config); 45 | Box::into_raw(Box::from(config.path(value))) 46 | } 47 | 48 | /// Set the configured cache capacity in bytes. 
49 | #[no_mangle] 50 | pub unsafe extern "C" fn sled_config_set_cache_capacity( 51 | config: *mut Config, 52 | capacity: size_t, 53 | ) -> *mut Config { 54 | let config = Box::from_raw(config); 55 | Box::into_raw(Box::from(config.cache_capacity(capacity as u64))) 56 | } 57 | 58 | /// Configure the use of the zstd compression library. 59 | #[no_mangle] 60 | pub unsafe extern "C" fn sled_config_use_compression( 61 | config: *mut Config, 62 | use_compression: c_uchar, 63 | ) -> *mut Config { 64 | let config = Box::from_raw(config); 65 | Box::into_raw(Box::from(config.use_compression(use_compression == 1))) 66 | } 67 | 68 | /// Set the configured IO buffer flush interval in milliseconds. 69 | #[no_mangle] 70 | pub unsafe extern "C" fn sled_config_flush_every_ms( 71 | config: *mut Config, 72 | flush_every: c_int, 73 | ) -> *mut Config { 74 | let val = if flush_every < 0 { None } else { Some(flush_every as u64) }; 75 | let config = Box::from_raw(config); 76 | Box::into_raw(Box::from(config.flush_every_ms(val))) 77 | } 78 | 79 | /// Open a sled lock-free log-structured tree. Consumes the passed-in config. 80 | #[no_mangle] 81 | pub unsafe extern "C" fn sled_open_db(config: *mut Config) -> *mut Db { 82 | let config = Box::from_raw(config); 83 | Box::into_raw(Box::new(config.open().unwrap())) 84 | } 85 | 86 | /// Close a sled lock-free log-structured tree. 87 | #[no_mangle] 88 | pub unsafe extern "C" fn sled_close(db: *mut Db) { 89 | drop(Box::from_raw(db)); 90 | } 91 | 92 | /// Free a buffer originally allocated by sled. 93 | #[no_mangle] 94 | pub unsafe extern "C" fn sled_free_buf(buf: *mut c_char, sz: size_t) { 95 | drop(Vec::from_raw_parts(buf, sz, sz)); 96 | } 97 | 98 | /// Free an iterator. 99 | #[no_mangle] 100 | pub unsafe extern "C" fn sled_free_iter(iter: *mut Iter) { 101 | drop(Box::from_raw(iter)); 102 | } 103 | 104 | /// Set a key to a value. 105 | #[no_mangle] 106 | pub unsafe extern "C" fn sled_set( 107 | db: *mut Db, 108 | key: *const c_uchar, 109 | keylen: size_t, 110 | val: *const c_uchar, 111 | vallen: size_t, 112 | ) { 113 | let k = IVec::from(slice::from_raw_parts(key, keylen)); 114 | let v = IVec::from(slice::from_raw_parts(val, vallen)); 115 | (*db).insert(k, v).unwrap(); 116 | } 117 | 118 | /// Get the value of a key. 119 | /// Caller is responsible for freeing the returned value with `sled_free_buf` if 120 | /// it's non-null. 121 | #[no_mangle] 122 | pub unsafe extern "C" fn sled_get( 123 | db: *mut Db, 124 | key: *const c_char, 125 | keylen: size_t, 126 | vallen: *mut size_t, 127 | ) -> *mut c_char { 128 | let k = slice::from_raw_parts(key as *const u8, keylen); 129 | let res = (*db).get(k); 130 | match res { 131 | Ok(Some(v)) => leak_buf(v.to_vec(), vallen), 132 | Ok(None) => ptr::null_mut(), 133 | // TODO proper error propagation 134 | Err(e) => panic!("{:?}", e), 135 | } 136 | } 137 | 138 | /// Delete the value of a key. 139 | #[no_mangle] 140 | pub unsafe extern "C" fn sled_del( 141 | db: *mut Db, 142 | key: *const c_char, 143 | keylen: size_t, 144 | ) { 145 | let k = slice::from_raw_parts(key as *const u8, keylen); 146 | (*db).remove(k).unwrap(); 147 | } 148 | 149 | /// Compare and swap. 150 | /// Returns 1 if successful, 0 if unsuccessful. 151 | /// Otherwise sets `actual_val` and `actual_vallen` to the current value, 152 | /// which must be freed using `sled_free_buf` by the caller if non-null. 153 | /// `actual_val` will be null and `actual_vallen` 0 if the current value is not 154 | /// set. 
155 | #[no_mangle] 156 | pub unsafe extern "C" fn sled_compare_and_swap( 157 | db: *mut Db, 158 | key: *const c_char, 159 | keylen: size_t, 160 | old_val: *const c_uchar, 161 | old_vallen: size_t, 162 | new_val: *const c_uchar, 163 | new_vallen: size_t, 164 | actual_val: *mut *const c_uchar, 165 | actual_vallen: *mut size_t, 166 | ) -> c_uchar { 167 | let k = IVec::from(slice::from_raw_parts(key as *const u8, keylen)); 168 | 169 | let old = if old_vallen == 0 { 170 | None 171 | } else { 172 | let copy = 173 | IVec::from(slice::from_raw_parts(old_val as *const u8, old_vallen)); 174 | Some(copy) 175 | }; 176 | 177 | let new = if new_vallen == 0 { 178 | None 179 | } else { 180 | let copy = 181 | IVec::from(slice::from_raw_parts(new_val as *const u8, new_vallen)); 182 | Some(copy) 183 | }; 184 | 185 | let res = (*db).compare_and_swap(k, old, new); 186 | 187 | match res { 188 | Ok(Ok(())) => 1, 189 | Ok(Err(sled::CompareAndSwapError { current: None, .. })) => { 190 | *actual_vallen = 0; 191 | 0 192 | } 193 | Ok(Err(sled::CompareAndSwapError { current: Some(v), .. })) => { 194 | *actual_val = leak_buf(v.to_vec(), actual_vallen) as *const u8; 195 | 0 196 | } 197 | // TODO proper error propagation 198 | Err(e) => panic!("{:?}", e), 199 | } 200 | } 201 | 202 | /// Iterate over tuples which have specified key prefix. 203 | /// Caller is responsible for freeing the returned iterator with 204 | /// `sled_free_iter`. 205 | #[no_mangle] 206 | pub unsafe extern "C" fn sled_scan_prefix( 207 | db: *mut Db, 208 | key: *const c_char, 209 | keylen: size_t, 210 | ) -> *mut Iter { 211 | let k = slice::from_raw_parts(key as *const u8, keylen); 212 | Box::into_raw(Box::new((*db).scan_prefix(k))) 213 | } 214 | 215 | /// Get they next kv pair from an iterator. 216 | /// Caller is responsible for freeing the key and value with `sled_free_buf`. 217 | /// Returns 0 when exhausted. 218 | #[no_mangle] 219 | pub unsafe extern "C" fn sled_iter_next( 220 | iter: *mut Iter, 221 | key: *mut *const c_char, 222 | keylen: *mut size_t, 223 | val: *mut *const c_char, 224 | vallen: *mut size_t, 225 | ) -> c_uchar { 226 | match (*iter).next() { 227 | Some(Ok((k, v))) => { 228 | *key = leak_buf(k.to_vec(), keylen); 229 | *val = leak_buf(v.to_vec(), vallen); 230 | 1 231 | } 232 | // TODO proper error propagation 233 | Some(Err(e)) => panic!("{:?}", e), 234 | None => 0, 235 | } 236 | } 237 | -------------------------------------------------------------------------------- /src/atomic_shim.rs: -------------------------------------------------------------------------------- 1 | ///! 
Inline of `https://github.com/bltavares/atomic-shim` 2 | 3 | #[cfg(not(any( 4 | target_arch = "mips", 5 | target_arch = "powerpc", 6 | feature = "mutex" 7 | )))] 8 | pub use std::sync::atomic::{AtomicI64, AtomicU64}; 9 | #[cfg(any(target_arch = "mips", target_arch = "powerpc", feature = "mutex"))] 10 | mod shim { 11 | use parking_lot::{const_rwlock, RwLock}; 12 | use std::sync::atomic::Ordering; 13 | 14 | #[derive(Debug, Default)] 15 | pub struct AtomicU64 { 16 | value: RwLock, 17 | } 18 | 19 | impl AtomicU64 { 20 | pub const fn new(v: u64) -> Self { 21 | Self { value: const_rwlock(v) } 22 | } 23 | 24 | #[allow(dead_code)] 25 | pub fn load(&self, _: Ordering) -> u64 { 26 | *self.value.read() 27 | } 28 | 29 | #[allow(dead_code)] 30 | pub fn store(&self, value: u64, _: Ordering) { 31 | let mut lock = self.value.write(); 32 | *lock = value; 33 | } 34 | 35 | #[allow(dead_code)] 36 | pub fn swap(&self, value: u64, _: Ordering) -> u64 { 37 | let mut lock = self.value.write(); 38 | let prev = *lock; 39 | *lock = value; 40 | prev 41 | } 42 | 43 | #[allow(dead_code)] 44 | pub fn compare_exchange( 45 | &self, 46 | current: u64, 47 | new: u64, 48 | _: Ordering, 49 | _: Ordering, 50 | ) -> Result { 51 | let mut lock = self.value.write(); 52 | let prev = *lock; 53 | if prev == current { 54 | *lock = new; 55 | Ok(current) 56 | } else { 57 | Err(prev) 58 | } 59 | } 60 | 61 | #[allow(dead_code)] 62 | pub fn compare_exchange_weak( 63 | &self, 64 | current: u64, 65 | new: u64, 66 | success: Ordering, 67 | failure: Ordering, 68 | ) -> Result { 69 | self.compare_exchange(current, new, success, failure) 70 | } 71 | 72 | #[allow(dead_code)] 73 | pub fn fetch_add(&self, val: u64, _: Ordering) -> u64 { 74 | let mut lock = self.value.write(); 75 | let prev = *lock; 76 | *lock = prev.wrapping_add(val); 77 | prev 78 | } 79 | 80 | #[allow(dead_code)] 81 | pub fn fetch_sub(&self, val: u64, _: Ordering) -> u64 { 82 | let mut lock = self.value.write(); 83 | let prev = *lock; 84 | *lock = prev.wrapping_sub(val); 85 | prev 86 | } 87 | 88 | #[allow(dead_code)] 89 | pub fn fetch_and(&self, val: u64, _: Ordering) -> u64 { 90 | let mut lock = self.value.write(); 91 | let prev = *lock; 92 | *lock = prev & val; 93 | prev 94 | } 95 | 96 | #[allow(dead_code)] 97 | pub fn fetch_nand(&self, val: u64, _: Ordering) -> u64 { 98 | let mut lock = self.value.write(); 99 | let prev = *lock; 100 | *lock = !(prev & val); 101 | prev 102 | } 103 | 104 | #[allow(dead_code)] 105 | pub fn fetch_or(&self, val: u64, _: Ordering) -> u64 { 106 | let mut lock = self.value.write(); 107 | let prev = *lock; 108 | *lock = prev | val; 109 | prev 110 | } 111 | 112 | #[allow(dead_code)] 113 | pub fn fetch_xor(&self, val: u64, _: Ordering) -> u64 { 114 | let mut lock = self.value.write(); 115 | let prev = *lock; 116 | *lock = prev ^ val; 117 | prev 118 | } 119 | } 120 | 121 | impl From for AtomicU64 { 122 | fn from(value: u64) -> Self { 123 | AtomicU64::new(value) 124 | } 125 | } 126 | 127 | #[derive(Debug, Default)] 128 | pub struct AtomicI64 { 129 | value: RwLock, 130 | } 131 | 132 | impl AtomicI64 { 133 | pub fn new(v: i64) -> Self { 134 | Self { value: const_rwlock(v) } 135 | } 136 | 137 | #[allow(dead_code)] 138 | pub fn load(&self, _: Ordering) -> i64 { 139 | *self.value.read() 140 | } 141 | 142 | #[allow(dead_code)] 143 | pub fn store(&self, value: i64, _: Ordering) { 144 | let mut lock = self.value.write(); 145 | *lock = value; 146 | } 147 | 148 | #[allow(dead_code)] 149 | pub fn swap(&self, value: i64, _: Ordering) -> i64 { 150 | let mut lock = 
self.value.write(); 151 | let prev = *lock; 152 | *lock = value; 153 | prev 154 | } 155 | 156 | #[allow(dead_code)] 157 | pub fn compare_exchange( 158 | &self, 159 | current: i64, 160 | new: i64, 161 | _: Ordering, 162 | _: Ordering, 163 | ) -> Result { 164 | let mut lock = self.value.write(); 165 | let prev = *lock; 166 | if prev == current { 167 | *lock = new; 168 | Ok(current) 169 | } else { 170 | Err(prev) 171 | } 172 | } 173 | 174 | #[allow(dead_code)] 175 | pub fn compare_exchange_weak( 176 | &self, 177 | current: i64, 178 | new: i64, 179 | success: Ordering, 180 | failure: Ordering, 181 | ) -> Result { 182 | self.compare_exchange(current, new, success, failure) 183 | } 184 | 185 | #[allow(dead_code)] 186 | pub fn fetch_add(&self, val: i64, _: Ordering) -> i64 { 187 | let mut lock = self.value.write(); 188 | let prev = *lock; 189 | *lock = prev.wrapping_add(val); 190 | prev 191 | } 192 | 193 | #[allow(dead_code)] 194 | pub fn fetch_sub(&self, val: i64, _: Ordering) -> i64 { 195 | let mut lock = self.value.write(); 196 | let prev = *lock; 197 | *lock = prev.wrapping_sub(val); 198 | prev 199 | } 200 | 201 | #[allow(dead_code)] 202 | pub fn fetch_and(&self, val: i64, _: Ordering) -> i64 { 203 | let mut lock = self.value.write(); 204 | let prev = *lock; 205 | *lock = prev & val; 206 | prev 207 | } 208 | 209 | #[allow(dead_code)] 210 | pub fn fetch_nand(&self, val: i64, _: Ordering) -> i64 { 211 | let mut lock = self.value.write(); 212 | let prev = *lock; 213 | *lock = !(prev & val); 214 | prev 215 | } 216 | 217 | #[allow(dead_code)] 218 | pub fn fetch_or(&self, val: i64, _: Ordering) -> i64 { 219 | let mut lock = self.value.write(); 220 | let prev = *lock; 221 | *lock = prev | val; 222 | prev 223 | } 224 | 225 | #[allow(dead_code)] 226 | pub fn fetch_xor(&self, val: i64, _: Ordering) -> i64 { 227 | let mut lock = self.value.write(); 228 | let prev = *lock; 229 | *lock = prev ^ val; 230 | prev 231 | } 232 | } 233 | 234 | impl From for AtomicI64 { 235 | fn from(value: i64) -> Self { 236 | AtomicI64::new(value) 237 | } 238 | } 239 | } 240 | 241 | #[cfg(any( 242 | target_arch = "mips", 243 | target_arch = "powerpc", 244 | feature = "mutex" 245 | ))] 246 | pub use shim::{AtomicI64, AtomicU64}; 247 | -------------------------------------------------------------------------------- /src/stack.rs: -------------------------------------------------------------------------------- 1 | #![allow(unsafe_code)] 2 | 3 | use std::{ 4 | fmt::{self, Debug}, 5 | ops::Deref, 6 | sync::atomic::Ordering::{Acquire, Release}, 7 | }; 8 | 9 | use crossbeam_epoch::{unprotected, Atomic, Guard, Owned, Shared}; 10 | 11 | use crate::debug_delay; 12 | 13 | /// A node in the lock-free `Stack`. 14 | #[derive(Debug)] 15 | pub struct Node { 16 | pub(crate) inner: T, 17 | pub(crate) next: Atomic>, 18 | } 19 | 20 | impl Drop for Node { 21 | fn drop(&mut self) { 22 | unsafe { 23 | let mut cursor = self.next.load(Acquire, unprotected()); 24 | 25 | while !cursor.is_null() { 26 | // we carefully unset the next pointer here to avoid 27 | // a stack overflow when freeing long lists. 28 | let node = cursor.into_owned(); 29 | cursor = node.next.swap(Shared::null(), Acquire, unprotected()); 30 | drop(node); 31 | } 32 | } 33 | } 34 | } 35 | 36 | /// A simple lock-free stack, with the ability to atomically 37 | /// append or entirely swap-out entries. 
38 | pub struct Stack { 39 | head: Atomic>, 40 | } 41 | 42 | impl Default for Stack { 43 | fn default() -> Self { 44 | Self { head: Atomic::null() } 45 | } 46 | } 47 | 48 | impl Drop for Stack { 49 | fn drop(&mut self) { 50 | unsafe { 51 | let curr = self.head.load(Acquire, unprotected()); 52 | if !curr.as_raw().is_null() { 53 | drop(curr.into_owned()); 54 | } 55 | } 56 | } 57 | } 58 | 59 | impl Debug for Stack 60 | where 61 | T: Clone + Debug + Send + 'static + Sync, 62 | { 63 | fn fmt( 64 | &self, 65 | formatter: &mut fmt::Formatter<'_>, 66 | ) -> Result<(), fmt::Error> { 67 | let guard = crossbeam_epoch::pin(); 68 | let head = self.head(&guard); 69 | let iter = Iter::from_ptr(head, &guard); 70 | 71 | formatter.write_str("Stack [")?; 72 | let mut written = false; 73 | for node in iter { 74 | if written { 75 | formatter.write_str(", ")?; 76 | } 77 | formatter.write_str(&*format!("({:?}) ", &node))?; 78 | node.fmt(formatter)?; 79 | written = true; 80 | } 81 | formatter.write_str("]")?; 82 | Ok(()) 83 | } 84 | } 85 | 86 | impl Deref for Node { 87 | type Target = T; 88 | fn deref(&self) -> &T { 89 | &self.inner 90 | } 91 | } 92 | 93 | impl Stack { 94 | /// Add an item to the stack, spinning until successful. 95 | pub(crate) fn push(&self, inner: T, guard: &Guard) { 96 | debug_delay(); 97 | let node = Owned::new(Node { inner, next: Atomic::null() }); 98 | 99 | unsafe { 100 | let node = node.into_shared(guard); 101 | 102 | loop { 103 | let head = self.head(guard); 104 | node.deref().next.store(head, Release); 105 | if self.head.compare_and_set(head, node, Release, guard).is_ok() 106 | { 107 | return; 108 | } 109 | } 110 | } 111 | } 112 | 113 | /// Clears the stack and returns all items 114 | pub(crate) fn take_iter<'a>( 115 | &self, 116 | guard: &'a Guard, 117 | ) -> impl Iterator { 118 | debug_delay(); 119 | let node = self.head.swap(Shared::null(), Release, guard); 120 | 121 | let iter = Iter { inner: node, guard }; 122 | 123 | if !node.is_null() { 124 | unsafe { 125 | guard.defer_destroy(node); 126 | } 127 | } 128 | 129 | iter 130 | } 131 | 132 | /// Pop the next item off the stack. Returns None if nothing is there. 133 | pub(crate) fn pop(&self, guard: &Guard) -> Option { 134 | use std::ptr; 135 | use std::sync::atomic::Ordering::SeqCst; 136 | debug_delay(); 137 | let mut head = self.head(guard); 138 | loop { 139 | match unsafe { head.as_ref() } { 140 | Some(h) => { 141 | let next = h.next.load(Acquire, guard); 142 | match self.head.compare_and_set(head, next, Release, guard) 143 | { 144 | Ok(_) => unsafe { 145 | // we unset the next pointer before destruction 146 | // to avoid double-frees. 147 | h.next.store(Shared::default(), SeqCst); 148 | guard.defer_destroy(head); 149 | return Some(ptr::read(&h.inner)); 150 | }, 151 | Err(h) => head = h.current, 152 | } 153 | } 154 | None => return None, 155 | } 156 | } 157 | } 158 | 159 | /// Returns the current head pointer of the stack, which can 160 | /// later be used as the key for cas and cap operations. 161 | pub(crate) fn head<'g>(&self, guard: &'g Guard) -> Shared<'g, Node> { 162 | self.head.load(Acquire, guard) 163 | } 164 | } 165 | 166 | /// An iterator over nodes in a lock-free stack. 167 | pub struct Iter<'a, T> 168 | where 169 | T: Send + 'static + Sync, 170 | { 171 | inner: Shared<'a, Node>, 172 | guard: &'a Guard, 173 | } 174 | 175 | impl<'a, T> Iter<'a, T> 176 | where 177 | T: 'a + Send + 'static + Sync, 178 | { 179 | /// Creates a `Iter` from a pointer to one. 
180 | pub(crate) fn from_ptr<'b>( 181 | ptr: Shared<'b, Node>, 182 | guard: &'b Guard, 183 | ) -> Iter<'b, T> { 184 | Iter { inner: ptr, guard } 185 | } 186 | } 187 | 188 | impl<'a, T> Iterator for Iter<'a, T> 189 | where 190 | T: Send + 'static + Sync, 191 | { 192 | type Item = &'a T; 193 | 194 | fn next(&mut self) -> Option { 195 | debug_delay(); 196 | if self.inner.is_null() { 197 | None 198 | } else { 199 | unsafe { 200 | let ret = &self.inner.deref().inner; 201 | self.inner = self.inner.deref().next.load(Acquire, self.guard); 202 | Some(ret) 203 | } 204 | } 205 | } 206 | 207 | fn size_hint(&self) -> (usize, Option) { 208 | let mut size = 0; 209 | let mut cursor = self.inner; 210 | 211 | while !cursor.is_null() { 212 | unsafe { 213 | cursor = cursor.deref().next.load(Acquire, self.guard); 214 | } 215 | size += 1; 216 | } 217 | 218 | (size, Some(size)) 219 | } 220 | } 221 | 222 | #[test] 223 | #[cfg(not(miri))] // can't create threads 224 | fn basic_functionality() { 225 | use crossbeam_epoch::pin; 226 | use crossbeam_utils::CachePadded; 227 | use std::sync::Arc; 228 | use std::thread; 229 | 230 | let guard = pin(); 231 | let ll = Arc::new(Stack::default()); 232 | assert_eq!(ll.pop(&guard), None); 233 | ll.push(CachePadded::new(1), &guard); 234 | let ll2 = Arc::clone(&ll); 235 | let t = thread::spawn(move || { 236 | let guard = pin(); 237 | ll2.push(CachePadded::new(2), &guard); 238 | ll2.push(CachePadded::new(3), &guard); 239 | ll2.push(CachePadded::new(4), &guard); 240 | guard.flush(); 241 | }); 242 | t.join().unwrap(); 243 | ll.push(CachePadded::new(5), &guard); 244 | assert_eq!(ll.pop(&guard), Some(CachePadded::new(5))); 245 | assert_eq!(ll.pop(&guard), Some(CachePadded::new(4))); 246 | let ll3 = Arc::clone(&ll); 247 | let t = thread::spawn(move || { 248 | let guard = pin(); 249 | assert_eq!(ll3.pop(&guard), Some(CachePadded::new(3))); 250 | assert_eq!(ll3.pop(&guard), Some(CachePadded::new(2))); 251 | guard.flush(); 252 | }); 253 | t.join().unwrap(); 254 | assert_eq!(ll.pop(&guard), Some(CachePadded::new(1))); 255 | let ll4 = Arc::clone(&ll); 256 | let t = thread::spawn(move || { 257 | let guard = pin(); 258 | assert_eq!(ll4.pop(&guard), None); 259 | guard.flush(); 260 | }); 261 | t.join().unwrap(); 262 | drop(ll); 263 | guard.flush(); 264 | drop(guard); 265 | } 266 | -------------------------------------------------------------------------------- /src/arc.rs: -------------------------------------------------------------------------------- 1 | #![allow(unsafe_code)] 2 | 3 | /// We create our own `Arc` because we never use the weak 4 | /// count on the std `Arc`, but we use a LOT of `Arc`'s, so 5 | /// the extra 8 bytes turn into a huge overhead. 6 | use std::{ 7 | alloc::{alloc, dealloc, Layout}, 8 | convert::TryFrom, 9 | fmt::{self, Debug}, 10 | mem, 11 | ops::Deref, 12 | ptr, 13 | sync::atomic::{AtomicUsize, Ordering}, 14 | }; 15 | 16 | // we make this repr(C) because we do a raw 17 | // write to the beginning where we expect 18 | // the rc to be. 
19 | #[repr(C)] 20 | struct ArcInner { 21 | rc: AtomicUsize, 22 | inner: T, 23 | } 24 | 25 | pub struct Arc { 26 | ptr: *mut ArcInner, 27 | } 28 | 29 | unsafe impl Send for Arc {} 30 | 31 | unsafe impl Sync for Arc {} 32 | 33 | impl Debug for Arc { 34 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> Result<(), fmt::Error> { 35 | Debug::fmt(&**self, f) 36 | } 37 | } 38 | 39 | impl Arc { 40 | pub fn new(inner: T) -> Arc { 41 | let bx = Box::new(ArcInner { inner, rc: AtomicUsize::new(1) }); 42 | let ptr = Box::into_raw(bx); 43 | Arc { ptr } 44 | } 45 | 46 | // See std::sync::arc::Arc::copy_from_slice, 47 | // "Unsafe because the caller must either take ownership or bind `T: Copy`" 48 | unsafe fn copy_from_slice(s: &[T]) -> Arc<[T]> { 49 | let align = 50 | std::cmp::max(mem::align_of::(), mem::align_of::()); 51 | 52 | let rc_width = std::cmp::max(align, mem::size_of::()); 53 | let data_width = mem::size_of::().checked_mul(s.len()).unwrap(); 54 | 55 | let size_unpadded = rc_width.checked_add(data_width).unwrap(); 56 | // Pad size out to alignment 57 | let size_padded = (size_unpadded + align - 1) & !(align - 1); 58 | 59 | let layout = Layout::from_size_align(size_padded, align).unwrap(); 60 | 61 | let ptr = alloc(layout); 62 | 63 | assert!(!ptr.is_null(), "failed to allocate Arc"); 64 | #[allow(clippy::cast_ptr_alignment)] 65 | ptr::write(ptr as _, AtomicUsize::new(1)); 66 | 67 | let data_ptr = ptr.add(rc_width); 68 | ptr::copy_nonoverlapping(s.as_ptr(), data_ptr as _, s.len()); 69 | 70 | let fat_ptr: *const ArcInner<[T]> = Arc::fatten(ptr, s.len()); 71 | 72 | Arc { ptr: fat_ptr as *mut _ } 73 | } 74 | 75 | /// 76 | #[allow(trivial_casts)] 77 | fn fatten(data: *const u8, len: usize) -> *const ArcInner<[T]> { 78 | // Requirements of slice::from_raw_parts. 79 | assert!(!data.is_null()); 80 | assert!(isize::try_from(len).is_ok()); 81 | 82 | let slice = 83 | unsafe { core::slice::from_raw_parts(data as *const (), len) }; 84 | slice as *const [()] as *const _ 85 | } 86 | 87 | pub fn into_raw(arc: Arc) -> *const T { 88 | let ptr = unsafe { &(*arc.ptr).inner }; 89 | #[allow(clippy::mem_forget)] 90 | mem::forget(arc); 91 | ptr 92 | } 93 | 94 | pub unsafe fn from_raw(ptr: *const T) -> Arc { 95 | let align = 96 | std::cmp::max(mem::align_of::(), mem::align_of::()); 97 | 98 | let rc_width = std::cmp::max(align, mem::size_of::()); 99 | 100 | let sub_ptr = (ptr as *const u8).sub(rc_width) as *mut ArcInner; 101 | 102 | Arc { ptr: sub_ptr } 103 | } 104 | } 105 | 106 | impl Arc { 107 | pub fn strong_count(arc: &Arc) -> usize { 108 | unsafe { (*arc.ptr).rc.load(Ordering::Acquire) } 109 | } 110 | 111 | pub fn get_mut(arc: &mut Arc) -> Option<&mut T> { 112 | if Arc::strong_count(arc) == 1 { 113 | Some(unsafe { &mut arc.ptr.as_mut().unwrap().inner }) 114 | } else { 115 | None 116 | } 117 | } 118 | } 119 | 120 | impl Arc { 121 | pub fn make_mut(arc: &mut Arc) -> &mut T { 122 | if Arc::strong_count(arc) != 1 { 123 | *arc = Arc::new((**arc).clone()); 124 | assert_eq!(Arc::strong_count(arc), 1); 125 | } 126 | Arc::get_mut(arc).unwrap() 127 | } 128 | } 129 | 130 | impl Default for Arc { 131 | fn default() -> Arc { 132 | Arc::new(T::default()) 133 | } 134 | } 135 | 136 | impl Clone for Arc { 137 | fn clone(&self) -> Arc { 138 | // safe to use Relaxed ordering below because 139 | // of the required synchronization for passing 140 | // any objects to another thread. 
141 | let last_count = 142 | unsafe { (*self.ptr).rc.fetch_add(1, Ordering::Relaxed) }; 143 | 144 | if last_count == usize::max_value() { 145 | std::process::abort(); 146 | } 147 | 148 | Arc { ptr: self.ptr } 149 | } 150 | } 151 | 152 | impl Drop for Arc { 153 | fn drop(&mut self) { 154 | unsafe { 155 | let rc = (*self.ptr).rc.fetch_sub(1, Ordering::Release) - 1; 156 | if rc == 0 { 157 | std::sync::atomic::fence(Ordering::Acquire); 158 | Box::from_raw(self.ptr); 159 | } 160 | } 161 | } 162 | } 163 | 164 | impl From<&[T]> for Arc<[T]> { 165 | #[inline] 166 | fn from(s: &[T]) -> Arc<[T]> { 167 | unsafe { Arc::copy_from_slice(s) } 168 | } 169 | } 170 | 171 | #[allow(clippy::fallible_impl_from)] 172 | impl From> for Arc<[T]> { 173 | #[inline] 174 | fn from(b: Box<[T]>) -> Arc<[T]> { 175 | let len = b.len(); 176 | unsafe { 177 | let src = Box::into_raw(b); 178 | let value_layout = Layout::for_value(&*src); 179 | let align = std::cmp::max( 180 | value_layout.align(), 181 | mem::align_of::(), 182 | ); 183 | let rc_width = std::cmp::max(align, mem::size_of::()); 184 | let unpadded_size = 185 | rc_width.checked_add(value_layout.size()).unwrap(); 186 | // pad the total `Arc` allocation size to the alignment of 187 | // `max(value, AtomicUsize)` 188 | let size = (unpadded_size + align - 1) & !(align - 1); 189 | let dst_layout = Layout::from_size_align(size, align).unwrap(); 190 | let dst = alloc(dst_layout); 191 | assert!(!dst.is_null(), "failed to allocate Arc"); 192 | 193 | #[allow(clippy::cast_ptr_alignment)] 194 | ptr::write(dst as _, AtomicUsize::new(1)); 195 | let data_ptr = dst.add(rc_width); 196 | ptr::copy_nonoverlapping( 197 | src as *const u8, 198 | data_ptr, 199 | value_layout.size(), 200 | ); 201 | 202 | // free the old box memory without running Drop 203 | if value_layout.size() != 0 { 204 | dealloc(src as *mut u8, value_layout); 205 | } 206 | 207 | let fat_ptr: *const ArcInner<[T]> = Arc::fatten(dst, len); 208 | 209 | Arc { ptr: fat_ptr as *mut _ } 210 | } 211 | } 212 | } 213 | 214 | #[test] 215 | fn boxed_slice_to_arc_slice() { 216 | let box1: Box<[u8]> = Box::new([1, 2, 3]); 217 | let arc1: Arc<[u8]> = box1.into(); 218 | assert_eq!(&*arc1, &*vec![1, 2, 3]); 219 | let box2: Box<[u8]> = Box::new([]); 220 | let arc2: Arc<[u8]> = box2.into(); 221 | assert_eq!(&*arc2, &*vec![]); 222 | let box3: Box<[u64]> = Box::new([1, 2, 3]); 223 | let arc3: Arc<[u64]> = box3.into(); 224 | assert_eq!(&*arc3, &*vec![1, 2, 3]); 225 | } 226 | 227 | impl From> for Arc<[T]> { 228 | #[inline] 229 | fn from(mut v: Vec) -> Arc<[T]> { 230 | unsafe { 231 | let arc = Arc::copy_from_slice(&v); 232 | 233 | // Allow the Vec to free its memory, but not destroy its contents 234 | v.set_len(0); 235 | 236 | arc 237 | } 238 | } 239 | } 240 | 241 | impl Deref for Arc { 242 | type Target = T; 243 | 244 | fn deref(&self) -> &T { 245 | unsafe { &(*self.ptr).inner } 246 | } 247 | } 248 | 249 | impl std::borrow::Borrow for Arc { 250 | fn borrow(&self) -> &T { 251 | &**self 252 | } 253 | } 254 | 255 | impl AsRef for Arc { 256 | fn as_ref(&self) -> &T { 257 | &**self 258 | } 259 | } 260 | --------------------------------------------------------------------------------
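Since src/arc.rs closes the listing, a minimal usage sketch of this weak-count-free `Arc` follows, in the spirit of the `boxed_slice_to_arc_slice` test above. It assumes only the APIs defined in that file (`new`, `clone`, `strong_count`, `get_mut`, `make_mut`, `Deref`, and the `From` conversions); the test name is made up.

#[test]
fn arc_usage_sketch() {
    // hypothetical test; exercises only APIs defined in src/arc.rs

    // plain values behave like std::sync::Arc, minus the weak count
    let a = Arc::new(5_u64);
    let b = a.clone();
    assert_eq!(Arc::strong_count(&a), 2);
    assert_eq!(*b, 5);
    drop(b);
    assert_eq!(Arc::strong_count(&a), 1);

    // get_mut only succeeds while the strong count is exactly 1
    let mut c = Arc::new(String::from("sled"));
    assert!(Arc::get_mut(&mut c).is_some());
    let d = c.clone();
    assert!(Arc::get_mut(&mut c).is_none());

    // make_mut falls back to clone-on-write when the Arc is shared
    Arc::make_mut(&mut c).push_str("-db");
    assert_eq!(&*c, "sled-db");
    assert_eq!(&*d, "sled");

    // Vec and slice conversions route through copy_from_slice
    let s: Arc<[u8]> = vec![1, 2, 3].into();
    assert_eq!(&*s, &[1u8, 2, 3][..]);
}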
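The epoch experiment in experiments/epoch/src/main.rs exposes just two entry points, `Collector::pin` and `Guard::defer`. Below is a minimal, hypothetical sketch of the intended call pattern, assuming only those APIs; the function name and the retired allocation are invented for illustration, and it uses `std::sync::Arc`, not the crate-local `Arc` above.

fn epoch_usage_sketch() {
    // share one Collector between threads (std Arc, not the crate's Arc)
    let collector = std::sync::Arc::new(Collector::default());

    let handles: Vec<_> = (0..4)
        .map(|_| {
            let collector = collector.clone();
            std::thread::spawn(move || {
                // check into the current epoch before touching shared state
                let guard = collector.pin();

                // pretend this allocation was just unlinked from a shared
                // structure; defer its destruction so that threads pinned
                // to earlier epochs can still read it safely
                let retired = Box::new(vec![0_u8; 64]);
                guard.defer(move || drop(retired));

                // the deferred closure is queued into the epoch when the
                // guard is dropped at the end of this scope
            })
        })
        .collect();

    for handle in handles {
        handle.join().unwrap();
    }
}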
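For the error plumbing in src/result.rs, the `From<Error> for io::Error` impl is what lets `?` carry sled's fail-stop errors into `std::io::Result` call sites. A small sketch, assuming only the variants and impls shown in that file; the `classify` and `open_as_io` helpers are hypothetical.

fn classify(err: &Error) -> &'static str {
    // map each fail-stop variant to a short description
    match err {
        Error::CollectionNotFound(_) => "collection no longer exists",
        Error::Unsupported(_) => "unsupported usage",
        Error::ReportableBug(_) => "unexpected bug, please file an issue",
        Error::Io(_) => "underlying I/O failure",
        Error::Corruption { .. } => "storage corruption detected",
        #[cfg(feature = "failpoints")]
        Error::FailPoint => "failpoint triggered (testing only)",
    }
}

fn open_as_io() -> std::io::Result<()> {
    // the crate-level alias: Result<T> = std::result::Result<T, Error>
    let res: Result<()> = Err(Error::Unsupported("demo".into()));
    // `?` converts Error into io::Error via the From impl above
    res?;
    Ok(())
}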