├── rust-toolchain.toml ├── docs ├── images │ ├── 0001-query-lifecycle.png │ ├── 0002-predicate-pushdown.png │ ├── 0001-query-lifecycle.puml │ └── 0002-predicate-pushdown.puml └── rfcs │ ├── 0000-rfc-style-guide.md │ └── 0006-mvcc-sidecar.md ├── src ├── tests_internal │ ├── common │ │ └── mod.rs │ ├── mod.rs │ ├── compaction_loop_spawn.rs │ ├── wasm_compat_e2e.rs │ ├── conflict_e2e.rs │ ├── backend.rs │ ├── time_travel_e2e.rs │ ├── wal_rotation_e2e.rs │ └── wal_policy_e2e.rs ├── db │ ├── tests │ │ ├── mod.rs │ │ ├── core │ │ │ ├── mod.rs │ │ │ ├── common.rs │ │ │ ├── scan.rs │ │ │ └── flush.rs │ │ └── wasm_web.rs │ ├── error.rs │ └── compaction.rs ├── inmem │ ├── mod.rs │ ├── mutable │ │ ├── mod.rs │ │ └── metrics.rs │ └── immutable │ │ └── mod.rs ├── key │ ├── mod.rs │ ├── heap_size.rs │ └── ts.rs ├── ondisk │ └── mod.rs ├── mutation.rs ├── prelude.rs ├── compaction │ ├── handle.rs │ ├── mod.rs │ ├── scheduler.rs │ └── minor.rs ├── id.rs ├── manifest │ ├── version.rs │ ├── mod.rs │ └── codec.rs ├── extractor │ ├── mod.rs │ └── errors.rs ├── wal │ ├── metrics.rs │ └── manifest_ext.rs ├── test.rs ├── query │ ├── mod.rs │ └── scan.rs ├── mvcc │ └── mod.rs ├── mode │ ├── dyn_config.rs │ └── mod.rs └── schema.rs ├── .gitignore ├── examples ├── cloudflare-worker │ ├── .gitignore │ ├── wrangler.toml │ └── Cargo.toml ├── 01_basic.rs ├── 02b_snapshot.rs ├── 02_transaction.rs ├── 10_dynamic │ ├── 10b_dynamic_metadata.rs │ ├── 10a_dynamic_basic.rs │ ├── 10c_dynamic_composite.rs │ └── 10d_dynamic_transaction.rs ├── 04_s3.rs ├── 07_streaming.rs ├── 06_composite_key.rs ├── 08_nested_types.rs ├── 09_time_travel.rs └── 03_filter.rs ├── rustfmt.toml ├── predicate ├── Cargo.toml └── src │ ├── lib.rs │ └── core │ ├── operand.rs │ ├── row_set.rs │ ├── builder.rs │ └── visitor.rs ├── tests ├── common │ └── mod.rs ├── web_executor.rs ├── README.md ├── s3_smoke.sh ├── s3_localstack_env.sh └── s3_smoke.rs ├── .githooks └── pre-commit └── Cargo.toml /rust-toolchain.toml: -------------------------------------------------------------------------------- 1 | [toolchain] 2 | channel = "1.90" 3 | components = ["clippy", "rust-analyzer", "rustfmt"] 4 | -------------------------------------------------------------------------------- /docs/images/0001-query-lifecycle.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tonbo-io/tonbo/HEAD/docs/images/0001-query-lifecycle.png -------------------------------------------------------------------------------- /docs/images/0002-predicate-pushdown.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tonbo-io/tonbo/HEAD/docs/images/0002-predicate-pushdown.png -------------------------------------------------------------------------------- /src/tests_internal/common/mod.rs: -------------------------------------------------------------------------------- 1 | //! Common test utilities for integration tests. 
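//!
//! A minimal usage sketch (hedged illustration only; it assumes `arrow_schema::{DataType, Field}`
//! are in scope and follows the `config_with_pk(fields, primary_key)` signature of the helper
//! re-exported below):
//!
//! ```rust,ignore
//! use arrow_schema::{DataType, Field};
//!
//! // Build a DynMode config keyed on `id`; `score` stays a regular nullable column.
//! let cfg = config_with_pk(
//!     vec![
//!         Field::new("id", DataType::Utf8, false),
//!         Field::new("score", DataType::Int64, true),
//!     ],
//!     &["id"],
//! );
//! ```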
2 | 3 | pub use crate::test_support::config_with_pk; 4 | -------------------------------------------------------------------------------- /src/db/tests/mod.rs: -------------------------------------------------------------------------------- 1 | mod core; 2 | mod wal_gc; 3 | mod wal_recovery; 4 | 5 | #[cfg(all(target_arch = "wasm32", feature = "web"))] 6 | mod wasm_web; 7 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | /target 2 | Cargo.lock 3 | .serena 4 | 5 | /.idea 6 | 7 | # python 8 | __pycache__ 9 | *.so 10 | 11 | # node 12 | node_modules 13 | dist 14 | 15 | .DS_Store 16 | -------------------------------------------------------------------------------- /src/db/tests/core/mod.rs: -------------------------------------------------------------------------------- 1 | mod common; 2 | mod compaction; 3 | mod flush; 4 | mod ingest; 5 | mod metadata; 6 | mod recovery; 7 | mod scan; 8 | mod wal; 9 | mod wal_pruning; 10 | -------------------------------------------------------------------------------- /examples/cloudflare-worker/.gitignore: -------------------------------------------------------------------------------- 1 | # Build artifacts 2 | /target/ 3 | /build/ 4 | 5 | # Wrangler 6 | .wrangler/ 7 | 8 | # Local development secrets (never commit real credentials!) 9 | .dev.vars 10 | 11 | # Rust lock file (optional, can be committed for reproducibility) 12 | # Cargo.lock 13 | -------------------------------------------------------------------------------- /rustfmt.toml: -------------------------------------------------------------------------------- 1 | comment_width = 100 2 | edition = "2021" 3 | format_code_in_doc_comments = true 4 | format_strings = true 5 | group_imports = "StdExternalCrate" 6 | imports_granularity = "Crate" 7 | max_width = 100 8 | normalize_comments = true 9 | normalize_doc_attributes = true 10 | wrap_comments = true 11 | -------------------------------------------------------------------------------- /src/inmem/mod.rs: -------------------------------------------------------------------------------- 1 | //! In-memory structures (mutable and immutable memtables). 2 | //! 3 | //! - `mutable/` contains the columnar mutable memtable with a last-writer key index. 4 | //! - `immutable/` contains typed arrays and the zero-copy immutable memtable. 5 | 6 | pub(crate) mod immutable; 7 | pub(crate) mod mutable; 8 | pub(crate) mod policy; 9 | -------------------------------------------------------------------------------- /src/tests_internal/mod.rs: -------------------------------------------------------------------------------- 1 | #![cfg(test)] 2 | 3 | pub mod backend; 4 | pub mod compaction_gc_e2e; 5 | pub mod compaction_loop_spawn; 6 | pub mod conflict_e2e; 7 | pub mod durability_public; 8 | pub mod public_api_e2e; 9 | pub mod read_smoke; 10 | pub mod scan_plan_e2e; 11 | pub mod time_travel_e2e; 12 | pub mod wal_policy_e2e; 13 | pub mod wal_rotation_e2e; 14 | pub mod wasm_compat_e2e; 15 | -------------------------------------------------------------------------------- /src/inmem/mutable/mod.rs: -------------------------------------------------------------------------------- 1 | //! Mutable memtable(s). 2 | //! 3 | //! This module implements the columnar-style mutable memtable used by Tonbo's 4 | //! dynamic layout. The earlier trait-based abstraction has been flattened: the 5 | //! engine now only builds the dynamic `DynMem` implementation. 
6 | 7 | pub(crate) mod memtable; 8 | mod metrics; 9 | 10 | pub(crate) use memtable::DynMem; 11 | pub(crate) use metrics::MutableMemTableMetrics; 12 | -------------------------------------------------------------------------------- /predicate/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | edition = "2024" 3 | name = "tonbo-predicate" 4 | version = "0.1.0" 5 | description = "Predicate evaluation for Tonbo embedded database" 6 | license = "Apache-2.0" 7 | repository = "https://github.com/tonbo-io/tonbo" 8 | readme = "../README.md" 9 | 10 | [lib] 11 | path = "src/lib.rs" 12 | 13 | [features] 14 | default = [] 15 | 16 | [dependencies] 17 | roaring = "0.11" 18 | typed-arrow-dyn = { version = "0.0.6", features = ["serde"] } 19 | -------------------------------------------------------------------------------- /src/inmem/immutable/mod.rs: -------------------------------------------------------------------------------- 1 | //! Read-only immutable memtables. 2 | //! 3 | //! Currently only the dynamic runtime-schema layout is wired through. 4 | 5 | pub(crate) mod memtable; 6 | 7 | /// Immutable segment emitted by sealing the dynamic mutable memtable. 8 | pub(crate) type ImmutableSegment = memtable::ImmutableMemTable; 9 | 10 | /// Lightweight pruning helper; currently returns all segment indexes. 11 | pub(crate) fn prune_segments(segments: &[&ImmutableSegment]) -> Vec { 12 | (0..segments.len()).collect() 13 | } 14 | -------------------------------------------------------------------------------- /src/key/mod.rs: -------------------------------------------------------------------------------- 1 | //! Zero-copy key views and owned key wrappers over Arrow buffers. 2 | //! 3 | //! This module introduces the skeleton for the zero-copy key design. 4 | //! The goal is to keep hot-path key handling on borrow-based views 5 | //! that reference Arrow buffers directly while providing an owned form only 6 | //! where durability requires it. 7 | mod heap_size; 8 | mod owned; 9 | mod row; 10 | mod ts; 11 | 12 | pub use owned::{KeyOwned, KeyOwnedError}; 13 | pub use row::{KeyRow, KeyRowError}; 14 | pub use ts::{KeyTsOwned, KeyTsViewRaw}; 15 | -------------------------------------------------------------------------------- /src/ondisk/mod.rs: -------------------------------------------------------------------------------- 1 | //! On-disk SST scaffolding (writers/readers, merge, scan skeletons). 2 | //! 3 | //! This wip module captures the entry points for durable table structures such 4 | //! as SSTables. The concrete encoding and IO plumbing will land in follow-up 5 | //! changes; for now we outline the core types so downstream components can 6 | //! start wiring mode-agnostic APIs. 7 | 8 | /// Sorted-string-table primitives and planning helpers. 9 | pub mod sstable; 10 | 11 | /// Merge pipeline scaffolding used by major compaction. 12 | pub mod merge; 13 | 14 | /// Scan stream for a sstable 15 | pub mod scan; 16 | -------------------------------------------------------------------------------- /predicate/src/lib.rs: -------------------------------------------------------------------------------- 1 | #![deny(missing_docs)] 2 | //! Tonbo predicate facade crate. 3 | //! 4 | //! This crate is Arrow-first: predicate operands and literals are expressed 5 | //! directly in terms of `typed-arrow-dyn` cells, and evaluation assumes Arrow 6 | //! semantics (including NULL handling and mixed-width numeric coercions). There 7 | //! 
is no alternate storage or layout backend — keep the surface tight and Arrow 8 | //! native. 9 | 10 | mod core; 11 | 12 | pub use core::{ 13 | BitmapRowSet, ColumnRef, ComparisonOp, Operand, Predicate, PredicateNode, PredicateVisitor, 14 | RowId, RowIdIter, RowSet, ScalarValue, ScalarValueRef, VisitOutcome, 15 | }; 16 | -------------------------------------------------------------------------------- /src/mutation.rs: -------------------------------------------------------------------------------- 1 | use std::fmt; 2 | 3 | /// Generic mutation container used across dynamic ingest paths. 4 | #[derive(Clone, PartialEq, Eq)] 5 | pub(crate) enum DynMutation { 6 | /// Insert or update payload materialised at commit. 7 | Upsert(U), 8 | /// Logical delete recorded at commit. 9 | Delete(D), 10 | } 11 | 12 | impl fmt::Debug for DynMutation { 13 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { 14 | match self { 15 | DynMutation::Upsert(_) => f.write_str("DynMutation::Upsert"), 16 | DynMutation::Delete(_) => f.write_str("DynMutation::Delete"), 17 | } 18 | } 19 | } 20 | -------------------------------------------------------------------------------- /src/prelude.rs: -------------------------------------------------------------------------------- 1 | //! Convenience re-exports for common Tonbo usage. 2 | //! 3 | //! # Usage 4 | //! 5 | //! ```rust,ignore 6 | //! use tonbo::prelude::*; 7 | //! 8 | //! #[derive(Record)] 9 | //! struct User { 10 | //! #[metadata(k = "tonbo.key", v = "true")] 11 | //! id: String, 12 | //! name: String, 13 | //! } 14 | //! 15 | //! let db = DbBuilder::from_schema(User::schema())? 16 | //! .on_disk("/tmp/users")?.open().await?; 17 | //! ``` 18 | 19 | #[cfg(feature = "typed-arrow")] 20 | pub use typed_arrow::{Record, prelude::*, schema::SchemaMeta}; 21 | 22 | pub use crate::db::{ 23 | BatchesThreshold, ColumnRef, CommitAckMode, DB, DbBuilder, Predicate, ScalarValue, 24 | }; 25 | -------------------------------------------------------------------------------- /src/compaction/handle.rs: -------------------------------------------------------------------------------- 1 | //! Unified handle for background compaction workers. 2 | 3 | use std::marker::PhantomData; 4 | 5 | use fusio::executor::Executor; 6 | use futures::future::AbortHandle; 7 | 8 | /// Handle to a background compaction worker. 9 | /// 10 | /// Provides control over the worker lifecycle. The worker is automatically 11 | /// aborted when the handle is dropped. 12 | pub(crate) struct CompactionHandle { 13 | abort: AbortHandle, 14 | _marker: PhantomData, 15 | } 16 | 17 | impl CompactionHandle { 18 | /// Create a new compaction handle. 
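    ///
    /// Only the abort handle is retained; the optional join handle is currently unused, and
    /// dropping the returned value aborts the worker (see the `Drop` impl below).
    ///
    /// A construction sketch (hedged; `MyExecutor` is a placeholder for any executor type that
    /// satisfies the `Executor` bound):
    ///
    /// ```rust,ignore
    /// use futures::future::AbortHandle;
    ///
    /// let (abort, registration) = AbortHandle::new_pair();
    /// // Spawn `Abortable::new(worker_future, registration)` on the executor, then keep:
    /// let handle = CompactionHandle::<MyExecutor>::new(abort, None);
    /// drop(handle); // aborts the registered worker future
    /// ```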
19 | pub(crate) fn new(abort: AbortHandle, _join: Option<E::JoinHandle<()>>) -> Self { 20 | Self { 21 | abort, 22 | _marker: PhantomData, 23 | } 24 | } 25 | } 26 | 27 | impl<E: Executor> Drop for CompactionHandle<E> { 28 | fn drop(&mut self) { 29 | self.abort.abort(); 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /docs/images/0001-query-lifecycle.puml: -------------------------------------------------------------------------------- 1 | @startuml 2 | ' Automatically number the steps 3 | autonumber 4 | 5 | ' Define participants (poles) 6 | participant User 7 | participant SQL 8 | participant QueryEngine 9 | participant Predicate 10 | participant Tonbo 11 | participant Parquet 12 | 13 | ' Define the sequence of events 14 | SQL -> QueryEngine: SQL 15 | QueryEngine -> Predicate : scan() (translation) 16 | Predicate -> Tonbo : [predicates], projection 17 | Tonbo -> Parquet : read metadata & prune 18 | Parquet -> Tonbo : RowSet 19 | Tonbo -> Predicate: scan plan with statistics 20 | Predicate -> QueryEngine : ExecutionPlan (translation) 21 | QueryEngine -> QueryEngine : plan optimization 22 | QueryEngine -> Predicate : optimized ExecutionPlan 23 | Predicate -> Tonbo : scan plan (translation) 24 | Tonbo -> Tonbo: execute() 25 | Tonbo -> Parquet : push down filter 26 | Parquet -> Tonbo: materialized record batch 27 | Tonbo -> Tonbo: aggregate 28 | Tonbo -> User : Data 29 | 30 | @enduml -------------------------------------------------------------------------------- /tests/common/mod.rs: -------------------------------------------------------------------------------- 1 | //! Common test utilities for integration tests. 2 | 3 | use std::sync::Arc; 4 | 5 | use arrow_schema::{Field, Schema}; 6 | use tonbo::{db::DynModeConfig, schema::SchemaBuilder}; 7 | 8 | /// Convenience helper that builds a DynMode configuration with embedded PK metadata. 9 | pub fn config_with_pk(fields: Vec<Field>, primary_key: &[&str]) -> DynModeConfig { 10 | assert!( 11 | !primary_key.is_empty(), 12 | "schema builder requires at least one primary-key column" 13 | ); 14 | 15 | let schema = Arc::new(Schema::new(fields)); 16 | let builder = SchemaBuilder::from_schema(schema); 17 | let builder = if primary_key.len() == 1 { 18 | builder.primary_key(primary_key[0].to_string()) 19 | } else { 20 | builder.composite_key(primary_key.iter().copied().collect::<Vec<_>>()) 21 | } 22 | .with_metadata(); 23 | 24 | builder 25 | .build() 26 | .expect("schema builder configuration should succeed") 27 | } 28 | -------------------------------------------------------------------------------- /src/db/error.rs: -------------------------------------------------------------------------------- 1 | use typed_arrow_dyn::DynViewError; 2 | 3 | use crate::{ 4 | db::KeyExtractError, manifest::ManifestError, ondisk::sstable::SsTableError, 5 | query::stream::StreamError, transaction::SnapshotError, 6 | }; 7 | 8 | /// Error returned for DB 9 | #[derive(Debug, thiserror::Error)] 10 | pub enum DBError { 11 | /// Key extract error 12 | #[error("key extract error: {0}")] 13 | Key(#[from] KeyExtractError), 14 | /// Manifest error 15 | #[error("manifest error: {0}")] 16 | Manifest(#[from] ManifestError), 17 | /// Read stream composition failed. 18 | #[error("stream error: {0}")] 19 | Stream(#[from] StreamError), 20 | /// SSTable read/write error. 21 | #[error("sstable error: {0}")] 22 | SsTable(#[from] SsTableError), 23 | /// Snapshot creation error. 24 | #[error("snapshot error: {0}")] 25 | Snapshot(#[from] SnapshotError), 26 | /// Dynamic view error.
27 | #[error("dynamic view error: {0}")] 28 | DynView(#[from] DynViewError), 29 | } 30 | -------------------------------------------------------------------------------- /src/compaction/mod.rs: -------------------------------------------------------------------------------- 1 | //! Compaction coordinators and planners for merging SSTables across all backends. 2 | //! 3 | //! These helpers sit on top of the in-memory staging surfaces and decide when 4 | //! to drain immutable runs into SSTables, whether those SSTables live on local 5 | //! disk or object storage (S3-compatible) via Fusio-backed Parquet writers. 6 | 7 | /// Compaction driver for orchestrating compaction operations. 8 | mod driver; 9 | /// Compaction executor interfaces. 10 | pub(crate) mod executor; 11 | /// Unified handle for background compaction workers. 12 | mod handle; 13 | /// Naïve minor-compaction driver for flushing immutable memtables. 14 | mod minor; 15 | /// Pure orchestration functions for version/outcome manipulation. 16 | pub(crate) mod orchestrator; 17 | /// Leveled compaction planning helpers. 18 | pub mod planner; 19 | /// Scheduler scaffolding for background/remote compaction (native builds only for now). 20 | mod scheduler; 21 | 22 | pub(crate) use driver::CompactionDriver; 23 | pub(crate) use handle::CompactionHandle; 24 | pub(crate) use minor::MinorCompactor; 25 | -------------------------------------------------------------------------------- /examples/cloudflare-worker/wrangler.toml: -------------------------------------------------------------------------------- 1 | name = "tonbo-example-worker" 2 | main = "build/worker/shim.mjs" 3 | compatibility_date = "2024-01-01" 4 | 5 | # Required: Tell wrangler this is a Rust Workers project 6 | [build] 7 | command = "cargo install -q worker-build && worker-build --release" 8 | 9 | # For local development with LocalStack: 10 | [vars] 11 | TONBO_S3_ENDPOINT = "http://localhost:4566" 12 | TONBO_S3_BUCKET = "tonbo-test" 13 | TONBO_S3_REGION = "us-east-1" 14 | 15 | # For production with Cloudflare R2, change to: 16 | # TONBO_S3_ENDPOINT = "https://YOUR_ACCOUNT_ID.r2.cloudflarestorage.com" 17 | # TONBO_S3_BUCKET = "your-bucket-name" 18 | # TONBO_S3_REGION = "auto" 19 | 20 | # Secrets (set via `npx wrangler secret put`): 21 | # - TONBO_S3_ACCESS_KEY 22 | # - TONBO_S3_SECRET_KEY 23 | 24 | # For local development with LocalStack, create .dev.vars: 25 | # TONBO_S3_ACCESS_KEY=test 26 | # TONBO_S3_SECRET_KEY=test 27 | # And override vars in wrangler.toml or use --var flag: 28 | # npx wrangler dev --var TONBO_S3_ENDPOINT:http://localhost:4566 --var TONBO_S3_BUCKET:tonbo-test --var TONBO_S3_REGION:us-east-1 29 | -------------------------------------------------------------------------------- /src/db/tests/core/common.rs: -------------------------------------------------------------------------------- 1 | use std::{ 2 | fs, 3 | path::{Path, PathBuf}, 4 | time::{SystemTime, UNIX_EPOCH}, 5 | }; 6 | 7 | pub(super) fn workspace_temp_dir(prefix: &str) -> PathBuf { 8 | let base = std::env::current_dir().expect("cwd"); 9 | let unique = SystemTime::now() 10 | .duration_since(UNIX_EPOCH) 11 | .expect("time") 12 | .as_nanos(); 13 | let dir = base 14 | .join("target") 15 | .join("tmp") 16 | .join(format!("{prefix}-{unique}")); 17 | fs::create_dir_all(&dir).expect("create temp dir"); 18 | dir 19 | } 20 | 21 | pub(super) fn wal_segment_paths(dir: &Path) -> Vec { 22 | if !dir.exists() { 23 | return Vec::new(); 24 | } 25 | let mut files = Vec::new(); 26 | if let Ok(entries) = 
fs::read_dir(dir) { 27 | for entry in entries.flatten() { 28 | let path = entry.path(); 29 | if path.extension().and_then(|ext| ext.to_str()) == Some("tonwal") { 30 | files.push(path); 31 | } 32 | } 33 | } 34 | files.sort(); 35 | files 36 | } 37 | -------------------------------------------------------------------------------- /src/id.rs: -------------------------------------------------------------------------------- 1 | //! Stable identifiers for files and objects across storage backends. 2 | 3 | use std::sync::Mutex; 4 | 5 | use ulid::{Generator, Ulid}; 6 | 7 | /// Identifier used for files and other persisted artifacts. 8 | pub(crate) type FileId = Ulid; 9 | 10 | /// Thread-safe ULID generator scoped to a single database instance. 11 | pub(crate) struct FileIdGenerator { 12 | inner: Mutex, 13 | } 14 | 15 | impl FileIdGenerator { 16 | /// Create a new generator seeded with the current time. 17 | pub(crate) fn new() -> Self { 18 | Self { 19 | inner: Mutex::new(Generator::new()), 20 | } 21 | } 22 | 23 | /// Produce the next [`FileId`] in a monotonic, time-ordered sequence. 24 | pub(crate) fn generate(&self) -> FileId { 25 | let mut guard = self 26 | .inner 27 | .lock() 28 | .expect("file id generator mutex should not be poisoned"); 29 | guard 30 | .generate() 31 | .expect("file id generator should advance without error") 32 | } 33 | } 34 | 35 | impl Default for FileIdGenerator { 36 | fn default() -> Self { 37 | Self::new() 38 | } 39 | } 40 | -------------------------------------------------------------------------------- /src/manifest/version.rs: -------------------------------------------------------------------------------- 1 | use super::domain::{SstEntry, WalSegmentRef}; 2 | use crate::ondisk::sstable::SsTableId; 3 | 4 | /// Manifest edits applied sequentially to produce a new version. 5 | 6 | #[derive(Debug, Clone)] 7 | pub(crate) enum VersionEdit { 8 | /// Add SST entries to the specified level. 9 | AddSsts { 10 | /// Compaction level receiving the new SSTs. 11 | level: u32, 12 | /// SST descriptors to attach to the level. 13 | entries: Vec, 14 | }, 15 | /// Remove SST entries (by id) from the specified level. 16 | RemoveSsts { 17 | /// Level whose SST set should be pruned. 18 | level: u32, 19 | /// Identifiers of SSTs that must be removed. 20 | sst_ids: Vec, 21 | }, 22 | /// Replace the WAL segments referenced by the version with the provided set. 23 | SetWalSegments { 24 | /// Complete set of WAL fragments backing the version. 25 | segments: Vec, 26 | }, 27 | /// Update the tombstone watermark. 28 | SetTombstoneWatermark { 29 | /// Upper MVCC bound for tombstones visible in the version. 
30 | watermark: u64, 31 | }, 32 | } 33 | -------------------------------------------------------------------------------- /examples/cloudflare-worker/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "tonbo-cloudflare-worker" 3 | version = "0.1.0" 4 | edition = "2024" 5 | publish = false 6 | 7 | # Standalone workspace (not part of parent tonbo workspace) 8 | [workspace] 9 | 10 | [lib] 11 | crate-type = ["cdylib"] 12 | 13 | [dependencies] 14 | # Tonbo with web (WASM) features 15 | tonbo = { path = "../..", default-features = false, features = ["web"] } 16 | 17 | # Fusio for WebExecutor and AmazonS3 types 18 | fusio = { version = "0.5.0", default-features = false, features = [ 19 | "aws", 20 | "executor-web", 21 | ] } 22 | 23 | # Arrow for RecordBatch creation 24 | arrow-array = "56.2.0" 25 | arrow-schema = "56.2.0" 26 | 27 | # Cloudflare Workers runtime 28 | worker = "0.7" 29 | console_error_panic_hook = "0.1" 30 | 31 | # getrandom needs wasm_js feature for WASM 32 | getrandom = { version = "0.3", features = ["wasm_js"] } 33 | 34 | # For async streams 35 | futures = "0.3" 36 | 37 | # Size optimizations - Cloudflare Workers have a 10MB limit 38 | [profile.release] 39 | opt-level = "z" # Optimize for size 40 | lto = "fat" # Full link-time optimization 41 | strip = "symbols" # Strip debug symbols 42 | codegen-units = 1 # Better optimization (slower compile) 43 | panic = "abort" # Smaller panic handling 44 | -------------------------------------------------------------------------------- /predicate/src/core/operand.rs: -------------------------------------------------------------------------------- 1 | use std::sync::Arc; 2 | 3 | use super::ScalarValue; 4 | 5 | /// Reference identifying a column used inside predicates. 6 | /// 7 | /// This is a logical column reference using only the column name. 8 | /// Physical binding (resolving to schema indices) happens during 9 | /// query planning, not at predicate construction time. 10 | #[derive(Clone, Debug, PartialEq, Eq, Hash)] 11 | pub struct ColumnRef { 12 | /// Canonical column name. 13 | pub name: Arc, 14 | } 15 | 16 | impl ColumnRef { 17 | /// Creates a new column reference from a name. 18 | #[must_use] 19 | pub fn new(name: N) -> Self 20 | where 21 | N: Into>, 22 | { 23 | Self { name: name.into() } 24 | } 25 | } 26 | 27 | /// Operand used by predicate comparisons and function calls. 28 | #[derive(Clone, Debug, PartialEq)] 29 | pub enum Operand { 30 | /// Reference to a column. 31 | Column(ColumnRef), 32 | /// Literal value. 
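    ///
    /// Operands are usually built through the `From` conversions below; a hedged sketch (the
    /// `i64` literal mirrors the `ScalarValue::from(80_i64)` form used in the repository
    /// examples):
    ///
    /// ```rust,ignore
    /// let column: Operand = ColumnRef::new("score").into();
    /// let literal: Operand = ScalarValue::from(80_i64).into();
    /// ```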
33 | Literal(ScalarValue), 34 | } 35 | 36 | impl From for Operand { 37 | fn from(value: ColumnRef) -> Self { 38 | Self::Column(value) 39 | } 40 | } 41 | 42 | impl From for Operand { 43 | fn from(value: ScalarValue) -> Self { 44 | Self::Literal(value) 45 | } 46 | } 47 | -------------------------------------------------------------------------------- /docs/images/0002-predicate-pushdown.puml: -------------------------------------------------------------------------------- 1 | @startuml 2 | top to bottom direction 3 | 4 | title "Predicate Pushdown" 5 | 6 | rectangle "Step 1: Plan" { 7 | collections "SST" as SSTs 8 | collections "Transaction" as Transactions 9 | collections "Memtable" as Memtables 10 | 11 | [Transactions] --> [PrunedRowSet\n(Transaction)] : Predicate 12 | [Memtables] --> [PrunedRowSet\n(Memtable)] : Predicate 13 | [SSTs] --> [PrunedRowSet\n(SST)] : Predicate 14 | } 15 | 16 | 17 | rectangle "Step 2: Execute" { 18 | [PrunedRowSet\n(Transaction)] --> [FilteredRowSet\n(Transaction)] : Predicate 19 | [PrunedRowSet\n(Memtable)] --> [FilteredRowSet\n(Memtable)] : Predicate 20 | [PrunedRowSet\n(SST)] --> (Pushdown Filter) : Predicate 21 | (Pushdown Filter) --> [FilteredRowSet\n(SST)] : pushdown predicate 22 | } 23 | 24 | rectangle "Step 3: Merge" { 25 | rectangle "Merged Stream" as MS { 26 | rectangle "DedupedRowSetMaps" as Inner 27 | } 28 | 29 | [FilteredRowSet\n(Transaction)] --> Inner : deduplicate & MVCC 30 | [FilteredRowSet\n(Memtable)] --> Inner 31 | [FilteredRowSet\n(SST)] --> Inner 32 | } 33 | 34 | rectangle "Step 4: Materialize" { 35 | MS--> [Materialized Row Stream] : materialize row from sources&\napply residual predicates 36 | } 37 | 38 | rectangle "Step 5: Package/Aggregation" { 39 | [Materialized Row Stream] --> [RecordBatch] : aggregate & package 40 | } 41 | 42 | @enduml 43 | -------------------------------------------------------------------------------- /src/extractor/mod.rs: -------------------------------------------------------------------------------- 1 | //! Arrow `RecordBatch` key extraction into zero-copy key views. 2 | //! 3 | //! These APIs are Tonbo-specific shims that turn Arrow batches into the 4 | //! zero-copy key views defined under [`crate::key`]. The compile-time typed 5 | //! record support that previously lived here has been removed; keeping the module 6 | //! focused on extraction keeps the ingest path clear. 7 | 8 | mod errors; 9 | mod extractors; 10 | 11 | use arrow_array::RecordBatch; 12 | use arrow_schema::SchemaRef; 13 | pub use errors::KeyExtractError; 14 | pub(crate) use extractors::{ 15 | map_view_err, projection_for_columns, projection_for_field, row_from_batch, 16 | }; 17 | 18 | use crate::key::KeyRow; 19 | 20 | /// Schema-validated projection that can materialise logical keys from record batches. 21 | pub trait KeyProjection: Send + Sync { 22 | /// Ensure the projection is compatible with `schema`. 23 | fn validate_schema(&self, schema: &SchemaRef) -> Result<(), KeyExtractError>; 24 | 25 | /// Schema describing just the key columns (no MVCC sidecars). 26 | fn key_schema(&self) -> SchemaRef; 27 | 28 | /// Column indices (in schema order) that form the key projection. 29 | fn key_indices(&self) -> &[usize]; 30 | 31 | /// Project borrowed key views for the requested `rows` (in order) from `batch`. 
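    ///
    /// A call-site sketch (hedged; it assumes `projection` implements [`KeyProjection`] and that
    /// `batch` contains the projection's key columns):
    ///
    /// ```rust,ignore
    /// projection.validate_schema(&batch.schema())?;
    /// let keys = projection.project_view(&batch, &[0, 2, 5])?;
    /// assert_eq!(keys.len(), 3); // one borrowed key view per requested row, in order
    /// ```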
32 | fn project_view( 33 | &self, 34 | batch: &RecordBatch, 35 | rows: &[usize], 36 | ) -> Result, KeyExtractError>; 37 | } 38 | -------------------------------------------------------------------------------- /src/manifest/mod.rs: -------------------------------------------------------------------------------- 1 | //! Manifest coordination atop `fusio-manifest` for versioned metadata. 2 | //! 3 | //! Internals are generic, but we expose concrete helpers for the supported 4 | //! filesystem backends instead of a new abstraction layer. 5 | 6 | use fusio::{ 7 | dynamic::{MaybeSend, MaybeSync}, 8 | executor::{Executor, Timer}, 9 | fs::{Fs, FsCas}, 10 | }; 11 | 12 | pub(crate) mod bootstrap; 13 | pub(crate) mod codec; 14 | mod domain; 15 | mod driver; 16 | mod version; 17 | 18 | /// Filesystem bound required by manifest store implementations. 19 | pub trait ManifestFs: Fs + FsCas + Clone + MaybeSend + MaybeSync + 'static 20 | where 21 | E: Executor + Timer + Clone + 'static, 22 | Self: fusio_manifest::ObjectHead, 23 | ::File: fusio::durability::FileCommit, 24 | { 25 | } 26 | 27 | impl ManifestFs for FS 28 | where 29 | FS: Fs + FsCas + Clone + MaybeSend + MaybeSync + 'static, 30 | E: Executor + Timer + Clone + 'static, 31 | FS: fusio_manifest::ObjectHead, 32 | ::File: fusio::durability::FileCommit, 33 | { 34 | } 35 | 36 | #[cfg(all(test, feature = "tokio"))] 37 | pub(crate) use bootstrap::init_fs_manifest_in_memory; 38 | #[cfg(test)] 39 | pub(crate) use bootstrap::init_in_memory_manifest; 40 | pub(crate) use bootstrap::{TableSnapshot, TonboManifest}; 41 | #[cfg(all(test, feature = "tokio"))] 42 | pub(crate) use domain::TableHead; 43 | pub use domain::VersionState; 44 | pub(crate) use domain::{ 45 | GcPlanState, GcSstRef, SstEntry, TableDefinition, TableId, TableMeta, WalSegmentRef, 46 | }; 47 | pub use driver::ManifestError; 48 | pub(crate) use driver::ManifestResult; 49 | pub(crate) use version::VersionEdit; 50 | -------------------------------------------------------------------------------- /tests/web_executor.rs: -------------------------------------------------------------------------------- 1 | #![cfg(all(target_arch = "wasm32", feature = "web"))] 2 | 3 | use std::time::Duration; 4 | 5 | use fusio::{ 6 | Read, Write, 7 | executor::{Executor, JoinHandle, Timer, web::WebExecutor}, 8 | fs::{Fs, OpenOptions}, 9 | impls::mem::fs::InMemoryFs, 10 | }; 11 | use wasm_bindgen_test::{wasm_bindgen_test, wasm_bindgen_test_configure}; 12 | 13 | wasm_bindgen_test_configure!(run_in_browser); 14 | 15 | #[wasm_bindgen_test] 16 | async fn spawn_and_sleep_progresses() { 17 | let exec = WebExecutor::new(); 18 | let value = WebExecutor::rw_lock(0); 19 | 20 | let handle = exec.spawn({ 21 | let value = value.clone(); 22 | async move { 23 | let mut guard = value.write().await; 24 | *guard = 7; 25 | } 26 | }); 27 | 28 | // Join is unjoinable on web; it should error but the task should still run. 
29 | assert!(handle.join().await.is_err()); 30 | 31 | exec.sleep(Duration::from_millis(5)).await; 32 | assert_eq!(*value.read().await, 7); 33 | } 34 | 35 | #[wasm_bindgen_test] 36 | async fn in_memory_fs_roundtrip() { 37 | let fs = InMemoryFs::new(); 38 | let mut file = fs 39 | .open_options( 40 | &"web/roundtrip.txt".into(), 41 | OpenOptions::default() 42 | .write(true) 43 | .create(true) 44 | .truncate(true), 45 | ) 46 | .await 47 | .expect("open file"); 48 | 49 | let (res, _) = file.write_all(&b"tonbo-web"[..]).await; 50 | res.expect("write"); 51 | let (res, buf) = file.read_to_end_at(Vec::new(), 0).await; 52 | res.expect("read"); 53 | assert_eq!(buf, b"tonbo-web"); 54 | file.close().await.expect("close"); 55 | } 56 | -------------------------------------------------------------------------------- /examples/01_basic.rs: -------------------------------------------------------------------------------- 1 | //! Basic Tonbo example: define schema, insert, query 2 | //! 3 | //! Run: cargo run --example 01_basic 4 | 5 | use tonbo::prelude::*; 6 | 7 | #[derive(Record)] 8 | struct User { 9 | #[metadata(k = "tonbo.key", v = "true")] 10 | id: String, 11 | name: String, 12 | score: Option, 13 | } 14 | 15 | #[tokio::main] 16 | async fn main() -> Result<(), Box> { 17 | // 1. Open database (key detected from schema metadata) 18 | let db = DbBuilder::from_schema(User::schema())? 19 | .on_disk("/tmp/tonbo_example")? 20 | .open() 21 | .await?; 22 | 23 | // 2. Insert data 24 | let users = vec![ 25 | User { 26 | id: "u1".into(), 27 | name: "Alice".into(), 28 | score: Some(100), 29 | }, 30 | User { 31 | id: "u2".into(), 32 | name: "Bob".into(), 33 | score: Some(85), 34 | }, 35 | User { 36 | id: "u3".into(), 37 | name: "Carol".into(), 38 | score: None, 39 | }, 40 | ]; 41 | 42 | let mut builders = User::new_builders(users.len()); 43 | builders.append_rows(users); 44 | db.ingest(builders.finish().into_record_batch()).await?; 45 | 46 | // 3. Query: score > 80 47 | let filter = Predicate::gt(ColumnRef::new("score"), ScalarValue::from(80_i64)); 48 | let batches = db.scan().filter(filter).collect().await?; 49 | 50 | println!("Users with score > 80:"); 51 | for batch in &batches { 52 | for user in batch.iter_views::()?.try_flatten()? 
{ 53 | println!(" {} - {} ({:?})", user.id, user.name, user.score); 54 | } 55 | } 56 | 57 | Ok(()) 58 | } 59 | -------------------------------------------------------------------------------- /src/tests_internal/compaction_loop_spawn.rs: -------------------------------------------------------------------------------- 1 | #![cfg(feature = "tokio")] 2 | 3 | use std::{sync::Arc, time::Duration}; 4 | 5 | use arrow_schema::{DataType, Field, Schema}; 6 | use fusio::{executor::tokio::TokioExecutor, mem::fs::InMemoryFs, path::Path}; 7 | use tokio::{task::LocalSet, time::sleep}; 8 | 9 | use crate::{ 10 | compaction::planner::CompactionStrategy, db::DB, ondisk::sstable::SsTableConfig, 11 | schema::SchemaBuilder, 12 | }; 13 | 14 | #[tokio::test(flavor = "current_thread")] 15 | async fn compaction_loop_is_spawned_when_configured() { 16 | let local = LocalSet::new(); 17 | local 18 | .run_until(async { 19 | let schema = Arc::new(Schema::new(vec![Field::new("id", DataType::Utf8, false)])); 20 | let cfg = SchemaBuilder::from_schema(Arc::clone(&schema)) 21 | .primary_key("id") 22 | .build() 23 | .expect("schema"); 24 | 25 | let fs = Arc::new(InMemoryFs::default()); 26 | let root = Path::parse("compaction").expect("path"); 27 | let sst_cfg = Arc::new(SsTableConfig::new(Arc::clone(&schema), fs, root)); 28 | 29 | let db = DB::::builder(cfg) 30 | .in_memory("compaction-loop") 31 | .expect("in_memory config") 32 | .with_compaction_strategy(CompactionStrategy::default()) 33 | .with_compaction_loop(Duration::from_millis(5), Arc::clone(&sst_cfg), 1) 34 | .build() 35 | .await 36 | .expect("build"); 37 | 38 | assert!( 39 | db.has_compaction_worker(), 40 | "compaction loop should be spawned when requested" 41 | ); 42 | 43 | // Let the loop tick at least once before dropping. 44 | sleep(Duration::from_millis(10)).await; 45 | }) 46 | .await; 47 | } 48 | -------------------------------------------------------------------------------- /.githooks/pre-commit: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -euo pipefail 4 | 5 | ROOT_DIR="$(git rev-parse --show-toplevel)" 6 | cd "${ROOT_DIR}" 7 | 8 | HOOK_NAME="tonbo pre-commit" 9 | S3_ENV_SCRIPT="${ROOT_DIR}/tests/s3_localstack_env.sh" 10 | 11 | log() { 12 | echo "[${HOOK_NAME}] $1" 13 | } 14 | 15 | if ! command -v cargo >/dev/null 2>&1; then 16 | echo "[${HOOK_NAME}] skipping: cargo not found" >&2 17 | exit 0 18 | fi 19 | 20 | if ! command -v rustup >/dev/null 2>&1; then 21 | echo "[${HOOK_NAME}] rustup not detected; install rustup and the nightly toolchain" >&2 22 | exit 1 23 | fi 24 | 25 | if ! rustup toolchain list | grep -q "nightly"; then 26 | echo "[${HOOK_NAME}] nightly toolchain required (rustup toolchain install nightly)" >&2 27 | exit 1 28 | fi 29 | 30 | FMT_CHECK_CMD="cargo +nightly fmt --all -- --check" 31 | CLIPPY_CMD="cargo clippy --workspace -- -D warnings" 32 | BUILD_CMD="cargo build --verbose" 33 | TEST_CMD="cargo test --verbose" 34 | 35 | run_step() { 36 | local cmd="$1" 37 | log "$cmd" 38 | if ! eval "$cmd"; then 39 | echo "[${HOOK_NAME}] command failed: $cmd" >&2 40 | if [[ "$cmd" == "$FMT_CHECK_CMD" ]]; then 41 | echo "[${HOOK_NAME}] run 'cargo +nightly fmt --all' to apply formatting and re-stage changes before committing." 
>&2 42 | fi 43 | exit 1 44 | fi 45 | } 46 | 47 | run_step "$FMT_CHECK_CMD" 48 | run_step "$CLIPPY_CMD" 49 | run_step "$BUILD_CMD" 50 | run_step "$TEST_CMD" 51 | 52 | log "bootstrap LocalStack (if available) for S3-backed public_api_e2e" 53 | if source "${S3_ENV_SCRIPT}"; then 54 | run_step "cargo test public_api_e2e:: -- --nocapture" 55 | if [ "${TONBO_LOCALSTACK_STARTED_BY_SCRIPT:-0}" = "1" ] && command -v docker >/dev/null 2>&1; then 56 | docker rm -f "${TONBO_LOCALSTACK_CONTAINER}" >/dev/null 2>&1 || true 57 | fi 58 | else 59 | log "LocalStack not available; running public_api_e2e (local only)" 60 | run_step "cargo test public_api_e2e:: -- --nocapture" 61 | fi 62 | 63 | exit 0 64 | -------------------------------------------------------------------------------- /tests/README.md: -------------------------------------------------------------------------------- 1 | # Integration Test Harnesses 2 | 3 | ## `s3_smoke` 4 | 5 | - **What**: Exercises Tonbo's S3 object-store path end-to-end using LocalStack. 6 | - **How to run**: 7 | 1. Ensure Docker is available. 8 | 2. From the repo root, run `./tests/s3_smoke.sh`. 9 | - The script starts LocalStack, provisions the bucket, seeds the required 10 | `TONBO_S3_*` environment variables, and runs 11 | `cargo test --features s3-smoke --test s3_smoke`. 12 | 13 | | Variable | Description | Default when using script | 14 | |-------------------------|------------------------------------------|---------------------------| 15 | | `TONBO_S3_ENDPOINT` | HTTP endpoint for the S3-compatible API | `http://localhost:4566` | 16 | | `TONBO_S3_BUCKET` | Bucket name used for the smoke run | `tonbo-smoke` | 17 | | `TONBO_S3_REGION` | Region reported to fusio | `us-east-1` | 18 | | `TONBO_S3_ACCESS_KEY` | Access key ID | `test` | 19 | | `TONBO_S3_SECRET_KEY` | Secret access key | `test` | 20 | | `TONBO_S3_SESSION_TOKEN`| Optional session token (only if needed) | unset | 21 | 22 | The script derives the defaults from its own `AWS_*` variables. Override any of 23 | them before running the script to point at an existing deployment. 24 | 25 | The smoke test writes a single batch, so expect a fresh prefix such as 26 | `smoke-/` containing `wal/wal-000…0.tonwal` and `wal/state.json` in 27 | your bucket when it finishes. 28 | 3. On success it prints "Smoke test complete." and tails recent LocalStack 29 | logs; on failure it exits non-zero and shows a longer log tail for 30 | debugging. 31 | - **Alternative**: If you already have an S3-compatible endpoint up, export the 32 | `TONBO_S3_*` variables yourself and run 33 | `cargo test --features s3-smoke --test s3_smoke`. 34 | 35 | The `s3_smoke` test is gated behind the `s3-smoke` feature, so the regular test 36 | suite will skip it unless explicitly enabled. 37 | -------------------------------------------------------------------------------- /examples/02b_snapshot.rs: -------------------------------------------------------------------------------- 1 | //! Snapshots: read-only consistent view of the database 2 | //! 3 | //! Run: cargo run --example 02b_snapshot 4 | 5 | use tonbo::prelude::*; 6 | 7 | #[derive(Record)] 8 | struct User { 9 | #[metadata(k = "tonbo.key", v = "true")] 10 | id: String, 11 | name: String, 12 | score: Option, 13 | } 14 | 15 | #[tokio::main] 16 | async fn main() -> Result<(), Box> { 17 | let db = DbBuilder::from_schema(User::schema())? 18 | .on_disk("/tmp/tonbo_snapshot_example")? 
19 | .open() 20 | .await?; 21 | 22 | // Insert initial data 23 | let users = vec![User { 24 | id: "u1".into(), 25 | name: "Alice".into(), 26 | score: Some(100), 27 | }]; 28 | let mut builders = User::new_builders(users.len()); 29 | builders.append_rows(users); 30 | db.ingest(builders.finish().into_record_batch()).await?; 31 | 32 | // Take a snapshot (read-only, consistent view) 33 | let snapshot = db.begin_snapshot().await?; 34 | 35 | // Insert more data after snapshot 36 | let more = vec![User { 37 | id: "u2".into(), 38 | name: "Bob".into(), 39 | score: Some(85), 40 | }]; 41 | let mut builders = User::new_builders(more.len()); 42 | builders.append_rows(more); 43 | db.ingest(builders.finish().into_record_batch()).await?; 44 | 45 | // Snapshot sees only data at snapshot time 46 | let filter = Predicate::is_not_null(ColumnRef::new("id")); 47 | let snapshot_data = snapshot.scan(&db).filter(filter.clone()).collect().await?; 48 | 49 | println!("Snapshot (frozen in time):"); 50 | for batch in &snapshot_data { 51 | for user in batch.iter_views::()?.try_flatten()? { 52 | println!(" {} - {}", user.id, user.name); 53 | } 54 | } 55 | 56 | // Current DB sees all data 57 | let current_data = db.scan().filter(filter).collect().await?; 58 | 59 | println!("\nCurrent DB:"); 60 | for batch in ¤t_data { 61 | for user in batch.iter_views::()?.try_flatten()? { 62 | println!(" {} - {}", user.id, user.name); 63 | } 64 | } 65 | 66 | Ok(()) 67 | } 68 | -------------------------------------------------------------------------------- /src/wal/metrics.rs: -------------------------------------------------------------------------------- 1 | //! Metrics and observability glue for the WAL. 2 | 3 | /// Collection of WAL metrics exposed to monitoring systems. 4 | #[derive(Default, Debug)] 5 | pub struct WalMetrics { 6 | /// Current depth of the writer queue. 7 | pub queue_depth: usize, 8 | /// Bytes written since process start. 9 | pub bytes_written: u64, 10 | /// Number of durability operations performed. 11 | pub sync_operations: u64, 12 | /// Number of times the manifest advanced the WAL floor. 13 | pub wal_floor_advancements: u64, 14 | /// Total WAL segments physically pruned. 15 | pub wal_segments_pruned: u64, 16 | /// Total WAL segments flagged for deletion during dry-runs. 17 | pub wal_prune_dry_runs: u64, 18 | /// Number of failed prune attempts. 19 | pub wal_prune_failures: u64, 20 | } 21 | 22 | impl WalMetrics { 23 | /// Record a queue depth update. 24 | pub fn record_queue_depth(&mut self, depth: usize) { 25 | self.queue_depth = depth; 26 | } 27 | 28 | /// Record additional written bytes. 29 | pub fn record_bytes_written(&mut self, bytes: u64) { 30 | self.bytes_written = self.bytes_written.saturating_add(bytes); 31 | } 32 | 33 | /// Record a durability operation. 34 | pub fn record_sync(&mut self) { 35 | self.sync_operations = self.sync_operations.saturating_add(1); 36 | } 37 | 38 | /// Record an advancement of the WAL retention floor. 39 | pub fn record_wal_floor_advance(&mut self) { 40 | self.wal_floor_advancements = self.wal_floor_advancements.saturating_add(1); 41 | } 42 | 43 | /// Record physical WAL segment deletions. 44 | pub fn record_wal_pruned(&mut self, segments: u64) { 45 | self.wal_segments_pruned = self.wal_segments_pruned.saturating_add(segments); 46 | } 47 | 48 | /// Record the number of segments that would be deleted in dry-run mode. 
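    ///
    /// A small usage sketch (all counters start at zero via `Default`):
    ///
    /// ```rust,ignore
    /// let mut metrics = WalMetrics::default();
    /// metrics.record_wal_prune_dry_run(3);
    /// assert_eq!(metrics.wal_prune_dry_runs, 3);
    /// ```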
49 | pub fn record_wal_prune_dry_run(&mut self, segments: u64) { 50 | self.wal_prune_dry_runs = self.wal_prune_dry_runs.saturating_add(segments); 51 | } 52 | 53 | /// Record a prune failure. 54 | pub fn record_wal_prune_failure(&mut self) { 55 | self.wal_prune_failures = self.wal_prune_failures.saturating_add(1); 56 | } 57 | } 58 | -------------------------------------------------------------------------------- /src/inmem/mutable/metrics.rs: -------------------------------------------------------------------------------- 1 | use std::sync::atomic::{AtomicU64, AtomicUsize, Ordering}; 2 | 3 | #[derive(Debug)] 4 | pub(crate) struct MutableMemTableMetrics { 5 | entries: AtomicUsize, 6 | inserts: AtomicU64, 7 | replaces: AtomicU64, 8 | approx_key_bytes: AtomicUsize, 9 | entry_overhead: AtomicUsize, 10 | } 11 | 12 | #[derive(Debug, Default, Clone, Copy)] 13 | pub(crate) struct MutableMemTableMetricsSnapshot { 14 | pub entries: usize, 15 | pub inserts: u64, 16 | pub replaces: u64, 17 | pub approx_key_bytes: usize, 18 | pub entry_overhead: usize, 19 | } 20 | 21 | impl MutableMemTableMetrics { 22 | pub(crate) fn new(entry_overhead: usize) -> Self { 23 | Self { 24 | entries: AtomicUsize::new(0), 25 | inserts: AtomicU64::new(0), 26 | replaces: AtomicU64::new(0), 27 | approx_key_bytes: AtomicUsize::new(0), 28 | entry_overhead: AtomicUsize::new(entry_overhead), 29 | } 30 | } 31 | 32 | pub(crate) fn record_write(&self, has_existing: bool, key_bytes: usize) { 33 | self.inserts.fetch_add(1, Ordering::Relaxed); 34 | if has_existing { 35 | self.replaces.fetch_add(1, Ordering::Relaxed); 36 | } else { 37 | self.entries.fetch_add(1, Ordering::Relaxed); 38 | self.approx_key_bytes 39 | .fetch_add(key_bytes, Ordering::Relaxed); 40 | } 41 | } 42 | 43 | pub(crate) fn snapshot(&self) -> MutableMemTableMetricsSnapshot { 44 | MutableMemTableMetricsSnapshot { 45 | entries: self.entries.load(Ordering::Relaxed), 46 | inserts: self.inserts.load(Ordering::Relaxed), 47 | replaces: self.replaces.load(Ordering::Relaxed), 48 | approx_key_bytes: self.approx_key_bytes.load(Ordering::Relaxed), 49 | entry_overhead: self.entry_overhead.load(Ordering::Relaxed), 50 | } 51 | } 52 | 53 | pub(crate) fn reset_counters(&self) { 54 | self.entries.store(0, Ordering::Relaxed); 55 | self.inserts.store(0, Ordering::Relaxed); 56 | self.replaces.store(0, Ordering::Relaxed); 57 | self.approx_key_bytes.store(0, Ordering::Relaxed); 58 | } 59 | } 60 | 61 | impl Default for MutableMemTableMetrics { 62 | fn default() -> Self { 63 | Self::new(0) 64 | } 65 | } 66 | -------------------------------------------------------------------------------- /src/test.rs: -------------------------------------------------------------------------------- 1 | //! Test utilities and helpers for tonbo. 2 | //! 3 | //! This module contains internal test utilities available under `#[cfg(test)]`. 4 | 5 | use arrow_array::RecordBatch; 6 | use arrow_schema::SchemaRef; 7 | #[cfg(feature = "tokio")] 8 | use arrow_schema::{Field, Schema}; 9 | use typed_arrow_dyn::{DynBuilders, DynCell, DynError, DynRow}; 10 | 11 | #[cfg(feature = "tokio")] 12 | use crate::{mode::DynModeConfig, schema::SchemaBuilder}; 13 | 14 | /// Trait for types that can be converted into a `DynRow`. 15 | pub(crate) trait IntoDynRow { 16 | /// Convert into a `DynRow`. 
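    ///
    /// A conversion sketch (hedged; `cells_for_row` is a hypothetical helper standing in for
    /// whatever produces one `Option<DynCell>` per schema column):
    ///
    /// ```rust,ignore
    /// let cells: Vec<Option<DynCell>> = cells_for_row(); // hypothetical helper
    /// let row: DynRow = cells.into_dyn_row(); // Vec<Option<DynCell>> -> DynRow
    /// let row: DynRow = row.into_dyn_row();   // DynRow passes through unchanged
    /// ```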
17 | fn into_dyn_row(self) -> DynRow; 18 | } 19 | 20 | impl IntoDynRow for DynRow { 21 | fn into_dyn_row(self) -> DynRow { 22 | self 23 | } 24 | } 25 | 26 | impl IntoDynRow for Vec> { 27 | fn into_dyn_row(self) -> DynRow { 28 | DynRow(self) 29 | } 30 | } 31 | 32 | /// Build a `RecordBatch` from dynamic rows, validating nullability. 33 | /// 34 | /// Accepts either `Vec` or `Vec>>` for convenience. 35 | /// 36 | /// # Errors 37 | /// Returns [`DynError`] if any row violates the schema or array construction fails. 38 | pub(crate) fn build_batch( 39 | schema: SchemaRef, 40 | rows: Vec, 41 | ) -> Result { 42 | let mut builders = DynBuilders::new(schema.clone(), rows.len()); 43 | for row in rows { 44 | builders.append_option_row(Some(row.into_dyn_row()))?; 45 | } 46 | builders.try_finish_into_batch() 47 | } 48 | 49 | /// Convenience helper that builds a DynMode configuration with embedded PK metadata. 50 | #[cfg(feature = "tokio")] 51 | pub(crate) fn config_with_pk(fields: Vec, primary_key: &[&str]) -> DynModeConfig { 52 | assert!( 53 | !primary_key.is_empty(), 54 | "schema builder requires at least one primary-key column" 55 | ); 56 | 57 | let schema = SchemaRef::new(Schema::new(fields)); 58 | let builder = SchemaBuilder::from_schema(schema); 59 | let builder = if primary_key.len() == 1 { 60 | builder.primary_key(primary_key[0].to_string()) 61 | } else { 62 | builder.composite_key(primary_key.iter().copied().collect::>()) 63 | } 64 | .with_metadata(); 65 | 66 | builder 67 | .build() 68 | .expect("schema builder configuration should succeed") 69 | } 70 | -------------------------------------------------------------------------------- /docs/rfcs/0000-rfc-style-guide.md: -------------------------------------------------------------------------------- 1 | # RFC: RFC Style Guide 2 | 3 | - Status: Accepted 4 | - Authors: Tonbo team 5 | - Created: 2025-12-03 6 | - Area: Process 7 | 8 | ## Summary 9 | 10 | Define the structure and style for Tonbo RFCs. RFCs document design intent and semantics, not implementation details. 11 | 12 | ## Motivation 13 | 14 | - Ensure consistent RFC structure across the project 15 | - Make RFCs easy to navigate and review 16 | - Keep design documents focused on *what* and *why*, not *how* 17 | 18 | ## Goals 19 | 20 | - Establish a standard RFC template 21 | - Define required and optional sections 22 | - Set expectations for content style 23 | 24 | ## Non-Goals 25 | 26 | - Prescribing implementation approaches 27 | - Defining code review or approval processes 28 | 29 | ## Design 30 | 31 | ### Header Metadata 32 | 33 | Every RFC starts with: 34 | 35 | ``` 36 | # RFC: 37 | 38 | - Status: Draft | Accepted | Implementing | Superseded 39 | - Authors: <team or individuals> 40 | - Created: <YYYY-MM-DD> 41 | - Updated: <YYYY-MM-DD> (if revised) 42 | - Area: <affected components> 43 | ``` 44 | 45 | ### Required Sections 46 | 47 | | Section | Purpose | 48 | |---------|---------| 49 | | Summary | One paragraph describing the design | 50 | | Motivation | Why this design is needed | 51 | | Goals | What the design achieves | 52 | | Non-Goals | Explicit scope boundaries | 53 | | Design | The core design with subsections as needed | 54 | 55 | ### Optional Sections 56 | 57 | | Section | Purpose | 58 | |---------|---------| 59 | | Alternatives Considered | Other approaches and why they were rejected | 60 | | Comparison with Other Systems | How similar systems (Iceberg, RocksDB, etc.) 
solve the same problem and trade-offs of our approach | 61 | | Future Work | Known limitations and planned extensions | 62 | 63 | ### Style Principles 64 | 65 | - **Precursory**: Write or update the RFC before starting implementation; use it to gather feedback and align direction 66 | - **Skimmable**: Structure with clear headings; readers should locate relevant information quickly 67 | - **Current**: Incorrect documentation is worse than missing documentation; update RFCs when implementation diverges 68 | - **Consistent**: Use consistent terminology across RFCs; align with `docs/overview.md` vocabulary 69 | - **Semantic**: Describe *what* and *why*, not *how*; focus on contracts, not code 70 | - Use tables and diagrams to clarify complex relationships 71 | -------------------------------------------------------------------------------- /src/manifest/codec.rs: -------------------------------------------------------------------------------- 1 | use std::hash::Hash; 2 | 3 | use serde::{Serialize, de::DeserializeOwned}; 4 | 5 | use crate::manifest::{ 6 | ManifestError, 7 | domain::{CatalogKey, CatalogValue, GcPlanKey, GcPlanValue, VersionKey, VersionValue}, 8 | }; 9 | 10 | /// Trait describing the key/value serialization used for a manifest instance. 11 | pub(crate) trait ManifestCodec { 12 | /// Key type stored inside `fusio-manifest`. 13 | type Key: Clone + Ord + Eq + Hash + Serialize + DeserializeOwned + Send + Sync + 'static; 14 | /// Value payload paired with each key. 15 | type Value: Clone + Serialize + DeserializeOwned + Send + Sync + 'static; 16 | 17 | /// Ensure the provided key/value pair is well-formed for the codec. 18 | fn validate_key_value(key: &Self::Key, value: &Self::Value) -> Result<(), ManifestError>; 19 | } 20 | 21 | /// Marker codec binding the catalog key/value types together. 22 | 23 | #[derive(Debug, Clone, Copy, Default)] 24 | pub(crate) struct CatalogCodec; 25 | 26 | impl ManifestCodec for CatalogCodec { 27 | type Key = CatalogKey; 28 | type Value = CatalogValue; 29 | 30 | fn validate_key_value(_key: &Self::Key, _value: &Self::Value) -> Result<(), ManifestError> { 31 | Ok(()) 32 | } 33 | } 34 | 35 | /// Marker codec binding the version key/value types together. 36 | #[derive(Debug, Clone, Copy, Default)] 37 | pub(crate) struct VersionCodec; 38 | 39 | impl ManifestCodec for VersionCodec { 40 | type Key = VersionKey; 41 | type Value = VersionValue; 42 | 43 | fn validate_key_value(key: &Self::Key, value: &Self::Value) -> Result<(), ManifestError> { 44 | match (key, value) { 45 | (VersionKey::TableHead { .. }, VersionValue::TableHead(_)) => Ok(()), 46 | (VersionKey::TableVersion { .. }, VersionValue::TableVersion(_)) => Ok(()), 47 | (VersionKey::WalFloor { .. }, VersionValue::WalFloor(_)) => Ok(()), 48 | _ => Err(ManifestError::Invariant("manifest key/value type mismatch")), 49 | } 50 | } 51 | } 52 | 53 | /// Marker codec binding the GC-plan key/value types together. 54 | 55 | #[derive(Debug, Clone, Copy, Default)] 56 | pub(crate) struct GcPlanCodec; 57 | 58 | impl ManifestCodec for GcPlanCodec { 59 | type Key = GcPlanKey; 60 | type Value = GcPlanValue; 61 | 62 | fn validate_key_value(_key: &Self::Key, _value: &Self::Value) -> Result<(), ManifestError> { 63 | Ok(()) 64 | } 65 | } 66 | -------------------------------------------------------------------------------- /tests/s3_smoke.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # S3 integration smoke harness. 
Run from the repo root via `./tests/s3_smoke.sh`. 3 | set -euo pipefail 4 | 5 | command -v docker >/dev/null 2>&1 || { echo "docker is required" >&2; exit 1; } 6 | command -v cargo >/dev/null 2>&1 || { echo "cargo is required" >&2; exit 1; } 7 | 8 | LOCALSTACK_CONTAINER=${LOCALSTACK_CONTAINER:-tonbo-localstack-smoke} 9 | LOCALSTACK_PORT=${LOCALSTACK_PORT:-4566} 10 | LOCALSTACK_IMAGE=${LOCALSTACK_IMAGE:-localstack/localstack:latest} 11 | AWS_ACCESS_KEY_ID=${AWS_ACCESS_KEY_ID:-test} 12 | AWS_SECRET_ACCESS_KEY=${AWS_SECRET_ACCESS_KEY:-test} 13 | AWS_REGION=${AWS_REGION:-us-east-1} 14 | BUCKET_NAME=${BUCKET_NAME:-tonbo-smoke} 15 | 16 | cleanup() { 17 | if docker ps --format '{{.Names}}' | grep -q "^${LOCALSTACK_CONTAINER}$"; then 18 | docker rm -f "${LOCALSTACK_CONTAINER}" >/dev/null 2>&1 || true 19 | fi 20 | } 21 | trap cleanup EXIT 22 | 23 | cleanup 24 | 25 | echo "Starting LocalStack (${LOCALSTACK_CONTAINER})..." 26 | docker run -d --name "${LOCALSTACK_CONTAINER}" \ 27 | -e SERVICES="s3" \ 28 | -e AWS_ACCESS_KEY_ID="${AWS_ACCESS_KEY_ID}" \ 29 | -e AWS_SECRET_ACCESS_KEY="${AWS_SECRET_ACCESS_KEY}" \ 30 | -e AWS_DEFAULT_REGION="${AWS_REGION}" \ 31 | -p "${LOCALSTACK_PORT}:4566" \ 32 | "${LOCALSTACK_IMAGE}" >/dev/null 33 | 34 | echo -n "Waiting for LocalStack to become ready" 35 | until docker exec "${LOCALSTACK_CONTAINER}" awslocal s3api list-buckets >/dev/null 2>&1; do 36 | sleep 1 37 | printf '.' 38 | done 39 | echo 40 | 41 | echo "Provisioning S3 bucket ${BUCKET_NAME}..." 42 | docker exec "${LOCALSTACK_CONTAINER}" awslocal s3api create-bucket --bucket "${BUCKET_NAME}" >/dev/null 2>&1 || true 43 | 44 | env_vars=( 45 | "TONBO_S3_ENDPOINT=http://localhost:${LOCALSTACK_PORT}" 46 | "TONBO_S3_BUCKET=${BUCKET_NAME}" 47 | "TONBO_S3_REGION=${AWS_REGION}" 48 | "TONBO_S3_ACCESS_KEY=${AWS_ACCESS_KEY_ID}" 49 | "TONBO_S3_SECRET_KEY=${AWS_SECRET_ACCESS_KEY}" 50 | ) 51 | 52 | export "${env_vars[@]}" 53 | echo "Running cargo test --features s3-smoke --test s3_smoke" 54 | if cargo test --features s3-smoke --test s3_smoke; then 55 | echo "Smoke test complete." 56 | echo "-- LocalStack logs (tail after success) --" 57 | docker logs --tail 20 "${LOCALSTACK_CONTAINER}" || true 58 | exit 0 59 | else 60 | status=$? 61 | echo "Smoke test failed (status $status)." 62 | echo "-- LocalStack logs (tail) --" 63 | docker logs --tail 40 "${LOCALSTACK_CONTAINER}" || true 64 | exit $status 65 | fi 66 | -------------------------------------------------------------------------------- /src/query/mod.rs: -------------------------------------------------------------------------------- 1 | #![allow(dead_code)] 2 | //! Predicate and scan-planning helpers for Tonbo’s read path. 3 | //! 4 | //! This module bridges user-facing predicates into the internal scan planner 5 | //! and stream executor. It re-exports the `predicate` crate’s surface and adds 6 | //! conversions for key types used in scan planning. 7 | 8 | pub(crate) mod scan; 9 | pub(crate) mod stream; 10 | 11 | use std::convert::TryFrom; 12 | 13 | pub use tonbo_predicate::{ 14 | ColumnRef, ComparisonOp, Operand, Predicate, PredicateNode, ScalarValue, 15 | }; 16 | 17 | use crate::key::KeyOwned; 18 | 19 | /// Trait describing key types that can be derived from predicate scalar literals. 20 | pub trait KeyPredicateValue: Ord + Clone { 21 | /// Convert a predicate scalar literal into the key type. 
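    ///
    /// Returns `None` when the literal cannot be represented by the key type (for example an
    /// out-of-range integer, a NULL, or a mismatched scalar kind). A sketch against the `i64`
    /// implementation below (hedged; assumes `ScalarValue::from(42_i64)` builds an integer
    /// literal as in the repository examples):
    ///
    /// ```rust,ignore
    /// let literal = ScalarValue::from(42_i64);
    /// assert_eq!(i64::from_scalar(&literal), Some(42));
    /// ```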
22 | fn from_scalar(value: &ScalarValue) -> Option<Self>; 23 | } 24 | 25 | impl KeyPredicateValue for i32 { 26 | fn from_scalar(value: &ScalarValue) -> Option<Self> { 27 | let view = value.as_ref(); 28 | if let Some(v) = view.as_int_i128() { 29 | return i32::try_from(v).ok(); 30 | } 31 | if let Some(v) = view.as_uint_u128() { 32 | return i32::try_from(v).ok(); 33 | } 34 | None 35 | } 36 | } 37 | 38 | impl KeyPredicateValue for i64 { 39 | fn from_scalar(value: &ScalarValue) -> Option<Self> { 40 | let view = value.as_ref(); 41 | if let Some(v) = view.as_int_i128() { 42 | return i64::try_from(v).ok(); 43 | } 44 | if let Some(v) = view.as_uint_u128() { 45 | return i64::try_from(v).ok(); 46 | } 47 | None 48 | } 49 | } 50 | 51 | impl KeyPredicateValue for KeyOwned { 52 | fn from_scalar(value: &ScalarValue) -> Option<Self> { 53 | let view = value.as_ref(); 54 | if view.is_null() { 55 | return None; 56 | } 57 | if let Some(v) = view.as_bool() { 58 | return Some(v.into()); 59 | } 60 | if let Some(v) = view.as_int_i128() 61 | && let Ok(val) = i64::try_from(v) 62 | { 63 | return Some(KeyOwned::from(val)); 64 | } 65 | if let Some(v) = view.as_uint_u128() 66 | && let Ok(val) = u64::try_from(v) 67 | { 68 | return Some(KeyOwned::from(val)); 69 | } 70 | if let Some(v) = view.as_f64() { 71 | return Some(KeyOwned::from(v)); 72 | } 73 | if let Some(v) = view.as_utf8() { 74 | return Some(v.into()); 75 | } 76 | if let Some(v) = view.as_binary() { 77 | return Some(v.to_vec().into()); 78 | } 79 | None 80 | } 81 | } 82 | -------------------------------------------------------------------------------- /examples/02_transaction.rs: -------------------------------------------------------------------------------- 1 | //! Transactions: upsert, delete, read-your-writes, commit 2 | //! 3 | //! Run: cargo run --example 02_transaction 4 | 5 | use tonbo::prelude::*; 6 | 7 | #[derive(Record)] 8 | struct User { 9 | #[metadata(k = "tonbo.key", v = "true")] 10 | id: String, 11 | name: String, 12 | score: Option<i64>, 13 | } 14 | 15 | #[tokio::main] 16 | async fn main() -> Result<(), Box<dyn std::error::Error>> { 17 | let db = DbBuilder::from_schema(User::schema())? 18 | .on_disk("/tmp/tonbo_tx_example")? 19 | .open() 20 | .await?; 21 | 22 | // Insert initial data 23 | let users = vec![ 24 | User { 25 | id: "u1".into(), 26 | name: "Alice".into(), 27 | score: Some(100), 28 | }, 29 | User { 30 | id: "u2".into(), 31 | name: "Bob".into(), 32 | score: Some(85), 33 | }, 34 | ]; 35 | let mut builders = User::new_builders(users.len()); 36 | builders.append_rows(users); 37 | db.ingest(builders.finish().into_record_batch()).await?; 38 | 39 | // Begin transaction 40 | let mut tx = db.begin_transaction().await?; 41 | 42 | // Upsert: update Alice's score, add Carol 43 | let updates = vec![ 44 | User { 45 | id: "u1".into(), 46 | name: "Alice".into(), 47 | score: Some(150), 48 | }, 49 | User { 50 | id: "u3".into(), 51 | name: "Carol".into(), 52 | score: Some(90), 53 | }, 54 | ]; 55 | let mut builders = User::new_builders(updates.len()); 56 | builders.append_rows(updates); 57 | tx.upsert_batch(&builders.finish().into_record_batch())?; 58 | 59 | // Delete Bob 60 | tx.delete("u2")?; 61 | 62 | // Read-your-writes: see uncommitted changes within the transaction 63 | let filter = Predicate::is_not_null(ColumnRef::new("id")); 64 | let preview = tx.scan().filter(filter).collect().await?; 65 | 66 | println!("Before commit (read-your-writes):"); 67 | for batch in &preview { 68 | for user in batch.iter_views::<User>()?.try_flatten()? 
{ 69 | println!(" {} - {} ({:?})", user.id, user.name, user.score); 70 | } 71 | } 72 | 73 | // Commit 74 | tx.commit().await?; 75 | 76 | // Verify after commit 77 | let filter = Predicate::is_not_null(ColumnRef::new("id")); 78 | let committed = db.scan().filter(filter).collect().await?; 79 | 80 | println!("\nAfter commit:"); 81 | for batch in &committed { 82 | for user in batch.iter_views::<User>()?.try_flatten()? { 83 | println!(" {} - {} ({:?})", user.id, user.name, user.score); 84 | } 85 | } 86 | 87 | Ok(()) 88 | } 89 | -------------------------------------------------------------------------------- /src/key/heap_size.rs: -------------------------------------------------------------------------------- 1 | #![allow(dead_code)] 2 | 3 | use typed_arrow_dyn::DynCell; 4 | 5 | use super::{KeyOwned, KeyRow}; 6 | 7 | /// Estimate heap usage of key types used across memtables and indexes. 8 | /// Primitives and booleans report zero; buffer-backed components return their 9 | /// byte length; tuples sum their parts. 10 | pub trait KeyHeapSize { 11 | /// Approximate heap bytes consumed by the key representation. 12 | fn key_heap_size(&self) -> usize; 13 | } 14 | 15 | macro_rules! impl_key_size_prim { 16 | ($($t:ty),* $(,)?) => { 17 | $( 18 | impl KeyHeapSize for $t { 19 | fn key_heap_size(&self) -> usize { 20 | 0 21 | } 22 | } 23 | )* 24 | }; 25 | } 26 | 27 | impl_key_size_prim!(u8, u16, u32, u64, usize, i8, i16, i32, i64, isize, bool); 28 | 29 | impl<A: KeyHeapSize, B: KeyHeapSize> KeyHeapSize for (A, B) { 30 | fn key_heap_size(&self) -> usize { 31 | self.0.key_heap_size() + self.1.key_heap_size() 32 | } 33 | } 34 | 35 | impl<A: KeyHeapSize, B: KeyHeapSize, C: KeyHeapSize> KeyHeapSize for (A, B, C) { 36 | fn key_heap_size(&self) -> usize { 37 | self.0.key_heap_size() + self.1.key_heap_size() + self.2.key_heap_size() 38 | } 39 | } 40 | 41 | impl KeyHeapSize for KeyOwned { 42 | fn key_heap_size(&self) -> usize { 43 | self.as_row() 44 | .cells() 45 | .iter() 46 | .filter_map(|cell| cell.as_ref()) 47 | .map(dyn_cell_owned_heap_size) 48 | .sum() 49 | } 50 | } 51 | 52 | fn dyn_cell_owned_heap_size(cell: &DynCell) -> usize { 53 | match cell { 54 | DynCell::Str(value) => value.len(), 55 | DynCell::Bin(bytes) => bytes.len(), 56 | DynCell::Struct(values) => values 57 | .iter() 58 | .filter_map(|cell| cell.as_ref()) 59 | .map(dyn_cell_owned_heap_size) 60 | .sum(), 61 | DynCell::List(values) | DynCell::FixedSizeList(values) => values 62 | .iter() 63 | .filter_map(|cell| cell.as_ref()) 64 | .map(dyn_cell_owned_heap_size) 65 | .sum(), 66 | DynCell::Map(entries) => entries 67 | .iter() 68 | .map(|(key, value)| { 69 | dyn_cell_owned_heap_size(key) 70 | + value 71 | .as_ref() 72 | .map(dyn_cell_owned_heap_size) 73 | .unwrap_or_default() 74 | }) 75 | .sum(), 76 | DynCell::Union { value, .. } => value 77 | .as_deref() 78 | .map(dyn_cell_owned_heap_size) 79 | .unwrap_or_default(), 80 | _ => 0, 81 | } 82 | } 83 | 84 | impl KeyHeapSize for KeyRow { 85 | fn key_heap_size(&self) -> usize { 86 | self.heap_size() 87 | } 88 | } 89 | -------------------------------------------------------------------------------- /src/db/compaction.rs: -------------------------------------------------------------------------------- 1 | //! DB compaction integration. 2 | //! 3 | //! This module provides the bridge between the DB type and the compaction subsystem. 
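//!
//! A minimal sketch of the flow (simplified; `planner` and `executor` stand in
//! for whatever the caller supplies):
//!
//! ```ignore
//! // Build a driver on demand from the DB's manifest and WAL state...
//! let driver = db.compaction_driver();
//! // ...run one plan -> resolve -> execute -> apply-manifest cycle...
//! let _outcome = driver.run_compaction(&planner, &executor).await?;
//! // ...and drop WAL segments below the recorded floor.
//! driver.prune_wal_below_floor().await;
//! ```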
4 | 5 | use fusio::executor::{Executor, Timer}; 6 | 7 | use crate::{compaction::CompactionDriver, db::DbInner, manifest::ManifestFs}; 8 | #[cfg(all(test, feature = "tokio"))] 9 | use crate::{ 10 | compaction::{ 11 | executor::{CompactionError, CompactionExecutor, CompactionOutcome}, 12 | planner::CompactionPlanner, 13 | }, 14 | manifest::ManifestResult, 15 | }; 16 | 17 | impl<FS, E> DbInner<FS, E> 18 | where 19 | FS: ManifestFs<E>, 20 | E: Executor + Timer + Clone + 'static, 21 | <FS as fusio::fs::Fs>::File: fusio::durability::FileCommit, 22 | { 23 | /// Whether a background compaction worker was spawned for this DB. 24 | #[cfg(test)] 25 | pub fn has_compaction_worker(&self) -> bool { 26 | self.compaction_worker.is_some() 27 | } 28 | 29 | /// Create a compaction driver from this DB's manifest and configuration. 30 | /// 31 | /// The driver is created on demand; callers can Arc-wrap it for background workers. 32 | pub(crate) fn compaction_driver(&self) -> CompactionDriver<FS, E> { 33 | CompactionDriver::new( 34 | self.manifest.clone(), 35 | self.manifest_table, 36 | self.wal_config.clone(), 37 | self.wal_handle().cloned(), 38 | ) 39 | } 40 | 41 | /// Remove WAL segments whose sequence is older than the manifest floor. 42 | pub(crate) async fn prune_wal_segments_below_floor(&self) { 43 | self.compaction_driver().prune_wal_below_floor().await 44 | } 45 | 46 | /// Build a compaction plan based on the latest manifest snapshot. 47 | #[cfg(all(test, feature = "tokio"))] 48 | pub(crate) async fn plan_compaction_task<P>( 49 | &self, 50 | planner: &P, 51 | ) -> ManifestResult<Option<crate::compaction::planner::CompactionTask>> 52 | where 53 | P: CompactionPlanner, 54 | { 55 | self.compaction_driver().plan_compaction_task(planner).await 56 | } 57 | 58 | /// Sequence number of the WAL floor currently recorded in the manifest. 59 | #[cfg(all(test, feature = "tokio"))] 60 | pub(crate) async fn wal_floor_seq(&self) -> Option<u64> { 61 | self.compaction_driver().wal_floor_seq().await 62 | } 63 | 64 | /// End-to-end compaction orchestrator (plan -> resolve -> execute -> apply manifest). 65 | #[cfg(all(test, feature = "tokio"))] 66 | pub(crate) async fn run_compaction_task<CE, P>( 67 | &self, 68 | planner: &P, 69 | executor: &CE, 70 | ) -> Result<Option<CompactionOutcome>, CompactionError> 71 | where 72 | CE: CompactionExecutor, 73 | P: CompactionPlanner, 74 | { 75 | self.compaction_driver() 76 | .run_compaction(planner, executor) 77 | .await 78 | } 79 | } 80 | -------------------------------------------------------------------------------- /predicate/src/core/row_set.rs: -------------------------------------------------------------------------------- 1 | //! Shared row-set abstractions built on top of roaring bitmaps. 2 | 3 | use std::convert::TryFrom; 4 | 5 | use roaring::RoaringBitmap; 6 | 7 | /// Unique identifier for a row referenced by the planner. 8 | pub type RowId = u32; 9 | 10 | /// Borrowed iterator that yields [`RowId`] values. 11 | pub type RowIdIter<'a> = Box<dyn Iterator<Item = RowId> + Send + 'a>; 12 | 13 | /// Abstract set of row identifiers that supports basic set algebra. 14 | pub trait RowSet: Send + Sync { 15 | /// Returns the number of rows tracked by the set. 16 | fn len(&self) -> usize; 17 | 18 | /// Returns true when the set is empty. 19 | fn is_empty(&self) -> bool { 20 | self.len() == 0 21 | } 22 | 23 | /// Returns true when the set represents the whole universe of rows. 24 | fn is_full(&self) -> bool; 25 | 26 | /// Returns an iterator over row identifiers. 
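    ///
    /// The iterator is boxed (see [`RowIdIter`]) so different implementations can
    /// hand it back as a trait object.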
27 | fn iter(&self) -> RowIdIter<'_>; 28 | 29 | /// Returns the intersection between this set and `other`. 30 | fn intersect(&self, other: &Self) -> Self 31 | where 32 | Self: Sized; 33 | 34 | /// Returns the union between this set and `other`. 35 | fn union(&self, other: &Self) -> Self 36 | where 37 | Self: Sized; 38 | 39 | /// Returns the relative complement (`self \ other`). 40 | fn difference(&self, other: &Self) -> Self 41 | where 42 | Self: Sized; 43 | } 44 | 45 | /// [`RowSet`] implementation backed by a roaring bitmap. 46 | #[derive(Clone, Debug, Default)] 47 | pub struct BitmapRowSet { 48 | bitmap: RoaringBitmap, 49 | } 50 | 51 | impl BitmapRowSet { 52 | /// Creates an empty bitmap-backed row set. 53 | #[must_use] 54 | pub fn new() -> Self { 55 | Self::default() 56 | } 57 | 58 | /// Inserts a row identifier into the set. 59 | pub fn insert(&mut self, row: RowId) { 60 | self.bitmap.insert(row); 61 | } 62 | 63 | /// Returns true when the set contains the provided row identifier. 64 | #[must_use] 65 | pub fn contains(&self, row: RowId) -> bool { 66 | self.bitmap.contains(row) 67 | } 68 | } 69 | 70 | impl RowSet for BitmapRowSet { 71 | fn len(&self) -> usize { 72 | usize::try_from(self.bitmap.len()).unwrap_or(usize::MAX) 73 | } 74 | 75 | fn is_full(&self) -> bool { 76 | self.bitmap.is_full() 77 | } 78 | 79 | fn iter(&self) -> RowIdIter<'_> { 80 | Box::new(self.bitmap.iter()) 81 | } 82 | 83 | fn intersect(&self, other: &Self) -> Self { 84 | let bitmap = &self.bitmap & &other.bitmap; 85 | Self { bitmap } 86 | } 87 | 88 | fn union(&self, other: &Self) -> Self { 89 | let bitmap = &self.bitmap | &other.bitmap; 90 | Self { bitmap } 91 | } 92 | 93 | fn difference(&self, other: &Self) -> Self { 94 | let bitmap = &self.bitmap - &other.bitmap; 95 | Self { bitmap } 96 | } 97 | } 98 | -------------------------------------------------------------------------------- /examples/10_dynamic/10b_dynamic_metadata.rs: -------------------------------------------------------------------------------- 1 | // 03: Dynamic (runtime) schema: infer key from Arrow metadata 2 | 3 | use std::sync::Arc; 4 | 5 | use fusio::{executor::NoopExecutor, mem::fs::InMemoryFs}; 6 | use tonbo::prelude::*; 7 | use typed_arrow::{ 8 | arrow_array::RecordBatch, 9 | arrow_schema::{DataType, Field, Schema}, 10 | }; 11 | use typed_arrow_dyn::{DynBuilders, DynCell, DynRow}; 12 | 13 | fn build_batch(schema: Arc<Schema>, rows: Vec<DynRow>) -> RecordBatch { 14 | let mut builders = DynBuilders::new(schema.clone(), rows.len()); 15 | for row in rows { 16 | builders.append_option_row(Some(row)).expect("append row"); 17 | } 18 | builders.try_finish_into_batch().expect("record batch") 19 | } 20 | 21 | #[tokio::main] 22 | async fn main() { 23 | // Schema-level metadata: tonbo.keys = "id" 24 | let f_id = Field::new("id", DataType::Utf8, false); 25 | let f_score = Field::new("score", DataType::Int32, false); 26 | let mut md = std::collections::HashMap::new(); 27 | md.insert("tonbo.keys".to_string(), "id".to_string()); 28 | let schema = Arc::new(Schema::new(vec![f_id, f_score]).with_metadata(md)); 29 | 30 | // Build a batch 31 | let rows = vec![ 32 | DynRow(vec![Some(DynCell::Str("a".into())), Some(DynCell::I32(1))]), 33 | DynRow(vec![Some(DynCell::Str("b".into())), Some(DynCell::I32(2))]), 34 | ]; 35 | let batch: RecordBatch = build_batch(schema.clone(), rows); 36 | 37 | // Create DB from metadata 38 | let executor = Arc::new(NoopExecutor); 39 | let db: DB<InMemoryFs, NoopExecutor> = DbBuilder::from_schema_metadata(schema.clone()) 40 | 
.expect("metadata config") 41 | .in_memory("dynamic-metadata") 42 | .expect("in_memory config") 43 | .open_with_executor(Arc::clone(&executor)) 44 | .await 45 | .expect("metadata ok"); 46 | db.ingest(batch).await.expect("insert"); 47 | 48 | // Scan all rows using a trivial predicate 49 | let pred = Predicate::is_not_null(ColumnRef::new("id")); 50 | let rows: Vec<(String, i32)> = db 51 | .scan() 52 | .filter(pred) 53 | .collect() 54 | .await 55 | .expect("collect") 56 | .into_iter() 57 | .flat_map(|batch| { 58 | let ids = batch 59 | .column(0) 60 | .as_any() 61 | .downcast_ref::<typed_arrow::arrow_array::StringArray>() 62 | .expect("id col"); 63 | let vals = batch 64 | .column(1) 65 | .as_any() 66 | .downcast_ref::<typed_arrow::arrow_array::Int32Array>() 67 | .expect("v col"); 68 | ids.iter() 69 | .zip(vals.iter()) 70 | .filter_map(|(id, v)| Some((id?.to_string(), v?))) 71 | .collect::<Vec<_>>() 72 | }) 73 | .collect(); 74 | println!("dynamic (metadata) rows: {:?}", rows); 75 | } 76 | -------------------------------------------------------------------------------- /src/extractor/errors.rs: -------------------------------------------------------------------------------- 1 | use arrow_schema::{ArrowError, DataType, SchemaRef}; 2 | use typed_arrow_dyn::DynViewError; 3 | 4 | use crate::wal::WalError; 5 | 6 | /// Error returned when key extraction fails due to type/schema mismatches or out-of-bounds. 7 | #[derive(Debug, thiserror::Error)] 8 | pub enum KeyExtractError { 9 | /// Column index is outside the schema's field range. 10 | #[error("column index {0} out of bounds (num_columns={1})")] 11 | ColumnOutOfBounds(usize, usize), 12 | /// The field's Arrow data type does not match the extractor's expectation. 13 | #[error("unexpected data type for column {col}: expected {expected:?}, got {actual:?}")] 14 | WrongType { 15 | /// Column index with the mismatch. 16 | col: usize, 17 | /// Expected Arrow data type. 18 | expected: DataType, 19 | /// Actual Arrow data type. 20 | actual: DataType, 21 | }, 22 | /// Row index is outside the batch's row range. 23 | #[error("invalid row index {0} (num_rows={1})")] 24 | RowOutOfBounds(usize, usize), 25 | /// Encountered an unsupported Arrow type when extracting from a batch. 26 | #[error("unsupported data type for column {col}: {data_type:?}")] 27 | UnsupportedType { 28 | /// Column index of the unsupported field. 29 | col: usize, 30 | /// The Arrow data type that is not supported. 31 | data_type: DataType, 32 | }, 33 | /// Referenced field by name was not found in the schema. 34 | #[error("no such field in schema: {name}")] 35 | NoSuchField { 36 | /// The missing field name. 37 | name: String, 38 | }, 39 | /// Batch schema does not match the DB's configured schema (dynamic mode). 40 | #[error("schema mismatch: expected {expected:?}, got {actual:?}")] 41 | SchemaMismatch { 42 | /// The DB's configured schema. 43 | expected: SchemaRef, 44 | /// The incoming batch schema. 45 | actual: SchemaRef, 46 | }, 47 | /// Tombstone bitmap length does not match the batch row count. 48 | #[error("tombstone bitmap length mismatch: expected {expected}, got {actual}")] 49 | TombstoneLengthMismatch { 50 | /// Expected number of rows. 51 | expected: usize, 52 | /// Provided tombstone entries. 53 | actual: usize, 54 | }, 55 | /// WAL submission or durability hook failed while ingesting. 56 | #[error("wal error: {0}")] 57 | Wal(#[from] WalError), 58 | /// Generic Arrow failure while materializing dynamic rows. 
59 | #[error("arrow error: {0}")] 60 | Arrow(#[from] ArrowError), 61 | /// Error when viewing the data. 62 | #[error("dyn view error: {0}")] 63 | DynView(#[from] DynViewError), 64 | /// Mutable memtable has reached capacity and must be sealed. 65 | #[error("memtable full: capacity {capacity} exhausted")] 66 | MemtableFull { 67 | /// Maximum number of batches the memtable can hold. 68 | capacity: usize, 69 | }, 70 | } 71 | -------------------------------------------------------------------------------- /src/tests_internal/wasm_compat_e2e.rs: -------------------------------------------------------------------------------- 1 | #![cfg(feature = "tokio")] 2 | 3 | use std::sync::Arc; 4 | 5 | use arrow_array::{Int32Array, RecordBatch, StringArray}; 6 | use arrow_schema::{DataType, Field}; 7 | use fusio::{executor::NoopExecutor, impls::mem::fs::InMemoryFs}; 8 | 9 | use crate::db::{BatchesThreshold, ColumnRef, DB, Predicate, WalSyncPolicy}; 10 | 11 | #[path = "common/mod.rs"] 12 | mod common; 13 | use common::config_with_pk; 14 | 15 | /// Ensure a wasm-like config (in-memory FS + no-op executor) can ingest and scan end-to-end. 16 | #[tokio::test(flavor = "multi_thread", worker_threads = 2)] 17 | async fn wasm_like_in_memory_roundtrip() -> Result<(), Box<dyn std::error::Error>> { 18 | let config = config_with_pk( 19 | vec![ 20 | Field::new("id", DataType::Utf8, false), 21 | Field::new("v", DataType::Int32, false), 22 | ], 23 | &["id"], 24 | ); 25 | let schema = config.schema(); 26 | 27 | let exec = Arc::new(NoopExecutor); 28 | let mut inner: crate::db::DbInner<InMemoryFs, NoopExecutor> = 29 | DB::<InMemoryFs, NoopExecutor>::builder(config) 30 | .in_memory("wasm-compat-e2e")? 31 | .wal_sync_policy(WalSyncPolicy::Always) 32 | .open_with_executor(exec) 33 | .await? 34 | .into_inner(); 35 | 36 | inner.set_seal_policy(Arc::new(BatchesThreshold { batches: 1 })); 37 | 38 | let first = RecordBatch::try_new( 39 | schema.clone(), 40 | vec![ 41 | Arc::new(StringArray::from(vec!["a", "b"])) as _, 42 | Arc::new(Int32Array::from(vec![1, 2])) as _, 43 | ], 44 | )?; 45 | inner.ingest(first).await?; 46 | 47 | let second = RecordBatch::try_new( 48 | schema.clone(), 49 | vec![ 50 | Arc::new(StringArray::from(vec!["c"])) as _, 51 | Arc::new(Int32Array::from(vec![3])) as _, 52 | ], 53 | )?; 54 | inner.ingest(second).await?; 55 | 56 | let db = DB::from_inner(Arc::new(inner)); 57 | 58 | let predicate = Predicate::is_not_null(ColumnRef::new("id")); 59 | let mut rows: Vec<(String, i32)> = db 60 | .begin_snapshot() 61 | .await? 62 | .scan(&db) 63 | .filter(predicate) 64 | .collect() 65 | .await? 66 | .into_iter() 67 | .flat_map(|batch| { 68 | let ids = batch 69 | .column(0) 70 | .as_any() 71 | .downcast_ref::<arrow_array::StringArray>() 72 | .expect("id col"); 73 | let vals = batch 74 | .column(1) 75 | .as_any() 76 | .downcast_ref::<arrow_array::Int32Array>() 77 | .expect("v col"); 78 | ids.iter() 79 | .zip(vals.iter()) 80 | .filter_map(|(id, v)| Some((id?.to_string(), v?))) 81 | .collect::<Vec<_>>() 82 | }) 83 | .collect(); 84 | rows.sort(); 85 | assert_eq!( 86 | rows, 87 | vec![("a".into(), 1), ("b".into(), 2), ("c".into(), 3)] 88 | ); 89 | 90 | Ok(()) 91 | } 92 | -------------------------------------------------------------------------------- /src/compaction/scheduler.rs: -------------------------------------------------------------------------------- 1 | //! Send-friendly compaction scheduler primitives with lease issuance. 
2 | 3 | use std::future::Future; 4 | 5 | use futures::{FutureExt, SinkExt, StreamExt, channel::mpsc}; 6 | use thiserror::Error; 7 | use ulid::Ulid; 8 | 9 | use crate::{ 10 | compaction::{executor::CompactionLease, planner::CompactionTask}, 11 | mvcc::Timestamp, 12 | }; 13 | 14 | /// Single scheduled compaction task bundled with CAS context and a lease token. 15 | #[derive(Debug, Clone)] 16 | pub(super) struct ScheduledCompaction { 17 | pub(super) task: CompactionTask, 18 | pub(super) manifest_head: Option<Timestamp>, 19 | pub(super) lease: CompactionLease, 20 | } 21 | 22 | /// Errors that can surface while scheduling or draining compaction jobs. 23 | #[derive(Debug, Error)] 24 | pub(super) enum CompactionScheduleError { 25 | /// Scheduler channel closed. 26 | #[error("compaction scheduler closed")] 27 | Closed, 28 | } 29 | 30 | /// In-process scheduler that hands out leases and enqueues compaction tasks. 31 | #[derive(Debug)] 32 | pub(super) struct CompactionScheduler { 33 | tx: mpsc::Sender<ScheduledCompaction>, 34 | budget: usize, 35 | } 36 | 37 | impl CompactionScheduler { 38 | /// Create a scheduler with bounded capacity and a per-cycle drain budget. 39 | #[must_use] 40 | pub(super) fn new( 41 | capacity: usize, 42 | budget: usize, 43 | ) -> (Self, mpsc::Receiver<ScheduledCompaction>) { 44 | let (tx, rx) = mpsc::channel(capacity.max(1)); 45 | ( 46 | Self { 47 | tx, 48 | budget: budget.max(1), 49 | }, 50 | rx, 51 | ) 52 | } 53 | 54 | /// Enqueue a planned compaction task with an issued lease. 55 | pub(super) async fn enqueue( 56 | &self, 57 | task: CompactionTask, 58 | manifest_head: Option<Timestamp>, 59 | owner: impl Into<String>, 60 | ttl_ms: u64, 61 | ) -> Result<(), CompactionScheduleError> { 62 | let lease = CompactionLease { 63 | id: Ulid::new(), 64 | owner: owner.into(), 65 | ttl_ms, 66 | }; 67 | let mut tx = self.tx.clone(); 68 | tx.send(ScheduledCompaction { 69 | task, 70 | manifest_head, 71 | lease, 72 | }) 73 | .await 74 | .map_err(|_| CompactionScheduleError::Closed) 75 | } 76 | 77 | /// Drain up to the configured budget of scheduled jobs, invoking `f` per job. 
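    ///
    /// Jobs beyond the budget stay queued for the next drain cycle; a closed
    /// channel surfaces as [`CompactionScheduleError::Closed`].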
78 | pub(super) async fn drain_with_budget<F, Fut>( 79 | &self, 80 | rx: &mut mpsc::Receiver<ScheduledCompaction>, 81 | mut f: F, 82 | ) -> Result<(), CompactionScheduleError> 83 | where 84 | F: FnMut(ScheduledCompaction) -> Fut, 85 | Fut: Future<Output = ()>, 86 | { 87 | for _ in 0..self.budget { 88 | match rx.next().now_or_never() { 89 | Some(Some(job)) => f(job).await, 90 | Some(None) => return Err(CompactionScheduleError::Closed), 91 | None => break, 92 | } 93 | } 94 | Ok(()) 95 | } 96 | } 97 | -------------------------------------------------------------------------------- /src/tests_internal/conflict_e2e.rs: -------------------------------------------------------------------------------- 1 | #![cfg(feature = "tokio")] 2 | 3 | use std::sync::Arc; 4 | 5 | use arrow_schema::{DataType, Field}; 6 | use fusio::{executor::tokio::TokioExecutor, mem::fs::InMemoryFs}; 7 | use typed_arrow_dyn::{DynCell, DynRow}; 8 | 9 | use crate::db::{ColumnRef, DB, Predicate}; 10 | 11 | #[path = "common/mod.rs"] 12 | mod common; 13 | use common::config_with_pk; 14 | 15 | async fn make_db() -> Result<DB<InMemoryFs, TokioExecutor>, Box<dyn std::error::Error>> { 16 | let cfg = config_with_pk( 17 | vec![ 18 | Field::new("id", DataType::Utf8, false), 19 | Field::new("v", DataType::Int32, false), 20 | ], 21 | &["id"], 22 | ); 23 | let exec = Arc::new(TokioExecutor::default()); 24 | let db = DB::<InMemoryFs, TokioExecutor>::builder(cfg) 25 | .in_memory("conflict-e2e")? 26 | .open_with_executor(exec) 27 | .await?; 28 | Ok(db) 29 | } 30 | 31 | /// Conflicting writes on the same key should surface a conflict error and not apply the second 32 | /// write. 33 | #[tokio::test(flavor = "multi_thread", worker_threads = 2)] 34 | async fn transactional_conflict_detection_blocks_second_writer() 35 | -> Result<(), Box<dyn std::error::Error>> { 36 | let db = make_db().await?; 37 | 38 | // First transaction stages an update but does not commit yet. 39 | let mut tx1 = db.begin_transaction().await?; 40 | tx1.upsert(DynRow(vec![ 41 | Some(DynCell::Str("user".into())), 42 | Some(DynCell::I32(1)), 43 | ]))?; 44 | 45 | // Second transaction based on same snapshot attempts to write the same key. 46 | let mut tx2 = db.begin_transaction().await?; 47 | tx2.upsert(DynRow(vec![ 48 | Some(DynCell::Str("user".into())), 49 | Some(DynCell::I32(2)), 50 | ]))?; 51 | 52 | // Commit tx1, then tx2 should see a conflict. 53 | tx1.commit().await?; 54 | let commit2 = tx2.commit().await; 55 | 56 | // Confirm final visibility matches either conflict (only first) or overwrite if conflict not 57 | // detected. 
58 | let predicate = Predicate::is_not_null(ColumnRef::new("id")); 59 | let batches = db.scan().filter(predicate).collect().await?; 60 | let mut rows: Vec<(String, i32)> = batches 61 | .into_iter() 62 | .flat_map(|batch| { 63 | let ids = batch 64 | .column(0) 65 | .as_any() 66 | .downcast_ref::<arrow_array::StringArray>() 67 | .expect("id col"); 68 | let vals = batch 69 | .column(1) 70 | .as_any() 71 | .downcast_ref::<arrow_array::Int32Array>() 72 | .expect("v col"); 73 | ids.iter() 74 | .zip(vals.iter()) 75 | .filter_map(|(id, v)| Some((id?.to_string(), v?))) 76 | .collect::<Vec<_>>() 77 | }) 78 | .collect(); 79 | rows.sort(); 80 | if let Err(err) = commit2 { 81 | let msg = format!("{err}"); 82 | assert!( 83 | msg.contains("conflict") || msg.contains("Conflict"), 84 | "expected conflict error, got: {msg}" 85 | ); 86 | assert_eq!(rows, vec![("user".into(), 1)]); 87 | } else { 88 | assert_eq!(rows, vec![("user".into(), 2)]); 89 | } 90 | 91 | Ok(()) 92 | } 93 | -------------------------------------------------------------------------------- /src/db/tests/core/scan.rs: -------------------------------------------------------------------------------- 1 | use std::sync::Arc; 2 | 3 | use arrow_schema::{DataType, Field, Schema}; 4 | use fusio::{executor::NoopExecutor, mem::fs::InMemoryFs}; 5 | use tonbo_predicate::{ColumnRef, Predicate, ScalarValue}; 6 | use typed_arrow_dyn::{DynCell, DynRow}; 7 | 8 | use crate::{ 9 | db::{DB, DbInner}, 10 | extractor, 11 | inmem::policy::BatchesThreshold, 12 | mode::DynModeConfig, 13 | test::build_batch, 14 | }; 15 | 16 | #[tokio::test(flavor = "multi_thread", worker_threads = 1)] 17 | async fn plan_scan_filters_immutable_segments() { 18 | let db = db_with_immutable_keys(&["k1", "z1"]).await; 19 | let predicate = Predicate::eq(ColumnRef::new("id"), ScalarValue::from("k1")); 20 | let snapshot = db.begin_snapshot().await.expect("snapshot"); 21 | let plan = snapshot 22 | .plan_scan(&db, &predicate, None, None) 23 | .await 24 | .expect("plan"); 25 | // Pruning is currently disabled; expect to scan all immutables and retain the predicate 26 | // for residual evaluation. 27 | assert_eq!(plan.immutable_indexes, vec![0, 1]); 28 | assert!(plan.residual_predicate.is_some()); 29 | } 30 | 31 | #[tokio::test(flavor = "multi_thread", worker_threads = 1)] 32 | async fn plan_scan_preserves_residual_predicate() { 33 | let db = db_with_immutable_keys(&["k1"]).await; 34 | let key_pred = Predicate::eq(ColumnRef::new("id"), ScalarValue::from("k1")); 35 | let value_pred = Predicate::gt(ColumnRef::new("v"), ScalarValue::from(5i64)); 36 | let predicate = Predicate::and(vec![key_pred, value_pred]); 37 | let snapshot = db.begin_snapshot().await.expect("snapshot"); 38 | let plan = snapshot 39 | .plan_scan(&db, &predicate, None, None) 40 | .await 41 | .expect("plan"); 42 | assert!(plan.residual_predicate.is_some()); 43 | } 44 | 45 | #[tokio::test(flavor = "multi_thread", worker_threads = 1)] 46 | async fn plan_scan_marks_empty_range() { 47 | let db = db_with_immutable_keys(&["k1"]).await; 48 | let pred_a = Predicate::eq(ColumnRef::new("id"), ScalarValue::from("k1")); 49 | let pred_b = Predicate::eq(ColumnRef::new("id"), ScalarValue::from("k2")); 50 | let predicate = Predicate::and(vec![pred_a, pred_b]); 51 | let snapshot = db.begin_snapshot().await.expect("snapshot"); 52 | let plan = snapshot 53 | .plan_scan(&db, &predicate, None, None) 54 | .await 55 | .expect("plan"); 56 | // Pruning is currently disabled; even contradictory predicates scan all immutables. 
57 | assert_eq!(plan.immutable_indexes, vec![0]); 58 | } 59 | 60 | async fn db_with_immutable_keys(keys: &[&str]) -> DbInner<InMemoryFs, NoopExecutor> { 61 | let schema = Arc::new(Schema::new(vec![ 62 | Field::new("id", DataType::Utf8, false), 63 | Field::new("v", DataType::Int32, false), 64 | ])); 65 | let extractor = extractor::projection_for_field(schema.clone(), 0).expect("extractor"); 66 | let executor = Arc::new(NoopExecutor); 67 | let config = DynModeConfig::new(schema.clone(), extractor).expect("config"); 68 | let policy = Arc::new(BatchesThreshold { batches: 1 }); 69 | let db = DB::new_with_policy(config, Arc::clone(&executor), policy) 70 | .await 71 | .expect("db") 72 | .into_inner(); 73 | for (idx, key) in keys.iter().enumerate() { 74 | let rows = vec![DynRow(vec![ 75 | Some(DynCell::Str((*key).into())), 76 | Some(DynCell::I32(idx as i32)), 77 | ])]; 78 | let batch = build_batch(schema.clone(), rows).expect("batch"); 79 | db.ingest_with_tombstones(batch, vec![false]) 80 | .await 81 | .expect("ingest"); 82 | } 83 | db 84 | } 85 | -------------------------------------------------------------------------------- /examples/10_dynamic/10a_dynamic_basic.rs: -------------------------------------------------------------------------------- 1 | // 02: Dynamic (runtime) schema: key-by-name, insert a batch, and scan 2 | 3 | use std::sync::Arc; 4 | 5 | use fusio::{executor::tokio::TokioExecutor, mem::fs::InMemoryFs}; 6 | use tonbo::prelude::*; 7 | use typed_arrow::{ 8 | arrow_array::RecordBatch, 9 | arrow_schema::{DataType, Field, Schema}, 10 | }; 11 | use typed_arrow_dyn::{DynBuilders, DynCell, DynRow}; 12 | 13 | fn build_batch(schema: Arc<Schema>, rows: Vec<DynRow>) -> RecordBatch { 14 | let mut builders = DynBuilders::new(schema.clone(), rows.len()); 15 | for row in rows { 16 | builders.append_option_row(Some(row)).expect("append row"); 17 | } 18 | builders.try_finish_into_batch().expect("record batch") 19 | } 20 | 21 | #[tokio::main] 22 | async fn main() { 23 | // Define an Arrow schema at runtime (string key) 24 | let schema = Arc::new(Schema::new(vec![ 25 | Field::new("id", DataType::Utf8, false), 26 | Field::new("score", DataType::Int32, false), 27 | ])); 28 | 29 | // Build a RecordBatch from dynamic rows 30 | let rows = vec![ 31 | DynRow(vec![ 32 | Some(DynCell::Str("carol".into())), 33 | Some(DynCell::I32(30)), 34 | ]), 35 | DynRow(vec![ 36 | Some(DynCell::Str("dave".into())), 37 | Some(DynCell::I32(40)), 38 | ]), 39 | DynRow(vec![ 40 | Some(DynCell::Str("erin".into())), 41 | Some(DynCell::I32(50)), 42 | ]), 43 | ]; 44 | let batch: RecordBatch = build_batch(schema.clone(), rows); 45 | 46 | // Create a dynamic DB by specifying the key field name 47 | let db: DB<InMemoryFs, TokioExecutor> = DbBuilder::from_schema_key_name(schema.clone(), "id") 48 | .expect("key col") 49 | .in_memory("dynamic-basic") 50 | .expect("in_memory config") 51 | .open() 52 | .await 53 | .expect("schema ok"); 54 | db.ingest(batch).await.expect("insert dynamic batch"); 55 | 56 | let key_col = ColumnRef::new("id"); 57 | 58 | // Scan for a specific key (id == "carol") using predicate 59 | let carol_pred = Predicate::eq(key_col.clone(), ScalarValue::from("carol")); 60 | let out = scan_pairs(&db, carol_pred).await; 61 | println!("dynamic scan rows (carol): {:?}", out); 62 | 63 | // Query expression: id == "dave" 64 | let expr = Predicate::eq(key_col.clone(), ScalarValue::from("dave")); 65 | let out_q = scan_pairs(&db, expr).await; 66 | println!("dynamic query rows (id == dave): {:?}", out_q); 67 | 68 | // Scan all 
dynamic rows (id is not null) 69 | let all_pred = Predicate::is_not_null(key_col.clone()); 70 | let all_rows = scan_pairs(&db, all_pred).await; 71 | println!("dynamic rows (all): {:?}", all_rows); 72 | } 73 | 74 | async fn scan_pairs( 75 | db: &DB<InMemoryFs, TokioExecutor>, 76 | predicate: Predicate, 77 | ) -> Vec<(String, i32)> { 78 | let batches = db.scan().filter(predicate).collect().await.expect("scan"); 79 | batches 80 | .into_iter() 81 | .flat_map(|batch| { 82 | let ids = batch 83 | .column(0) 84 | .as_any() 85 | .downcast_ref::<typed_arrow::arrow_array::StringArray>() 86 | .expect("id col"); 87 | let vals = batch 88 | .column(1) 89 | .as_any() 90 | .downcast_ref::<typed_arrow::arrow_array::Int32Array>() 91 | .expect("v col"); 92 | ids.iter() 93 | .zip(vals.iter()) 94 | .filter_map(|(id, v)| Some((id?.to_string(), v?))) 95 | .collect::<Vec<_>>() 96 | }) 97 | .collect() 98 | } 99 | -------------------------------------------------------------------------------- /tests/s3_localstack_env.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # Bootstrap LocalStack for S3-backed tests and export TONBO_S3_* env vars. 3 | # Source this script to set env in the current shell: 4 | # source tests/s3_localstack_env.sh 5 | # 6 | # Variables (override as needed): 7 | # LOCALSTACK_CONTAINER (default: tonbo-localstack-e2e) 8 | # LOCALSTACK_PORT (default: 4566) 9 | # AWS_ACCESS_KEY_ID (default: test) 10 | # AWS_SECRET_ACCESS_KEY(default: test) 11 | # AWS_REGION (default: us-east-1) 12 | # BUCKET_NAME (default: tonbo-e2e) 13 | 14 | set -euo pipefail 15 | 16 | LOCALSTACK_CONTAINER=${LOCALSTACK_CONTAINER:-tonbo-localstack-e2e} 17 | LOCALSTACK_PORT=${LOCALSTACK_PORT:-4566} 18 | AWS_ACCESS_KEY_ID=${AWS_ACCESS_KEY_ID:-test} 19 | AWS_SECRET_ACCESS_KEY=${AWS_SECRET_ACCESS_KEY:-test} 20 | AWS_REGION=${AWS_REGION:-us-east-1} 21 | BUCKET_NAME=${BUCKET_NAME:-tonbo-e2e} 22 | 23 | tonbo_localstack_started_by_script=0 24 | tonbo_localstack_available=0 25 | 26 | ensure_localstack() { 27 | if command -v docker >/dev/null 2>&1; then 28 | if docker ps --format '{{.Names}}' | grep -q "^${LOCALSTACK_CONTAINER}\$"; then 29 | tonbo_localstack_available=1 30 | return 31 | fi 32 | 33 | if docker ps -a --format '{{.Names}}' | grep -q "^${LOCALSTACK_CONTAINER}\$"; then 34 | docker rm -f "${LOCALSTACK_CONTAINER}" >/dev/null 2>&1 || true 35 | fi 36 | 37 | echo "Starting LocalStack (${LOCALSTACK_CONTAINER}) on port ${LOCALSTACK_PORT}..." 38 | docker run -d --name "${LOCALSTACK_CONTAINER}" \ 39 | -e SERVICES="s3" \ 40 | -e AWS_ACCESS_KEY_ID="${AWS_ACCESS_KEY_ID}" \ 41 | -e AWS_SECRET_ACCESS_KEY="${AWS_SECRET_ACCESS_KEY}" \ 42 | -e AWS_DEFAULT_REGION="${AWS_REGION}" \ 43 | -p "${LOCALSTACK_PORT}:4566" \ 44 | localstack/localstack:latest >/dev/null 45 | tonbo_localstack_started_by_script=1 46 | tonbo_localstack_available=1 47 | 48 | echo -n "Waiting for LocalStack to become ready" 49 | until docker exec "${LOCALSTACK_CONTAINER}" awslocal s3api list-buckets >/dev/null 2>&1; do 50 | sleep 1 51 | printf '.' 52 | done 53 | echo 54 | 55 | echo "Provisioning S3 bucket ${BUCKET_NAME}..." 
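        # create-bucket can fail when the bucket already exists; `|| true` keeps this idempotent.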
56 | docker exec "${LOCALSTACK_CONTAINER}" awslocal s3api create-bucket --bucket "${BUCKET_NAME}" >/dev/null 2>&1 || true 57 | else 58 | echo "docker is required to start LocalStack; skipping LocalStack startup" >&2 59 | fi 60 | } 61 | 62 | export_s3_env() { 63 | if [ "${tonbo_localstack_available}" -ne 1 ]; then 64 | return 1 65 | fi 66 | export TONBO_S3_ENDPOINT="http://localhost:${LOCALSTACK_PORT}" 67 | export TONBO_S3_BUCKET="${BUCKET_NAME}" 68 | export TONBO_S3_REGION="${AWS_REGION}" 69 | export TONBO_S3_ACCESS_KEY="${AWS_ACCESS_KEY_ID}" 70 | export TONBO_S3_SECRET_KEY="${AWS_SECRET_ACCESS_KEY}" 71 | export TONBO_LOCALSTACK_CONTAINER="${LOCALSTACK_CONTAINER}" 72 | export TONBO_LOCALSTACK_STARTED_BY_SCRIPT="${tonbo_localstack_started_by_script}" 73 | } 74 | 75 | # If executed directly, start LocalStack and print exports for convenience. 76 | if [[ "${BASH_SOURCE[0]}" == "$0" ]]; then 77 | ensure_localstack 78 | if export_s3_env; then 79 | cat <<EOF 80 | export TONBO_S3_ENDPOINT=${TONBO_S3_ENDPOINT} 81 | export TONBO_S3_BUCKET=${TONBO_S3_BUCKET} 82 | export TONBO_S3_REGION=${TONBO_S3_REGION} 83 | export TONBO_S3_ACCESS_KEY=${TONBO_S3_ACCESS_KEY} 84 | export TONBO_S3_SECRET_KEY=${TONBO_S3_SECRET_KEY} 85 | export TONBO_LOCALSTACK_CONTAINER=${TONBO_LOCALSTACK_CONTAINER} 86 | export TONBO_LOCALSTACK_STARTED_BY_SCRIPT=${TONBO_LOCALSTACK_STARTED_BY_SCRIPT} 87 | EOF 88 | else 89 | echo "LocalStack not available; TONBO_S3_* not exported." >&2 90 | exit 1 91 | fi 92 | else 93 | ensure_localstack 94 | export_s3_env || true 95 | fi 96 | -------------------------------------------------------------------------------- /examples/10_dynamic/10c_dynamic_composite.rs: -------------------------------------------------------------------------------- 1 | // 04: Dynamic (runtime) schema: composite keys via metadata ordinals 2 | 3 | use std::{collections::HashMap, sync::Arc}; 4 | 5 | use fusio::{executor::NoopExecutor, mem::fs::InMemoryFs}; 6 | use tonbo::prelude::*; 7 | use typed_arrow::{ 8 | arrow_array::RecordBatch, 9 | arrow_schema::{DataType, Field, Schema}, 10 | }; 11 | use typed_arrow_dyn::{DynBuilders, DynCell, DynRow}; 12 | 13 | fn build_batch(schema: Arc<Schema>, rows: Vec<DynRow>) -> RecordBatch { 14 | let mut builders = DynBuilders::new(schema.clone(), rows.len()); 15 | for row in rows { 16 | builders.append_option_row(Some(row)).expect("append row"); 17 | } 18 | builders.try_finish_into_batch().expect("record batch") 19 | } 20 | 21 | #[tokio::main] 22 | async fn main() { 23 | // Field-level metadata: tonbo.key ordinals define lexicographic order 24 | let mut m1 = HashMap::new(); 25 | m1.insert("tonbo.key".to_string(), "1".to_string()); 26 | let mut m2 = HashMap::new(); 27 | m2.insert("tonbo.key".to_string(), "2".to_string()); 28 | let f_id = Field::new("id", DataType::Utf8, false).with_metadata(m1); 29 | let f_ts = Field::new("ts", DataType::Int64, false).with_metadata(m2); 30 | let f_v = Field::new("v", DataType::Int32, false); 31 | let schema = Arc::new(Schema::new(vec![f_id, f_ts, f_v])); 32 | 33 | // Create DB from metadata 34 | let executor = Arc::new(NoopExecutor); 35 | let db: DB<InMemoryFs, NoopExecutor> = DbBuilder::from_schema_metadata(schema.clone()) 36 | .expect("metadata config") 37 | .in_memory("dynamic-composite") 38 | .expect("in_memory config") 39 | .open_with_executor(Arc::clone(&executor)) 40 | .await 41 | .expect("composite ok"); 42 | 43 | // Build a batch with three rows 44 | let rows = vec![ 45 | DynRow(vec![ 46 | Some(DynCell::Str("a".into())), 47 | Some(DynCell::I64(10)), 
48 | Some(DynCell::I32(1)), 49 | ]), 50 | DynRow(vec![ 51 | Some(DynCell::Str("a".into())), 52 | Some(DynCell::I64(5)), 53 | Some(DynCell::I32(2)), 54 | ]), 55 | DynRow(vec![ 56 | Some(DynCell::Str("b".into())), 57 | Some(DynCell::I64(1)), 58 | Some(DynCell::I32(3)), 59 | ]), 60 | ]; 61 | let batch: RecordBatch = build_batch(schema.clone(), rows); 62 | db.ingest(batch).await.expect("insert"); 63 | 64 | // Predicate over composite key: id = 'a' AND ts BETWEEN 5 AND 10 65 | let pred = Predicate::and(vec![ 66 | Predicate::eq(ColumnRef::new("id"), ScalarValue::from("a")), 67 | Predicate::and(vec![ 68 | Predicate::gte(ColumnRef::new("ts"), ScalarValue::from(5i64)), 69 | Predicate::lte(ColumnRef::new("ts"), ScalarValue::from(10i64)), 70 | ]), 71 | ]); 72 | 73 | let got: Vec<(String, i64)> = db 74 | .scan() 75 | .filter(pred) 76 | .collect() 77 | .await 78 | .expect("collect") 79 | .into_iter() 80 | .flat_map(|batch| { 81 | let ids = batch 82 | .column(0) 83 | .as_any() 84 | .downcast_ref::<typed_arrow::arrow_array::StringArray>() 85 | .expect("id col"); 86 | let ts = batch 87 | .column(1) 88 | .as_any() 89 | .downcast_ref::<typed_arrow::arrow_array::Int64Array>() 90 | .expect("ts col"); 91 | ids.iter() 92 | .zip(ts.iter()) 93 | .filter_map(|(id, t)| Some((id?.to_string(), t?))) 94 | .collect::<Vec<_>>() 95 | }) 96 | .collect(); 97 | println!("dynamic composite range rows: {:?}", got); 98 | } 99 | -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | edition = "2024" 3 | name = "tonbo" 4 | resolver = "2" 5 | version = "0.4.0-a0" 6 | description = "Embedded database for serverless and edge runtimes, storing data as Parquet on S3" 7 | license = "Apache-2.0" 8 | repository = "https://github.com/tonbo-io/tonbo" 9 | readme = "README.md" 10 | 11 | [workspace] 12 | members = [".", "predicate"] 13 | 14 | 15 | [features] 16 | default = ["tokio", "typed-arrow"] 17 | tokio = [ 18 | "dep:tokio", 19 | "fusio-manifest/tokio", 20 | "fusio/executor-tokio", 21 | "fusio/fs", 22 | "fusio/tokio", 23 | "fusio/tokio-http", 24 | ] 25 | typed-arrow = ["dep:typed-arrow"] 26 | # Web/wasm build using Fusio's WebExecutor and wasm-http client stack. 27 | web = [ 28 | "fusio-manifest/web", 29 | "fusio-parquet/executor-web", 30 | "fusio/aws", 31 | "fusio/executor-web", 32 | "fusio/web-http", 33 | ] 34 | # Optional OPFS support layered on the web stack. 35 | web-opfs = ["fusio/opfs", "web"] 36 | # Enable this to compile the optional S3 smoke test in `tests/s3_smoke.rs`. 
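# Typically exercised via `./tests/s3_smoke.sh`, which runs
# `cargo test --features s3-smoke --test s3_smoke` against LocalStack.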
37 | s3-smoke = [] 38 | 39 | [dependencies] 40 | anyhow = "1" 41 | arrow-array = "56.2.0" 42 | arrow-buffer = "56.2.0" 43 | arrow-ipc = "56.1.0" 44 | arrow-schema = { version = "56.2.0", features = ["serde"] } 45 | arrow-select = "56.2.0" 46 | crc32c = "0.6" 47 | crossbeam-skiplist = "0.1" 48 | fusio = { version = "0.5.0", default-features = false, features = [ 49 | "aws", 50 | "dyn", 51 | "executor", 52 | "fs", 53 | ] } 54 | fusio-manifest = { version = "0.5.0", package = "fusio-manifest", default-features = false, features = [ 55 | "std", 56 | ] } 57 | fusio-parquet = { version = "0.5.0", package = "fusio-parquet" } 58 | futures = "0.3" 59 | lockable = "0.2" 60 | once_cell = "1" 61 | parking_lot = "0.12" 62 | parquet = { version = "56.2.0", default-features = false, features = [ 63 | "async", 64 | "zstd", 65 | ] } 66 | pin-project-lite = "0.2" 67 | serde = { version = "1", features = ["derive", "rc"] } 68 | serde_json = "1" 69 | sha2 = "0.10" 70 | thiserror = "2.0.17" 71 | tokio = { version = "1", default-features = false, features = [ 72 | "macros", 73 | "net", 74 | "rt", 75 | "rt-multi-thread", 76 | "sync", 77 | "time", 78 | ], optional = true } 79 | tonbo-predicate = { version = "0.1.0", path = "predicate" } 80 | typed-arrow = { version = "0.5.1", features = ["ext-hooks"], optional = true } 81 | typed-arrow-dyn = { version = "0.0.6", features = ["serde"] } 82 | ulid = { version = "1", features = ["serde"] } 83 | 84 | [target.'cfg(target_arch = "wasm32")'.dependencies] 85 | # Ensure getrandom picks the JS backend via cfg flag when building wasm32-unknown-unknown. 86 | getrandom = { version = "0.3", features = ["wasm_js"] } 87 | js-sys = "0.3" 88 | 89 | 90 | [dev-dependencies] 91 | clap = { version = "4.5.4", features = ["derive"] } 92 | futures = "0.3" 93 | tempfile = "3" 94 | typed-arrow = { version = "0.5.1", features = ["ext-hooks"] } 95 | 96 | [target.'cfg(not(target_arch = "wasm32"))'.dev-dependencies] 97 | tokio = { version = "1", default-features = false, features = [ 98 | "macros", 99 | "net", 100 | "rt", 101 | "rt-multi-thread", 102 | "sync", 103 | "time", 104 | ] } 105 | 106 | [target.'cfg(target_arch = "wasm32")'.dev-dependencies] 107 | wasm-bindgen = "0.2" 108 | wasm-bindgen-futures = "0.4" 109 | wasm-bindgen-test = "0.3" 110 | 111 | [[test]] 112 | name = "read_smoke" 113 | path = "tests/read_smoke.rs" 114 | required-features = ["tokio"] 115 | 116 | [[test]] 117 | name = "s3_smoke" 118 | path = "tests/s3_smoke.rs" 119 | required-features = ["tokio"] 120 | 121 | [[example]] 122 | name = "10a_dynamic_basic" 123 | path = "examples/10_dynamic/10a_dynamic_basic.rs" 124 | 125 | [[example]] 126 | name = "10b_dynamic_metadata" 127 | path = "examples/10_dynamic/10b_dynamic_metadata.rs" 128 | 129 | [[example]] 130 | name = "10c_dynamic_composite" 131 | path = "examples/10_dynamic/10c_dynamic_composite.rs" 132 | 133 | [[example]] 134 | name = "10d_dynamic_transaction" 135 | path = "examples/10_dynamic/10d_dynamic_transaction.rs" 136 | -------------------------------------------------------------------------------- /examples/10_dynamic/10d_dynamic_transaction.rs: -------------------------------------------------------------------------------- 1 | // 05: Transactional writes (strict WAL) with optimistic staging and commit 2 | 3 | use fusio::{disk::LocalFs, executor::tokio::TokioExecutor}; 4 | use futures::StreamExt; 5 | use tonbo::prelude::*; 6 | use typed_arrow::{ 7 | Record, 8 | arrow_array::{Int32Array, StringArray}, 9 | schema::SchemaMeta, 10 | }; 11 | 12 | #[derive(Record)] 13 | 
struct UserRow { 14 | id: String, 15 | v: Option<i32>, 16 | } 17 | 18 | #[tokio::main] 19 | async fn main() { 20 | // Compile-time schema via typed-arrow derive; v is nullable, id is not. 21 | let schema = <UserRow as SchemaMeta>::schema(); 22 | 23 | // Configure dynamic mode with strict (durable) commit acknowledgements. 24 | // Use a temporary on-disk layout to enable WAL-backed transactions. 25 | let db = DbBuilder::from_schema_key_name(schema.clone(), "id") 26 | .expect("config") 27 | .with_commit_ack_mode(CommitAckMode::Strict) 28 | .on_disk("/tmp/tonbo") 29 | .expect("on_disk") 30 | .open() 31 | .await 32 | .expect("open db"); 33 | 34 | // // Build a RecordBatch using typed-arrow row builders. 35 | // let rows = vec![ 36 | // UserRow { 37 | // id: "user-1".into(), 38 | // v: Some(10), 39 | // }, 40 | // UserRow { 41 | // id: "user-2".into(), 42 | // v: None, // demonstrate nullable value 43 | // }, 44 | // ]; 45 | // let mut builders = <UserRow as BuildRows>::new_builders(rows.len()); 46 | // builders.append_rows(rows); 47 | // let batch = builders.finish().into_record_batch(); 48 | 49 | // // Begin a transaction and stage mutations. 50 | let tx = db.begin_transaction().await.expect("begin tx"); 51 | // tx.upsert_batch(&batch).expect("stage batch"); 52 | // tx.delete("ghost").expect("stage delete"); 53 | 54 | // Read-your-writes inside the transaction. 55 | let pred = Predicate::eq(ColumnRef::new("id"), ScalarValue::from("user-1")); 56 | let preview_batches = tx.scan().filter(pred).collect().await.expect("preview"); 57 | let mut preview_rows = Vec::new(); 58 | for batch in &preview_batches { 59 | let ids = batch 60 | .column(0) 61 | .as_any() 62 | .downcast_ref::<StringArray>() 63 | .expect("id col"); 64 | let vals = batch 65 | .column(1) 66 | .as_any() 67 | .downcast_ref::<Int32Array>() 68 | .expect("v col"); 69 | for (id, v) in ids.iter().zip(vals.iter()) { 70 | let key = id.unwrap_or("<null>").to_string(); 71 | let val = v.unwrap_or(0); 72 | preview_rows.push(format!("id={key}, v={val}")); 73 | } 74 | } 75 | println!("preview rows: {:?}", preview_rows); 76 | 77 | // Commit with strict WAL durability. 78 | tx.commit().await.expect("commit"); 79 | 80 | // Post-commit read via the public scan path. 81 | let all_pred = Predicate::is_not_null(ColumnRef::new("id")); 82 | let committed = scan_pairs(&db, &all_pred).await; 83 | println!("committed rows: {:?}", committed); 84 | } 85 | 86 | async fn scan_pairs(db: &DB<LocalFs, TokioExecutor>, predicate: &Predicate) -> Vec<(String, i32)> { 87 | let mut stream = db 88 | .scan() 89 | .filter(predicate.clone()) 90 | .stream() 91 | .await 92 | .expect("scan"); 93 | let mut out = Vec::new(); 94 | while let Some(batch) = stream.next().await { 95 | let batch = batch.expect("batch"); 96 | let ids = batch 97 | .column(0) 98 | .as_any() 99 | .downcast_ref::<StringArray>() 100 | .expect("id col"); 101 | let vals = batch 102 | .column(1) 103 | .as_any() 104 | .downcast_ref::<Int32Array>() 105 | .expect("v col"); 106 | for (id, v) in ids.iter().zip(vals.iter()) { 107 | if let Some(id) = id { 108 | out.push((id.to_string(), v.unwrap_or_default())); 109 | } 110 | } 111 | } 112 | out 113 | } 114 | -------------------------------------------------------------------------------- /predicate/src/core/builder.rs: -------------------------------------------------------------------------------- 1 | //! Small helpers for building predicates. 
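//!
//! A small illustrative sketch (the column names and literals are arbitrary):
//!
//! ```ignore
//! use tonbo_predicate::{ColumnRef, Predicate, ScalarValue};
//!
//! // id == "a" AND ts >= 5
//! let pred = Predicate::and(vec![
//!     Predicate::eq(ColumnRef::new("id"), ScalarValue::from("a")),
//!     Predicate::gte(ColumnRef::new("ts"), ScalarValue::from(5i64)),
//! ]);
//! ```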
2 | use super::{ComparisonOp, Operand, Predicate, PredicateNode, ScalarValue}; 3 | 4 | /// Convenience constructors mirroring DataFusion-style expression helpers. 5 | impl Predicate { 6 | /// Returns a predicate that always evaluates to true (matches all rows). 7 | #[must_use] 8 | pub fn always() -> Self { 9 | Predicate::from_kind(PredicateNode::True) 10 | } 11 | 12 | /// Create a comparison predicate. 13 | #[must_use] 14 | pub fn compare<L, R>(left: L, op: ComparisonOp, right: R) -> Self 15 | where 16 | L: Into<Operand>, 17 | R: Into<Operand>, 18 | { 19 | Predicate::from_kind(PredicateNode::Compare { 20 | left: left.into(), 21 | op, 22 | right: right.into(), 23 | }) 24 | } 25 | 26 | /// Equality predicate. 27 | #[must_use] 28 | pub fn eq<L, R>(left: L, right: R) -> Self 29 | where 30 | L: Into<Operand>, 31 | R: Into<Operand>, 32 | { 33 | Self::compare(left, ComparisonOp::Equal, right) 34 | } 35 | 36 | /// Inequality predicate. 37 | #[must_use] 38 | pub fn neq<L, R>(left: L, right: R) -> Self 39 | where 40 | L: Into<Operand>, 41 | R: Into<Operand>, 42 | { 43 | Self::compare(left, ComparisonOp::NotEqual, right) 44 | } 45 | 46 | /// Less-than predicate. 47 | #[must_use] 48 | pub fn lt<L, R>(left: L, right: R) -> Self 49 | where 50 | L: Into<Operand>, 51 | R: Into<Operand>, 52 | { 53 | Self::compare(left, ComparisonOp::LessThan, right) 54 | } 55 | 56 | /// Less-than-or-equal predicate. 57 | #[must_use] 58 | pub fn lte<L, R>(left: L, right: R) -> Self 59 | where 60 | L: Into<Operand>, 61 | R: Into<Operand>, 62 | { 63 | Self::compare(left, ComparisonOp::LessThanOrEqual, right) 64 | } 65 | 66 | /// Greater-than predicate. 67 | #[must_use] 68 | pub fn gt<L, R>(left: L, right: R) -> Self 69 | where 70 | L: Into<Operand>, 71 | R: Into<Operand>, 72 | { 73 | Self::compare(left, ComparisonOp::GreaterThan, right) 74 | } 75 | 76 | /// Greater-than-or-equal predicate. 77 | #[must_use] 78 | pub fn gte<L, R>(left: L, right: R) -> Self 79 | where 80 | L: Into<Operand>, 81 | R: Into<Operand>, 82 | { 83 | Self::compare(left, ComparisonOp::GreaterThanOrEqual, right) 84 | } 85 | 86 | /// `IN` list predicate. 87 | #[must_use] 88 | pub fn in_list<O, I>(expr: O, list: I) -> Self 89 | where 90 | O: Into<Operand>, 91 | I: IntoIterator<Item = ScalarValue>, 92 | { 93 | Predicate::from_kind(PredicateNode::InList { 94 | expr: expr.into(), 95 | list: list.into_iter().collect(), 96 | negated: false, 97 | }) 98 | } 99 | 100 | /// `NOT IN` list predicate. 101 | #[must_use] 102 | pub fn not_in_list<O, I>(expr: O, list: I) -> Self 103 | where 104 | O: Into<Operand>, 105 | I: IntoIterator<Item = ScalarValue>, 106 | { 107 | Predicate::from_kind(PredicateNode::InList { 108 | expr: expr.into(), 109 | list: list.into_iter().collect(), 110 | negated: true, 111 | }) 112 | } 113 | 114 | /// `IS NULL` predicate. 115 | #[must_use] 116 | pub fn is_null<O>(expr: O) -> Self 117 | where 118 | O: Into<Operand>, 119 | { 120 | Predicate::from_kind(PredicateNode::IsNull { 121 | expr: expr.into(), 122 | negated: false, 123 | }) 124 | } 125 | 126 | /// `IS NOT NULL` predicate. 127 | #[must_use] 128 | pub fn is_not_null<O>(expr: O) -> Self 129 | where 130 | O: Into<Operand>, 131 | { 132 | Predicate::from_kind(PredicateNode::IsNull { 133 | expr: expr.into(), 134 | negated: true, 135 | }) 136 | } 137 | 138 | /// Logical negation. 
139 | #[must_use] 140 | #[allow(clippy::should_implement_trait)] 141 | pub fn not(self) -> Self { 142 | Predicate::from_kind(PredicateNode::Not(Box::new(self))) 143 | } 144 | } 145 | -------------------------------------------------------------------------------- /src/tests_internal/backend.rs: -------------------------------------------------------------------------------- 1 | #![cfg(test)] 2 | 3 | use std::{ 4 | env, 5 | path::PathBuf, 6 | sync::Arc, 7 | time::{Duration, SystemTime, UNIX_EPOCH}, 8 | }; 9 | 10 | use fusio::{DynFs, disk::LocalFs, path::Path as FusioPath}; 11 | 12 | use crate::{ 13 | db::{AwsCreds, ObjectSpec, S3Spec, WalConfig}, 14 | wal::{WalSyncPolicy, state::FsWalStateStore}, 15 | }; 16 | 17 | /// Local filesystem backend harness metadata. 18 | pub struct LocalHarness { 19 | pub root: PathBuf, 20 | pub wal_dir: PathBuf, 21 | pub wal_config: WalConfig, 22 | pub cleanup: Option<Box<dyn FnOnce() + Send>>, 23 | } 24 | 25 | /// S3/object-store backend harness metadata. 26 | pub struct S3Harness { 27 | pub object: ObjectSpec, 28 | pub wal_config: WalConfig, 29 | } 30 | 31 | /// Common WAL tuning for e2e forcing small segments and fast flush. 32 | pub fn wal_tuning(policy: WalSyncPolicy) -> WalConfig { 33 | WalConfig::default() 34 | .segment_max_bytes(256) 35 | .flush_interval(Duration::from_millis(1)) 36 | .sync_policy(policy) 37 | } 38 | 39 | fn workspace_temp_dir(prefix: &str) -> PathBuf { 40 | let base = std::env::current_dir().expect("cwd"); 41 | let dir = base.join("target").join("tmp").join(format!( 42 | "{prefix}-{}", 43 | SystemTime::now() 44 | .duration_since(UNIX_EPOCH) 45 | .expect("time") 46 | .as_nanos() 47 | )); 48 | std::fs::create_dir_all(&dir).expect("create workspace temp dir"); 49 | dir 50 | } 51 | 52 | /// Local filesystem backend harness. 53 | pub fn local_harness( 54 | label: &str, 55 | wal_cfg: WalConfig, 56 | ) -> Result<LocalHarness, Box<dyn std::error::Error>> { 57 | let root = workspace_temp_dir(label); 58 | let wal_dir = root.join("wal"); 59 | std::fs::create_dir_all(&wal_dir)?; 60 | 61 | let wal_path = FusioPath::from_filesystem_path(&wal_dir)?; 62 | let wal_fs = Arc::new(LocalFs {}); 63 | let wal_backend: Arc<dyn DynFs> = wal_fs.clone(); 64 | let wal_state = Arc::new(FsWalStateStore::new(wal_fs)); 65 | let wal_config = wal_cfg 66 | .clone() 67 | .wal_dir(wal_path) 68 | .segment_backend(wal_backend) 69 | .state_store(Some(wal_state)); 70 | 71 | Ok(LocalHarness { 72 | root: root.clone(), 73 | wal_dir, 74 | wal_config, 75 | cleanup: Some(Box::new(move || { 76 | if let Err(err) = std::fs::remove_dir_all(&root) { 77 | eprintln!("cleanup failed for {:?}: {err}", &root); 78 | } 79 | })), 80 | }) 81 | } 82 | 83 | fn s3_env() -> Option<(String, String, String, String, String, Option<String>)> { 84 | let endpoint = env::var("TONBO_S3_ENDPOINT").ok()?; 85 | let bucket = env::var("TONBO_S3_BUCKET").ok()?; 86 | let region = env::var("TONBO_S3_REGION").ok()?; 87 | let access = env::var("TONBO_S3_ACCESS_KEY").ok()?; 88 | let secret = env::var("TONBO_S3_SECRET_KEY").ok()?; 89 | let session = env::var("TONBO_S3_SESSION_TOKEN").ok(); 90 | Some((endpoint, bucket, region, access, secret, session)) 91 | } 92 | 93 | fn unique_label(base: &str) -> String { 94 | let nanos = SystemTime::now() 95 | .duration_since(UNIX_EPOCH) 96 | .unwrap_or_else(|_| Duration::from_secs(0)) 97 | .as_nanos(); 98 | format!("{base}-{nanos}") 99 | } 100 | 101 | /// S3/object-store backend harness. Returns None when env is not present. 
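///
/// "Env" here means the `TONBO_S3_ENDPOINT`, `TONBO_S3_BUCKET`, `TONBO_S3_REGION`,
/// `TONBO_S3_ACCESS_KEY`, and `TONBO_S3_SECRET_KEY` variables (plus an optional
/// `TONBO_S3_SESSION_TOKEN`) read by `s3_env`.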
102 | pub fn maybe_s3_harness( 103 | label: &str, 104 | wal_cfg: WalConfig, 105 | ) -> Result<Option<S3Harness>, Box<dyn std::error::Error>> { 106 | let Some((endpoint, bucket, region, access, secret, session)) = s3_env() else { 107 | return Ok(None); 108 | }; 109 | 110 | let credentials = match session { 111 | Some(token) => AwsCreds::with_session_token(access, secret, token), 112 | None => AwsCreds::new(access, secret), 113 | }; 114 | 115 | let mut s3 = S3Spec::new(bucket.clone(), unique_label(label), credentials); 116 | s3.endpoint = Some(endpoint); 117 | s3.region = Some(region); 118 | s3.sign_payload = Some(true); 119 | 120 | let object = ObjectSpec::s3(s3); 121 | 122 | Ok(Some(S3Harness { 123 | object, 124 | wal_config: wal_cfg, 125 | })) 126 | } 127 | -------------------------------------------------------------------------------- /examples/04_s3.rs: -------------------------------------------------------------------------------- 1 | //! S3 Object Storage: serverless database on S3-compatible storage 2 | //! 3 | //! This example shows how to use Tonbo with S3 (or MinIO, R2, etc.) 4 | //! The database is just a manifest and Parquet files - no server process needed. 5 | //! 6 | //! Required environment variables: 7 | //! TONBO_S3_BUCKET - S3 bucket name 8 | //! TONBO_S3_ENDPOINT - S3 endpoint URL (for MinIO/R2/LocalStack) 9 | //! TONBO_S3_REGION - AWS region (e.g., "us-east-1") 10 | //! AWS_ACCESS_KEY_ID - Access key 11 | //! AWS_SECRET_ACCESS_KEY - Secret key 12 | //! 13 | //! For local testing with MinIO: 14 | //! docker run -p 9000:9000 -p 9001:9001 minio/minio server /data --console-address ":9001" 15 | //! # Create bucket "tonbo-test" in MinIO console at http://localhost:9001 16 | //! 17 | //! Then run: 18 | //! TONBO_S3_BUCKET=tonbo-test \ 19 | //! TONBO_S3_ENDPOINT=http://localhost:9000 \ 20 | //! TONBO_S3_REGION=us-east-1 \ 21 | //! AWS_ACCESS_KEY_ID=minioadmin \ 22 | //! AWS_SECRET_ACCESS_KEY=minioadmin \ 23 | //! cargo run --example 04_s3 24 | //! 25 | //! Run: cargo run --example 04_s3 26 | 27 | use std::env; 28 | 29 | use tonbo::{ 30 | db::{AwsCreds, ObjectSpec, S3Spec}, 31 | prelude::*, 32 | }; 33 | 34 | #[derive(Record)] 35 | struct Event { 36 | #[metadata(k = "tonbo.key", v = "true")] 37 | id: String, 38 | event_type: String, 39 | payload: Option<String>, 40 | } 41 | 42 | #[tokio::main] 43 | async fn main() -> Result<(), Box<dyn std::error::Error>> { 44 | // Read S3 configuration from environment 45 | let bucket = env::var("TONBO_S3_BUCKET").map_err(|_| "TONBO_S3_BUCKET not set")?; 46 | let endpoint = env::var("TONBO_S3_ENDPOINT").ok(); 47 | let region = env::var("TONBO_S3_REGION").unwrap_or_else(|_| "us-east-1".into()); 48 | let credentials = AwsCreds::from_env()?; 49 | 50 | // Create S3 specification 51 | // The "prefix" is a folder path within the bucket for this table 52 | let prefix = format!("tonbo-example-{}", std::process::id()); 53 | let mut s3_spec = S3Spec::new(&bucket, &prefix, credentials); 54 | s3_spec.endpoint = endpoint.clone(); 55 | s3_spec.region = Some(region.clone()); 56 | s3_spec.sign_payload = Some(true); // Required for MinIO 57 | 58 | println!("Connecting to S3..."); 59 | println!(" Bucket: {}", bucket); 60 | println!(" Prefix: {}", prefix); 61 | if let Some(ep) = &endpoint { 62 | println!(" Endpoint: {}", ep); 63 | } 64 | 65 | // Open database on S3 66 | let db = DbBuilder::from_schema(Event::schema())? 67 | .object_store(ObjectSpec::s3(s3_spec))? 
68 | .open() 69 | .await?; 70 | 71 | println!("\nDatabase opened on S3!"); 72 | 73 | // Insert data - writes go to S3 74 | let events = vec![ 75 | Event { 76 | id: "evt-001".into(), 77 | event_type: "user.created".into(), 78 | payload: Some(r#"{"user_id": 42}"#.into()), 79 | }, 80 | Event { 81 | id: "evt-002".into(), 82 | event_type: "order.placed".into(), 83 | payload: Some(r#"{"order_id": 123}"#.into()), 84 | }, 85 | Event { 86 | id: "evt-003".into(), 87 | event_type: "user.created".into(), 88 | payload: None, 89 | }, 90 | ]; 91 | 92 | let mut builders = Event::new_builders(events.len()); 93 | builders.append_rows(events); 94 | db.ingest(builders.finish().into_record_batch()).await?; 95 | 96 | println!("Inserted 3 events to S3"); 97 | 98 | // Query from S3 99 | let filter = Predicate::eq( 100 | ColumnRef::new("event_type"), 101 | ScalarValue::from("user.created"), 102 | ); 103 | let batches = db.scan().filter(filter).collect().await?; 104 | 105 | println!("\nEvents where event_type = 'user.created':"); 106 | for batch in &batches { 107 | for event in batch.iter_views::<Event>()?.try_flatten()? { 108 | let payload = event.payload.unwrap_or("(none)"); 109 | println!(" {} - {} | {}", event.id, event.event_type, payload); 110 | } 111 | } 112 | 113 | println!("\nData is stored as Parquet files on S3:"); 114 | println!(" s3://{}/{}/", bucket, prefix); 115 | 116 | Ok(()) 117 | } 118 | -------------------------------------------------------------------------------- /examples/07_streaming.rs: -------------------------------------------------------------------------------- 1 | //! Streaming: process large datasets without loading everything into memory 2 | //! 3 | //! Run: cargo run --example 07_streaming 4 | 5 | use futures::StreamExt; 6 | use tonbo::prelude::*; 7 | 8 | #[derive(Record)] 9 | struct LogEntry { 10 | #[metadata(k = "tonbo.key", v = "true")] 11 | id: i64, 12 | level: String, 13 | message: String, 14 | } 15 | 16 | #[tokio::main] 17 | async fn main() -> Result<(), Box<dyn std::error::Error>> { 18 | let db = DbBuilder::from_schema(LogEntry::schema())? 19 | .on_disk("/tmp/tonbo_streaming")? 
20 | .open() 21 | .await?; 22 | 23 | // Insert a batch of log entries 24 | let entries: Vec<LogEntry> = (0..1000) 25 | .map(|i| LogEntry { 26 | id: i, 27 | level: match i % 3 { 28 | 0 => "INFO".into(), 29 | 1 => "WARN".into(), 30 | _ => "ERROR".into(), 31 | }, 32 | message: format!("Log message #{}", i), 33 | }) 34 | .collect(); 35 | 36 | let mut builders = LogEntry::new_builders(entries.len()); 37 | builders.append_rows(entries); 38 | db.ingest(builders.finish().into_record_batch()).await?; 39 | 40 | println!("Inserted 1000 log entries\n"); 41 | 42 | // Method 1: collect() - loads all matching rows into memory 43 | // Good for small result sets 44 | println!("=== Method 1: collect() ==="); 45 | let filter = Predicate::eq(ColumnRef::new("level"), ScalarValue::from("ERROR")); 46 | let batches = db.scan().filter(filter).collect().await?; 47 | let total_rows: usize = batches.iter().map(|b| b.num_rows()).sum(); 48 | println!( 49 | "Collected {} ERROR entries in {} batches\n", 50 | total_rows, 51 | batches.len() 52 | ); 53 | 54 | // Method 2: stream() - process batches one at a time 55 | // Good for large result sets or when you want to stop early 56 | println!("=== Method 2: stream() ==="); 57 | let filter = Predicate::eq(ColumnRef::new("level"), ScalarValue::from("WARN")); 58 | let mut stream = db.scan().filter(filter).stream().await?; 59 | 60 | let mut batch_count = 0; 61 | let mut row_count = 0; 62 | while let Some(result) = stream.next().await { 63 | let batch = result?; 64 | batch_count += 1; 65 | row_count += batch.num_rows(); 66 | println!(" Batch {}: {} rows", batch_count, batch.num_rows()); 67 | } 68 | println!( 69 | "Streamed {} WARN entries in {} batches\n", 70 | row_count, batch_count 71 | ); 72 | 73 | // Method 3: stream() with early termination 74 | // Process until you find what you need 75 | println!("=== Method 3: stream() with early exit ==="); 76 | let filter = Predicate::eq(ColumnRef::new("level"), ScalarValue::from("INFO")); 77 | let mut stream = db.scan().filter(filter).stream().await?; 78 | 79 | let mut found_count = 0; 80 | let target = 5; 81 | 'outer: while let Some(result) = stream.next().await { 82 | let batch = result?; 83 | for entry in batch.iter_views::<LogEntry>()?.try_flatten()? { 84 | println!(" Found: id={}, message={}", entry.id, entry.message); 85 | found_count += 1; 86 | if found_count >= target { 87 | println!(" (stopping after {} entries)", target); 88 | break 'outer; 89 | } 90 | } 91 | } 92 | 93 | // Method 4: stream() for aggregation without storing all data 94 | println!("\n=== Method 4: stream() for aggregation ==="); 95 | let mut stream = db.scan().stream().await?; 96 | 97 | let mut info_count = 0; 98 | let mut warn_count = 0; 99 | let mut error_count = 0; 100 | 101 | while let Some(result) = stream.next().await { 102 | let batch = result?; 103 | for entry in batch.iter_views::<LogEntry>()?.try_flatten()? { 104 | match entry.level.as_ref() { 105 | "INFO" => info_count += 1, 106 | "WARN" => warn_count += 1, 107 | "ERROR" => error_count += 1, 108 | _ => {} 109 | } 110 | } 111 | } 112 | println!( 113 | "Log level counts: INFO={}, WARN={}, ERROR={}", 114 | info_count, warn_count, error_count 115 | ); 116 | 117 | Ok(()) 118 | } 119 | -------------------------------------------------------------------------------- /predicate/src/core/visitor.rs: -------------------------------------------------------------------------------- 1 | use super::{Predicate, PredicateNode}; 2 | 3 | /// Result produced while evaluating parts of a predicate tree. 
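///
/// A sketch (not compiled) of the three shapes an outcome can take, assuming a boolean
/// visitor value and a `some_predicate: Predicate` built elsewhere:
///
/// ```ignore
/// // Fully evaluated: the visited sub-tree produced a value.
/// let done: VisitOutcome<bool> = VisitOutcome::value(true);
/// // Could not be evaluated here: hand the predicate back as a residual.
/// let push_up: VisitOutcome<bool> = VisitOutcome::residual(some_predicate);
/// // Nothing to report for this sub-tree.
/// let nothing: VisitOutcome<bool> = VisitOutcome::empty();
/// ```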
4 | #[derive(Clone, Debug, Default)] 5 | pub struct VisitOutcome<T> { 6 | /// Computed value for the evaluated portion, when available. 7 | pub value: Option<T>, 8 | /// Residual predicate that still needs evaluation elsewhere. 9 | pub residual: Option<Predicate>, 10 | } 11 | 12 | impl<T> VisitOutcome<T> { 13 | /// Outcome containing only a computed value. 14 | pub fn value(value: T) -> Self { 15 | Self { 16 | value: Some(value), 17 | residual: None, 18 | } 19 | } 20 | 21 | /// Outcome containing only a residual predicate. 22 | pub fn residual(residual: Predicate) -> Self { 23 | Self { 24 | value: None, 25 | residual: Some(residual), 26 | } 27 | } 28 | 29 | /// Outcome without value or residual. 30 | pub fn empty() -> Self { 31 | Self { 32 | value: None, 33 | residual: None, 34 | } 35 | } 36 | } 37 | 38 | /// Visitor that walks predicate trees and emits custom results plus residual predicates. 39 | pub trait PredicateVisitor { 40 | /// Error type used when evaluation fails. 41 | type Error; 42 | /// Concrete value type produced while walking the predicate. 43 | type Value; 44 | 45 | /// Evaluates a leaf predicate and returns its result. 46 | fn visit_leaf( 47 | &mut self, 48 | leaf: &PredicateNode, 49 | ) -> Result<VisitOutcome<Self::Value>, Self::Error>; 50 | 51 | /// Combines the result of a negated child predicate. 52 | fn combine_not( 53 | &mut self, 54 | original: &Predicate, 55 | child: VisitOutcome<Self::Value>, 56 | ) -> Result<VisitOutcome<Self::Value>, Self::Error>; 57 | 58 | /// Combines an `AND` clause from the supplied child results. 59 | fn combine_and( 60 | &mut self, 61 | original: &Predicate, 62 | children: Vec<VisitOutcome<Self::Value>>, 63 | ) -> Result<VisitOutcome<Self::Value>, Self::Error>; 64 | 65 | /// Combines an `OR` clause from the supplied child results. 66 | fn combine_or( 67 | &mut self, 68 | original: &Predicate, 69 | children: Vec<VisitOutcome<Self::Value>>, 70 | ) -> Result<VisitOutcome<Self::Value>, Self::Error>; 71 | 72 | /// Visits the supplied predicate by walking the expression tree. 73 | fn visit_predicate( 74 | &mut self, 75 | predicate: &Predicate, 76 | ) -> Result<VisitOutcome<Self::Value>, Self::Error> { 77 | self.visit_node(predicate.kind(), predicate) 78 | } 79 | 80 | /// Internal helper that evaluates a predicate node recursively. 
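    ///
    /// A sketch (not compiled) of a visitor this dispatch can drive: it counts leaf
    /// nodes and never emits a residual. The `LeafCounter` type is hypothetical.
    ///
    /// ```ignore
    /// struct LeafCounter;
    ///
    /// impl PredicateVisitor for LeafCounter {
    ///     type Error = std::convert::Infallible;
    ///     type Value = usize;
    ///
    ///     fn visit_leaf(
    ///         &mut self,
    ///         _leaf: &PredicateNode,
    ///     ) -> Result<VisitOutcome<usize>, Self::Error> {
    ///         // Every leaf contributes one to the count.
    ///         Ok(VisitOutcome::value(1))
    ///     }
    ///
    ///     fn combine_not(
    ///         &mut self,
    ///         _original: &Predicate,
    ///         child: VisitOutcome<usize>,
    ///     ) -> Result<VisitOutcome<usize>, Self::Error> {
    ///         // Negation does not change the number of leaves underneath.
    ///         Ok(child)
    ///     }
    ///
    ///     fn combine_and(
    ///         &mut self,
    ///         _original: &Predicate,
    ///         children: Vec<VisitOutcome<usize>>,
    ///     ) -> Result<VisitOutcome<usize>, Self::Error> {
    ///         Ok(VisitOutcome::value(children.iter().filter_map(|c| c.value).sum()))
    ///     }
    ///
    ///     fn combine_or(
    ///         &mut self,
    ///         _original: &Predicate,
    ///         children: Vec<VisitOutcome<usize>>,
    ///     ) -> Result<VisitOutcome<usize>, Self::Error> {
    ///         Ok(VisitOutcome::value(children.iter().filter_map(|c| c.value).sum()))
    ///     }
    /// }
    /// ```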
81 | fn visit_node( 82 | &mut self, 83 | node: &PredicateNode, 84 | original: &Predicate, 85 | ) -> Result<VisitOutcome<Self::Value>, Self::Error> { 86 | match node { 87 | PredicateNode::Not(inner) => { 88 | let child = self.visit_predicate(inner)?; 89 | self.combine_not(original, child) 90 | } 91 | PredicateNode::And(clauses) => { 92 | debug_assert!( 93 | !clauses.is_empty(), 94 | "Predicate::make_and enforces at least one clause" 95 | ); 96 | let mut children = Vec::with_capacity(clauses.len()); 97 | for clause in clauses { 98 | children.push(self.visit_predicate(clause)?); 99 | } 100 | self.combine_and(original, children) 101 | } 102 | PredicateNode::Or(clauses) => { 103 | debug_assert!( 104 | !clauses.is_empty(), 105 | "Predicate::make_or enforces at least one clause" 106 | ); 107 | let mut children = Vec::with_capacity(clauses.len()); 108 | for clause in clauses { 109 | children.push(self.visit_predicate(clause)?); 110 | } 111 | self.combine_or(original, children) 112 | } 113 | leaf => { 114 | debug_assert!(leaf.is_leaf(), "non-leaf nodes handled earlier"); 115 | self.visit_leaf(leaf) 116 | } 117 | } 118 | } 119 | } 120 | -------------------------------------------------------------------------------- /src/key/ts.rs: -------------------------------------------------------------------------------- 1 | //! Composite `(key, timestamp)` helpers used by MVCC-aware structures. 2 | 3 | use std::{ 4 | cmp::Ordering, 5 | hash::{Hash, Hasher}, 6 | }; 7 | 8 | use super::{KeyOwned, KeyRow}; 9 | use crate::mvcc::Timestamp; 10 | 11 | /// Borrowed composite key pairing a raw key view with its commit timestamp. 12 | #[derive(Clone, Debug)] 13 | pub struct KeyTsViewRaw { 14 | key: KeyRow, 15 | ts: Timestamp, 16 | } 17 | 18 | impl KeyTsViewRaw { 19 | /// Build a new `(key, timestamp)` view from a key row. 20 | pub(crate) fn new(key: KeyRow, ts: Timestamp) -> Self { 21 | Self { key, ts } 22 | } 23 | 24 | /// Build a raw view derived from an owned key. Safe because the owned key 25 | /// retains the backing buffers. 26 | pub(crate) fn from_owned(key: &KeyOwned, ts: Timestamp) -> Self { 27 | let key_row = 28 | KeyRow::from_owned(key).expect("KeyOwned should only contain supported key components"); 29 | Self::new(key_row, ts) 30 | } 31 | 32 | /// Borrow the key component. 33 | pub(crate) fn key(&self) -> &KeyRow { 34 | &self.key 35 | } 36 | 37 | /// Commit timestamp carried by the entry. 38 | pub(crate) fn timestamp(&self) -> Timestamp { 39 | self.ts 40 | } 41 | 42 | /// Decompose the view. 43 | pub(crate) fn into_parts(self) -> (KeyRow, Timestamp) { 44 | (self.key, self.ts) 45 | } 46 | } 47 | 48 | impl PartialEq for KeyTsViewRaw { 49 | fn eq(&self, other: &Self) -> bool { 50 | self.ts == other.ts && self.key == other.key 51 | } 52 | } 53 | 54 | impl Eq for KeyTsViewRaw {} 55 | 56 | impl PartialOrd for KeyTsViewRaw { 57 | fn partial_cmp(&self, other: &Self) -> Option<Ordering> { 58 | Some(self.cmp(other)) 59 | } 60 | } 61 | 62 | impl Ord for KeyTsViewRaw { 63 | fn cmp(&self, other: &Self) -> Ordering { 64 | match self.key.cmp(&other.key) { 65 | Ordering::Equal => other.ts.cmp(&self.ts), 66 | ordering => ordering, 67 | } 68 | } 69 | } 70 | 71 | impl Hash for KeyTsViewRaw { 72 | fn hash<H: Hasher>(&self, state: &mut H) { 73 | self.key.hash(state); 74 | self.ts.hash(state); 75 | } 76 | } 77 | 78 | /// Owned `(key, timestamp)` pair. 
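///
/// Ordering mirrors [`KeyTsViewRaw`]: keys sort ascending and, for equal keys, newer
/// timestamps sort first. A sketch (crate-internal constructors, not compiled):
///
/// ```ignore
/// let newer = KeyTsOwned::new(KeyOwned::from("a"), Timestamp::new(2));
/// let older = KeyTsOwned::new(KeyOwned::from("a"), Timestamp::new(1));
/// let other = KeyTsOwned::new(KeyOwned::from("b"), Timestamp::new(1));
/// assert!(newer < older); // same key: higher commit timestamp first
/// assert!(older < other); // different keys: plain ascending key order
/// ```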
79 | #[derive(Clone, Debug, PartialEq, Eq, Hash)] 80 | pub struct KeyTsOwned { 81 | key: KeyOwned, 82 | ts: Timestamp, 83 | } 84 | 85 | impl KeyTsOwned { 86 | /// Construct an owned composite key. 87 | pub(crate) fn new(key: KeyOwned, ts: Timestamp) -> Self { 88 | Self { key, ts } 89 | } 90 | 91 | /// Borrow the owned key. 92 | pub(crate) fn key(&self) -> &KeyOwned { 93 | &self.key 94 | } 95 | 96 | /// Commit timestamp carried by the entry. 97 | pub(crate) fn timestamp(&self) -> Timestamp { 98 | self.ts 99 | } 100 | 101 | /// Visit as a borrowed raw view. 102 | pub(crate) fn as_raw_view(&self) -> KeyTsViewRaw { 103 | KeyTsViewRaw::from_owned(&self.key, self.ts) 104 | } 105 | } 106 | 107 | impl PartialOrd for KeyTsOwned { 108 | fn partial_cmp(&self, other: &Self) -> Option<Ordering> { 109 | Some(self.cmp(other)) 110 | } 111 | } 112 | 113 | impl Ord for KeyTsOwned { 114 | fn cmp(&self, other: &Self) -> Ordering { 115 | match self.key.cmp(&other.key) { 116 | Ordering::Equal => other.ts.cmp(&self.ts), 117 | ordering => ordering, 118 | } 119 | } 120 | } 121 | 122 | impl From<(KeyOwned, Timestamp)> for KeyTsOwned { 123 | fn from((key, ts): (KeyOwned, Timestamp)) -> Self { 124 | KeyTsOwned::new(key, ts) 125 | } 126 | } 127 | 128 | #[cfg(test)] 129 | mod tests { 130 | use super::*; 131 | 132 | #[test] 133 | fn view_orders_descending_timestamps_per_key() { 134 | let key = KeyOwned::from("a"); 135 | let v1 = KeyTsViewRaw::from_owned(&key, Timestamp::new(1)); 136 | let v0 = KeyTsViewRaw::from_owned(&key, Timestamp::new(0)); 137 | 138 | assert!(v1 < v0); 139 | } 140 | 141 | #[test] 142 | fn owned_orders_key_then_timestamp() { 143 | let a1 = KeyTsOwned::new(KeyOwned::from("a"), Timestamp::new(1)); 144 | let a0 = KeyTsOwned::new(KeyOwned::from("a"), Timestamp::new(0)); 145 | let b1 = KeyTsOwned::new(KeyOwned::from("b"), Timestamp::new(1)); 146 | 147 | assert!(a1 < a0); 148 | assert!(a0 < b1); 149 | } 150 | } 151 | -------------------------------------------------------------------------------- /src/db/tests/core/flush.rs: -------------------------------------------------------------------------------- 1 | use std::sync::Arc; 2 | 3 | use arrow_array::RecordBatch; 4 | use arrow_schema::{DataType, Field, Schema}; 5 | use fusio::{DynFs, disk::LocalFs, executor::NoopExecutor, mem::fs::InMemoryFs, path::Path}; 6 | use typed_arrow_dyn::{DynCell, DynRow}; 7 | 8 | use crate::{ 9 | db::{DB, DbInner}, 10 | inmem::policy::BatchesThreshold, 11 | mode::DynModeConfig, 12 | mvcc::Timestamp, 13 | ondisk::sstable::{SsTableConfig, SsTableDescriptor, SsTableError, SsTableId}, 14 | test::build_batch, 15 | }; 16 | 17 | #[tokio::test(flavor = "current_thread")] 18 | async fn flush_without_immutables_errors() { 19 | let schema = std::sync::Arc::new(Schema::new(vec![ 20 | Field::new("id", DataType::Utf8, false), 21 | Field::new("v", DataType::Int32, false), 22 | ])); 23 | let config = DynModeConfig::from_key_name(schema.clone(), "id").expect("key name config"); 24 | let executor = Arc::new(NoopExecutor); 25 | let db: DbInner<InMemoryFs, NoopExecutor> = DB::new(config, executor) 26 | .await 27 | .expect("db init") 28 | .into_inner(); 29 | 30 | let fs: Arc<dyn DynFs> = Arc::new(LocalFs {}); 31 | let sstable_cfg = Arc::new(SsTableConfig::new( 32 | schema.clone(), 33 | fs, 34 | Path::from("/tmp/tonbo-flush-test"), 35 | )); 36 | let descriptor = SsTableDescriptor::new(SsTableId::new(1), 0); 37 | 38 | let result = db 39 | .flush_immutables_with_descriptor(sstable_cfg, descriptor.clone()) 40 | .await; 41 | assert!(matches!(result, 
Err(SsTableError::NoImmutableSegments))); 42 | assert_eq!(db.num_immutable_segments(), 0); 43 | } 44 | 45 | #[tokio::test(flavor = "multi_thread", worker_threads = 2)] 46 | async fn flush_publishes_manifest_version() { 47 | let schema = Arc::new(Schema::new(vec![ 48 | Field::new("id", DataType::Utf8, false), 49 | Field::new("v", DataType::Int32, false), 50 | ])); 51 | let extractor = crate::extractor::projection_for_field(schema.clone(), 0).expect("extractor"); 52 | let executor = Arc::new(NoopExecutor); 53 | let config = DynModeConfig::new(schema.clone(), extractor).expect("config"); 54 | let mut db: DbInner<InMemoryFs, NoopExecutor> = DB::new(config, Arc::clone(&executor)) 55 | .await 56 | .expect("db") 57 | .into_inner(); 58 | db.set_seal_policy(Arc::new(BatchesThreshold { batches: 1 })); 59 | 60 | let rows = vec![ 61 | DynRow(vec![Some(DynCell::Str("a".into())), Some(DynCell::I32(1))]), 62 | DynRow(vec![Some(DynCell::Str("b".into())), Some(DynCell::I32(2))]), 63 | ]; 64 | let batch: RecordBatch = build_batch(schema.clone(), rows).expect("valid dyn rows"); 65 | db.ingest(batch).await.expect("ingest triggers seal"); 66 | assert_eq!(db.num_immutable_segments(), 1); 67 | 68 | let fs: Arc<dyn DynFs> = Arc::new(LocalFs {}); 69 | let sstable_cfg = Arc::new(SsTableConfig::new( 70 | schema.clone(), 71 | fs, 72 | Path::from("/tmp/tonbo-flush-ok"), 73 | )); 74 | let descriptor = SsTableDescriptor::new(SsTableId::new(7), 0); 75 | 76 | let table = db 77 | .flush_immutables_with_descriptor(sstable_cfg, descriptor.clone()) 78 | .await 79 | .expect("flush succeeds"); 80 | assert_eq!(db.num_immutable_segments(), 0); 81 | 82 | let snapshot = db 83 | .manifest 84 | .snapshot_latest(db.manifest_table) 85 | .await 86 | .expect("manifest snapshot"); 87 | assert_eq!( 88 | snapshot.head.last_manifest_txn, 89 | Some(Timestamp::new(1)), 90 | "first flush should publish manifest txn 1" 91 | ); 92 | let latest = snapshot 93 | .latest_version 94 | .expect("latest version must exist after flush"); 95 | assert_eq!( 96 | latest.commit_timestamp(), 97 | Timestamp::new(1), 98 | "latest version should reflect manifest txn 1" 99 | ); 100 | assert_eq!(latest.ssts().len(), 1); 101 | assert_eq!(latest.ssts()[0].len(), 1); 102 | let recorded = &latest.ssts()[0][0]; 103 | assert_eq!(recorded.sst_id(), descriptor.id()); 104 | assert!( 105 | recorded.stats().is_some() || table.descriptor().stats().is_none(), 106 | "stats should propagate when available" 107 | ); 108 | assert!( 109 | recorded.wal_segments().is_none(), 110 | "no WAL segments recorded since none were attached" 111 | ); 112 | } 113 | -------------------------------------------------------------------------------- /src/tests_internal/time_travel_e2e.rs: -------------------------------------------------------------------------------- 1 | #![cfg(feature = "tokio")] 2 | 3 | use std::{fs, path::PathBuf, sync::Arc}; 4 | 5 | use arrow_array::{Int32Array, RecordBatch, StringArray}; 6 | use arrow_schema::{DataType, Field}; 7 | use fusio::{disk::LocalFs, executor::tokio::TokioExecutor}; 8 | 9 | use crate::db::{BatchesThreshold, ColumnRef, DB, Predicate}; 10 | 11 | #[path = "common/mod.rs"] 12 | mod common; 13 | use common::config_with_pk; 14 | 15 | fn workspace_temp_dir(prefix: &str) -> PathBuf { 16 | let base = std::env::current_dir().expect("cwd"); 17 | let dir = base.join("target").join("tmp").join(format!( 18 | "{prefix}-{}", 19 | std::time::SystemTime::now() 20 | .duration_since(std::time::UNIX_EPOCH) 21 | .expect("time") 22 | .as_nanos() 23 | )); 24 | 
fs::create_dir_all(&dir).expect("create workspace temp dir"); 25 | dir 26 | } 27 | 28 | fn extract_rows(batches: Vec<RecordBatch>) -> Vec<(String, i32)> { 29 | let mut rows = Vec::new(); 30 | for batch in batches { 31 | let ids = batch 32 | .column(0) 33 | .as_any() 34 | .downcast_ref::<StringArray>() 35 | .expect("id col"); 36 | let vals = batch 37 | .column(1) 38 | .as_any() 39 | .downcast_ref::<Int32Array>() 40 | .expect("v col"); 41 | for (id, v) in ids.iter().zip(vals.iter()) { 42 | if let (Some(id), Some(v)) = (id, v) { 43 | rows.push((id.to_string(), v)); 44 | } 45 | } 46 | } 47 | rows.sort(); 48 | rows 49 | } 50 | 51 | /// Verify snapshot_at can time-travel between manifest versions. 52 | #[tokio::test(flavor = "multi_thread", worker_threads = 2)] 53 | async fn snapshot_at_reads_older_manifest_version() -> Result<(), Box<dyn std::error::Error>> { 54 | let temp_root = workspace_temp_dir("time-travel"); 55 | let root_str = temp_root.to_string_lossy().into_owned(); 56 | 57 | let config = config_with_pk( 58 | vec![ 59 | Field::new("id", DataType::Utf8, false), 60 | Field::new("v", DataType::Int32, false), 61 | ], 62 | &["id"], 63 | ); 64 | let schema = config.schema(); 65 | 66 | let executor = Arc::new(TokioExecutor::default()); 67 | let mut inner = DB::<LocalFs, TokioExecutor>::builder(config) 68 | .on_disk(root_str.clone())? 69 | .with_minor_compaction(1, 0, 1) 70 | .open_with_executor(Arc::clone(&executor)) 71 | .await? 72 | .into_inner(); 73 | inner.set_seal_policy(Arc::new(BatchesThreshold { batches: 1 })); 74 | let db = DB::from_inner(Arc::new(inner)); 75 | 76 | let batch_v1 = RecordBatch::try_new( 77 | schema.clone(), 78 | vec![ 79 | Arc::new(StringArray::from(vec!["v1-a", "v1-b"])) as _, 80 | Arc::new(Int32Array::from(vec![1, 2])) as _, 81 | ], 82 | )?; 83 | db.ingest(batch_v1).await?; 84 | 85 | let batch_v2 = RecordBatch::try_new( 86 | schema.clone(), 87 | vec![ 88 | Arc::new(StringArray::from(vec!["v2-a"])) as _, 89 | Arc::new(Int32Array::from(vec![99])) as _, 90 | ], 91 | )?; 92 | db.ingest(batch_v2).await?; 93 | 94 | let versions = db.list_versions(10).await?; 95 | if versions.len() < 2 { 96 | eprintln!("insufficient manifest versions recorded; skipping time-travel assertion"); 97 | return Ok(()); 98 | } 99 | 100 | let earliest = versions.last().expect("earliest version"); 101 | let snapshot_old = db.snapshot_at(earliest.timestamp).await?; 102 | let predicate = Predicate::is_not_null(ColumnRef::new("id")); 103 | 104 | let old_rows = extract_rows( 105 | snapshot_old 106 | .scan(&db) 107 | .filter(predicate.clone()) 108 | .collect() 109 | .await?, 110 | ); 111 | assert_eq!( 112 | old_rows, 113 | vec![("v1-a".into(), 1), ("v1-b".into(), 2)], 114 | "older snapshot should not see later writes" 115 | ); 116 | 117 | let latest_rows = extract_rows(db.scan().filter(predicate).collect().await?); 118 | assert!( 119 | latest_rows.contains(&("v2-a".into(), 99)), 120 | "latest view should include second batch" 121 | ); 122 | 123 | if let Err(err) = fs::remove_dir_all(&temp_root) { 124 | eprintln!("failed to clean temp dir {:?}: {err}", &temp_root); 125 | } 126 | Ok(()) 127 | } 128 | -------------------------------------------------------------------------------- /src/query/scan.rs: -------------------------------------------------------------------------------- 1 | use std::{collections::BTreeSet, sync::Arc}; 2 | 3 | use arrow_schema::{Schema, SchemaRef}; 4 | use tonbo_predicate::{Operand, Predicate, PredicateNode}; 5 | 6 | use crate::{ 7 | extractor::KeyExtractError, 8 | 
manifest::{SstEntry, TableSnapshot}, 9 | mvcc::Timestamp, 10 | }; 11 | 12 | /// Internal representation of a scan plan. Things included in the plan: 13 | /// * predicate: the caller-supplied predicate used for pruning and residual evaluation 14 | /// * range_set: cached primary-key ranges derived from the predicate for pruning 15 | /// * immutable_memtable_idxes: which immutable memtables need to be scanned in execution phase 16 | /// * ssts: level-ed sstable where entry contains the identifier and its corresponding pruning row 17 | /// set result 18 | /// * limit: the raw limit 19 | /// * read_ts: snapshot/read timestamp 20 | pub(crate) struct ScanPlan { 21 | pub(crate) _predicate: Predicate, 22 | pub(crate) immutable_indexes: Vec<usize>, 23 | pub(crate) residual_predicate: Option<Predicate>, 24 | pub(crate) projected_schema: Option<SchemaRef>, 25 | pub(crate) scan_schema: SchemaRef, 26 | pub(crate) limit: Option<usize>, 27 | pub(crate) read_ts: Timestamp, 28 | 29 | pub(crate) _snapshot: TableSnapshot, 30 | } 31 | 32 | pub(crate) fn projection_with_predicate( 33 | base_schema: &SchemaRef, 34 | projection: &SchemaRef, 35 | predicate: Option<&Predicate>, 36 | ) -> Result<SchemaRef, KeyExtractError> { 37 | let mut required = BTreeSet::new(); 38 | if let Some(pred) = predicate { 39 | collect_predicate_columns(pred, &mut required); 40 | } 41 | extend_projection_schema(base_schema, projection, &required) 42 | } 43 | 44 | fn extend_projection_schema( 45 | base_schema: &SchemaRef, 46 | projection: &SchemaRef, 47 | required: &BTreeSet<Arc<str>>, 48 | ) -> Result<SchemaRef, KeyExtractError> { 49 | if required.is_empty() 50 | || required.iter().all(|name| { 51 | projection 52 | .fields() 53 | .iter() 54 | .any(|field| field.name() == name.as_ref()) 55 | }) 56 | { 57 | return Ok(Arc::clone(projection)); 58 | } 59 | 60 | let mut needed: BTreeSet<Arc<str>> = projection 61 | .fields() 62 | .iter() 63 | .map(|field| Arc::<str>::from(field.name().as_str())) 64 | .collect(); 65 | needed.extend(required.iter().cloned()); 66 | 67 | let mut fields = Vec::new(); 68 | for field in base_schema.fields() { 69 | if needed.remove(field.name().as_str()) { 70 | fields.push(field.clone()); 71 | } 72 | } 73 | 74 | if !needed.is_empty() { 75 | // TODO: add nested-column support once predicates can address nested fields. 76 | let missing = needed.iter().next().expect("missing column present"); 77 | return Err(KeyExtractError::NoSuchField { 78 | name: missing.to_string(), 79 | }); 80 | } 81 | 82 | Ok(Arc::new(Schema::new(fields))) 83 | } 84 | 85 | fn collect_predicate_columns(predicate: &Predicate, out: &mut BTreeSet<Arc<str>>) { 86 | match predicate.kind() { 87 | PredicateNode::True => {} 88 | PredicateNode::Compare { left, right, .. } => { 89 | collect_operand_column(left, out); 90 | collect_operand_column(right, out); 91 | } 92 | PredicateNode::InList { expr, .. } | PredicateNode::IsNull { expr, .. } => { 93 | collect_operand_column(expr, out); 94 | } 95 | PredicateNode::Not(child) => collect_predicate_columns(child, out), 96 | PredicateNode::And(children) | PredicateNode::Or(children) => { 97 | for child in children { 98 | collect_predicate_columns(child, out); 99 | } 100 | } 101 | } 102 | } 103 | 104 | fn collect_operand_column(operand: &Operand, out: &mut BTreeSet<Arc<str>>) { 105 | if let Operand::Column(column) = operand { 106 | out.insert(Arc::clone(&column.name)); 107 | } 108 | } 109 | 110 | impl ScanPlan { 111 | /// Access SST entries from the snapshot, grouped by compaction level. 
112 | /// 113 | /// Returns all SST entries across all levels that should be scanned. 114 | /// Pruning based on key ranges or statistics will be added in future iterations. 115 | pub(crate) fn sst_entries(&self) -> impl Iterator<Item = &SstEntry> { 116 | self._snapshot 117 | .latest_version 118 | .as_ref() 119 | .map(|v| v.ssts()) 120 | .unwrap_or(&[]) 121 | .iter() 122 | .flatten() 123 | } 124 | } 125 | -------------------------------------------------------------------------------- /src/tests_internal/wal_rotation_e2e.rs: -------------------------------------------------------------------------------- 1 | #![cfg(feature = "tokio")] 2 | 3 | use std::{fs, path::PathBuf, sync::Arc}; 4 | 5 | use arrow_array::{Int32Array, RecordBatch, StringArray}; 6 | use arrow_schema::{DataType, Field}; 7 | use fusio::{DynFs, disk::LocalFs, executor::tokio::TokioExecutor, path::Path as FusioPath}; 8 | use serde::Deserialize; 9 | 10 | use crate::{ 11 | db::{BatchesThreshold, WalSyncPolicy}, 12 | test_support::{TestFsWalStateStore, TestWalExt as WalExt}, 13 | }; 14 | 15 | #[path = "common/mod.rs"] 16 | mod common; 17 | use common::config_with_pk; 18 | 19 | fn workspace_temp_dir(prefix: &str) -> PathBuf { 20 | let base = std::env::current_dir().expect("cwd"); 21 | let dir = base.join("target").join("tmp").join(format!( 22 | "{prefix}-{}", 23 | std::time::SystemTime::now() 24 | .duration_since(std::time::UNIX_EPOCH) 25 | .expect("time") 26 | .as_nanos() 27 | )); 28 | fs::create_dir_all(&dir).expect("create workspace temp dir"); 29 | dir 30 | } 31 | 32 | fn wal_cfg_with_backend(wal_dir: &PathBuf) -> crate::db::WalConfig { 33 | use std::time::Duration; 34 | fs::create_dir_all(wal_dir).expect("wal dir"); 35 | let wal_path = FusioPath::from_filesystem_path(wal_dir).expect("wal path"); 36 | let wal_fs = Arc::new(LocalFs {}); 37 | let wal_backend: Arc<dyn DynFs> = wal_fs.clone(); 38 | let wal_state = Arc::new(TestFsWalStateStore::new(wal_fs)); 39 | crate::db::WalConfig::default() 40 | .wal_dir(wal_path) 41 | .segment_backend(wal_backend) 42 | .state_store(Some(wal_state)) 43 | .segment_max_bytes(256) 44 | .flush_interval(Duration::from_millis(1)) 45 | .sync_policy(WalSyncPolicy::Always) 46 | } 47 | 48 | #[derive(Debug, Deserialize)] 49 | struct WalStateDisk { 50 | last_segment_seq: Option<u64>, 51 | } 52 | 53 | /// WAL with sync policy `Always` should rotate segments and persist state before shutdown. 54 | #[tokio::test(flavor = "multi_thread", worker_threads = 2)] 55 | async fn wal_rotation_and_state_persisted() -> Result<(), Box<dyn std::error::Error>> { 56 | let temp_root = workspace_temp_dir("wal-rotation-e2e"); 57 | let root_str = temp_root.to_string_lossy().into_owned(); 58 | 59 | let config = config_with_pk( 60 | vec![ 61 | Field::new("id", DataType::Utf8, false), 62 | Field::new("v", DataType::Int32, false), 63 | ], 64 | &["id"], 65 | ); 66 | let schema = config.schema(); 67 | 68 | let wal_dir = temp_root.join("wal"); 69 | let wal_cfg = wal_cfg_with_backend(&wal_dir); 70 | let executor = Arc::new(TokioExecutor::default()); 71 | 72 | let mut db = crate::db::DB::<LocalFs, TokioExecutor>::builder(config) 73 | .on_disk(root_str.clone())? 74 | .wal_config(wal_cfg.clone()) 75 | .with_minor_compaction(1, 0, 1) 76 | .open_with_executor(Arc::clone(&executor)) 77 | .await? 78 | .into_inner(); 79 | db.set_seal_policy(Arc::new(BatchesThreshold { batches: 1 })); 80 | 81 | // Two batches large enough to force multiple WAL frames and at least one rotation. 
82 | for idx in 0..3 { 83 | let ids: Vec<String> = (0..64).map(|n| format!("user-{idx}-{n:02}")).collect(); 84 | let vals: Vec<i32> = (0..64).map(|n| idx as i32 * 100 + n as i32).collect(); 85 | let batch = RecordBatch::try_new( 86 | schema.clone(), 87 | vec![ 88 | Arc::new(StringArray::from(ids)) as _, 89 | Arc::new(Int32Array::from(vals)) as _, 90 | ], 91 | )?; 92 | db.ingest(batch).await?; 93 | } 94 | 95 | // Ensure writer drains and state is flushed. 96 | db.disable_wal().await?; 97 | 98 | // Expect multiple WAL segment files on disk. 99 | let wal_files: Vec<_> = fs::read_dir(&wal_dir)? 100 | .flatten() 101 | .filter(|entry| { 102 | entry.file_name().to_string_lossy().starts_with("wal-") 103 | && entry.file_name().to_string_lossy().ends_with(".tonwal") 104 | }) 105 | .collect(); 106 | assert!( 107 | wal_files.len() >= 2, 108 | "expected wal rotation to produce multiple segments" 109 | ); 110 | 111 | // State store should record the latest segment sequence to allow recovery. 112 | let state_path = wal_dir.join("state.json"); 113 | let state: WalStateDisk = serde_json::from_slice(&fs::read(&state_path)?)?; 114 | assert!( 115 | state.last_segment_seq.unwrap_or(0) >= 1, 116 | "state should capture last wal segment" 117 | ); 118 | 119 | if let Err(err) = fs::remove_dir_all(&temp_root) { 120 | eprintln!("failed to clean temp dir {:?}: {err}", &temp_root); 121 | } 122 | 123 | Ok(()) 124 | } 125 | -------------------------------------------------------------------------------- /src/wal/manifest_ext.rs: -------------------------------------------------------------------------------- 1 | //! Helpers that bridge WAL metadata into manifest structures. 2 | 3 | use std::sync::Arc; 4 | 5 | use crate::{ 6 | manifest::WalSegmentRef, 7 | wal::{ 8 | WalConfig, WalError, 9 | storage::{SegmentDescriptor, SegmentFrameBounds, WalStorage}, 10 | wal_segment_file_id, 11 | }, 12 | }; 13 | 14 | /// Collect WAL segment references using the configuration supplied to the writer. 15 | pub(crate) async fn collect_wal_segment_refs( 16 | cfg: &WalConfig, 17 | manifest_floor: Option<&WalSegmentRef>, 18 | live_frame_floor: Option<u64>, 19 | ) -> Result<Vec<WalSegmentRef>, WalError> { 20 | let storage = WalStorage::new(Arc::clone(&cfg.segment_backend), cfg.dir.clone()); 21 | let wal_state_hint = storage 22 | .load_state_handle(cfg.state_store.as_ref()) 23 | .await? 24 | .and_then(|handle| handle.state().last_segment_seq); 25 | 26 | let Some(tail) = storage.tail_metadata_with_hint(wal_state_hint).await? else { 27 | return Ok(Vec::new()); 28 | }; 29 | 30 | let mut refs = Vec::with_capacity(2); 31 | let manifest_cutoff = manifest_floor.map(|f| f.seq()); 32 | for descriptor in tail 33 | .completed 34 | .iter() 35 | .filter(|descriptor| descriptor.bytes > 0) 36 | { 37 | let bounds = storage 38 | .segment_frame_bounds(&descriptor.path) 39 | .await? 40 | .ok_or({ 41 | WalError::Corrupt("wal segment contained no frames despite non-zero length") 42 | })?; 43 | if segment_required(descriptor.seq, &bounds, manifest_cutoff, live_frame_floor) { 44 | refs.push(wal_segment_ref_from_descriptor(descriptor, bounds)); 45 | } 46 | } 47 | 48 | if tail.active.bytes > 0 { 49 | let bounds = storage 50 | .segment_frame_bounds(&tail.active.path) 51 | .await? 
52 | .ok_or({ 53 | WalError::Corrupt("active wal segment contained no frames despite non-zero length") 54 | })?; 55 | if segment_required(tail.active.seq, &bounds, manifest_cutoff, live_frame_floor) { 56 | refs.push(wal_segment_ref_from_descriptor(&tail.active, bounds)); 57 | } 58 | } 59 | 60 | // Note: We intentionally don't re-add the old floor when refs is empty. 61 | // If no segments are required (all data persisted), the floor should be cleared. 62 | // This ensures WAL replay doesn't replay already-persisted data. 63 | 64 | refs.sort_by_key(|segment| segment.seq()); 65 | refs.dedup_by_key(|segment| segment.seq()); 66 | 67 | Ok(refs) 68 | } 69 | 70 | fn wal_segment_ref_from_descriptor( 71 | descriptor: &SegmentDescriptor, 72 | bounds: SegmentFrameBounds, 73 | ) -> WalSegmentRef { 74 | let file_id = wal_segment_file_id(descriptor.seq); 75 | WalSegmentRef::new(descriptor.seq, file_id, bounds.first_seq, bounds.last_seq) 76 | } 77 | 78 | fn segment_required( 79 | seq: u64, 80 | bounds: &SegmentFrameBounds, 81 | manifest_cutoff: Option<u64>, 82 | live_frame_floor: Option<u64>, 83 | ) -> bool { 84 | if let Some(live_floor) = live_frame_floor 85 | && bounds.last_seq >= live_floor 86 | { 87 | return true; 88 | } 89 | 90 | match manifest_cutoff { 91 | Some(cutoff) => seq > cutoff, 92 | None => true, 93 | } 94 | } 95 | 96 | #[cfg(test)] 97 | mod tests { 98 | use super::*; 99 | 100 | #[test] 101 | fn manifest_floor_filters_when_unpinned() { 102 | let bounds = SegmentFrameBounds { 103 | first_seq: 10, 104 | last_seq: 20, 105 | }; 106 | assert!(!segment_required(5, &bounds, Some(5), None)); 107 | assert!(segment_required(6, &bounds, Some(5), None)); 108 | } 109 | 110 | #[test] 111 | fn live_floor_keeps_older_segments() { 112 | let bounds = SegmentFrameBounds { 113 | first_seq: 30, 114 | last_seq: 40, 115 | }; 116 | assert!(segment_required(3, &bounds, Some(5), Some(35))); 117 | assert!(!segment_required(3, &bounds, Some(5), Some(45))); 118 | } 119 | } 120 | 121 | /// Prune WAL segments whose sequence is strictly below the manifest floor. 122 | pub(crate) async fn prune_wal_segments( 123 | cfg: &WalConfig, 124 | floor: &WalSegmentRef, 125 | ) -> Result<usize, WalError> { 126 | let storage = WalStorage::new(Arc::clone(&cfg.segment_backend), cfg.dir.clone()); 127 | if cfg.prune_dry_run { 128 | let segments = storage.list_segments_with_hint(None).await?; 129 | let removable = segments 130 | .into_iter() 131 | .filter(|descriptor| descriptor.seq < floor.seq()) 132 | .count(); 133 | Ok(removable) 134 | } else { 135 | storage.prune_below(floor.seq()).await 136 | } 137 | } 138 | -------------------------------------------------------------------------------- /src/mvcc/mod.rs: -------------------------------------------------------------------------------- 1 | //! MVCC timestamps, read visibility, and commit clock helpers. 2 | 3 | use std::fmt; 4 | 5 | use serde::{Deserialize, Serialize}; 6 | 7 | /// Canonical column name storing MVCC commit timestamps alongside Arrow payloads. 8 | pub(crate) const MVCC_COMMIT_COL: &str = "_commit_ts"; 9 | 10 | /// Logical commit timestamp assigned to mutations and read views. 11 | /// 12 | /// Timestamps are monotonically increasing `u64` values that identify 13 | /// each committed version of the database. 
They can be used for: 14 | /// - Point-in-time queries via [`DB::snapshot_at`](crate::db::DB::snapshot_at) 15 | /// - Listing historical versions via [`DB::list_versions`](crate::db::DB::list_versions) 16 | #[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash, Serialize, Deserialize)] 17 | pub struct Timestamp(u64); 18 | 19 | impl Timestamp { 20 | /// Least possible timestamp (used for uninitialised clocks). 21 | pub const MIN: Self = Self(0); 22 | /// Greatest possible timestamp (used for open-ended visibility). 23 | pub const MAX: Self = Self(u64::MAX); 24 | 25 | /// Construct a timestamp from a raw `u64`. 26 | #[inline] 27 | pub const fn new(raw: u64) -> Self { 28 | Self(raw) 29 | } 30 | 31 | /// Returns the raw `u64` value backing this timestamp. 32 | #[inline] 33 | pub const fn get(self) -> u64 { 34 | self.0 35 | } 36 | 37 | /// Returns the next timestamp after `self`, saturating on overflow. 38 | #[inline] 39 | pub(crate) const fn next(self) -> Self { 40 | Self(self.0.saturating_add(1)) 41 | } 42 | 43 | /// Add `delta` while saturating on overflow. 44 | #[inline] 45 | pub(crate) const fn saturating_add(self, delta: u64) -> Self { 46 | Self(self.0.saturating_add(delta)) 47 | } 48 | 49 | /// Subtract `delta` while saturating at [`Timestamp::MIN`]. 50 | #[inline] 51 | pub(crate) const fn saturating_sub(self, delta: u64) -> Self { 52 | Self(self.0.saturating_sub(delta)) 53 | } 54 | } 55 | 56 | impl From<u64> for Timestamp { 57 | fn from(value: u64) -> Self { 58 | Self(value) 59 | } 60 | } 61 | 62 | impl From<Timestamp> for u64 { 63 | fn from(ts: Timestamp) -> Self { 64 | ts.0 65 | } 66 | } 67 | 68 | impl fmt::Debug for Timestamp { 69 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { 70 | f.debug_tuple("Timestamp").field(&self.0).finish() 71 | } 72 | } 73 | 74 | /// Tracks the next commit timestamp to allocate. 75 | use std::sync::atomic::{AtomicU64, Ordering}; 76 | 77 | /// Monotonic allocator for MVCC commit timestamps backed by atomics. 78 | #[derive(Debug)] 79 | pub struct CommitClock { 80 | next: AtomicU64, 81 | } 82 | 83 | impl CommitClock { 84 | /// Create a new clock that will hand out timestamps starting from `start`. 85 | #[inline] 86 | pub(crate) const fn new(start: Timestamp) -> Self { 87 | Self { 88 | next: AtomicU64::new(start.get()), 89 | } 90 | } 91 | 92 | /// Allocate and return the next commit timestamp. 93 | #[inline] 94 | pub(crate) fn alloc(&self) -> Timestamp { 95 | let current = self.next.fetch_add(1, Ordering::Relaxed); 96 | Timestamp::new(current) 97 | } 98 | 99 | /// Return the timestamp that will be handed out next. 100 | #[inline] 101 | pub(crate) fn peek(&self) -> Timestamp { 102 | Timestamp::new(self.next.load(Ordering::Relaxed)) 103 | } 104 | 105 | /// Advance the clock so that it will hand out at least `candidate`. 106 | /// 107 | /// Useful after recovery where the highest observed commit is already known. 108 | #[inline] 109 | pub(crate) fn advance_to_at_least(&self, candidate: Timestamp) { 110 | // Use a loop with compare_exchange to avoid losing larger candidates. 
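        // If another thread advanced the clock concurrently, `compare_exchange` fails
        // and returns the value it observed; the loop re-checks the candidate against
        // that value and stops once the clock is already at or beyond it, so the
        // counter never moves backwards.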
111 | let mut current = self.next.load(Ordering::Relaxed); 112 | while candidate.get() > current { 113 | match self.next.compare_exchange( 114 | current, 115 | candidate.get(), 116 | Ordering::Relaxed, 117 | Ordering::Relaxed, 118 | ) { 119 | Ok(_) => break, 120 | Err(actual) => current = actual, 121 | } 122 | } 123 | } 124 | } 125 | 126 | impl Default for CommitClock { 127 | fn default() -> Self { 128 | Self::new(Timestamp::MIN) 129 | } 130 | } 131 | 132 | /// Immutable view acquired by readers to evaluate MVCC visibility. 133 | #[derive(Debug, Clone, Copy)] 134 | pub struct ReadView { 135 | read_ts: Timestamp, 136 | } 137 | 138 | impl ReadView { 139 | /// Build a read view pinned at `read_ts`. 140 | #[inline] 141 | pub(crate) const fn new(read_ts: Timestamp) -> Self { 142 | Self { read_ts } 143 | } 144 | 145 | /// Commit timestamp visible to the view (inclusive). 146 | #[inline] 147 | pub(crate) const fn read_ts(&self) -> Timestamp { 148 | self.read_ts 149 | } 150 | } 151 | -------------------------------------------------------------------------------- /src/db/tests/wasm_web.rs: -------------------------------------------------------------------------------- 1 | //! Wasm/web integration tests for DB with S3 backend. 2 | 3 | use std::sync::Arc; 4 | 5 | use arrow_array::{Array, Int32Array, RecordBatch, StringArray}; 6 | use arrow_schema::{DataType, Field, Schema}; 7 | use fusio::{executor::web::WebExecutor, impls::remotes::aws::fs::AmazonS3, path::Path}; 8 | use futures::StreamExt; 9 | use js_sys::Date; 10 | use wasm_bindgen_test::{wasm_bindgen_test, wasm_bindgen_test_configure}; 11 | 12 | use super::{AwsCreds, DB, ObjectSpec, S3Spec}; 13 | use crate::{ 14 | inmem::policy::BatchesThreshold, 15 | mvcc::Timestamp, 16 | ondisk::sstable::{SsTableConfig, SsTableDescriptor, SsTableId, SsTableReader}, 17 | schema::SchemaBuilder, 18 | wal::{WalSyncPolicy, frame::WalEvent, replay::Replayer}, 19 | }; 20 | 21 | wasm_bindgen_test_configure!(run_in_browser); 22 | 23 | fn memory_s3_spec(prefix: String) -> S3Spec { 24 | let mut spec = S3Spec::new( 25 | "wasm-mock-bucket", 26 | prefix, 27 | AwsCreds::new("access", "secret"), 28 | ); 29 | spec.endpoint = Some("memory://wasm-web".to_string()); 30 | spec.region = Some("us-east-1".to_string()); 31 | spec 32 | } 33 | 34 | #[wasm_bindgen_test] 35 | async fn web_s3_roundtrip_wal_and_sstable() { 36 | let schema = Arc::new(Schema::new(vec![ 37 | Field::new("id", DataType::Utf8, false), 38 | Field::new("value", DataType::Int32, false), 39 | ])); 40 | let schema_cfg = SchemaBuilder::from_schema(Arc::clone(&schema)) 41 | .primary_key("id") 42 | .build() 43 | .expect("schema config"); 44 | 45 | let now_ms = Date::now() as u128; 46 | let prefix = format!("wasm-web-smoke-{now_ms}"); 47 | let s3_spec = memory_s3_spec(prefix.clone()); 48 | 49 | let exec = Arc::new(WebExecutor::new()); 50 | let mut db: DB<AmazonS3, WebExecutor> = DB::<AmazonS3, WebExecutor>::builder(schema_cfg) 51 | .object_store(ObjectSpec::s3(s3_spec)) 52 | .expect("object_store config") 53 | .wal_sync_policy(WalSyncPolicy::Always) 54 | .open_with_executor(Arc::clone(&exec)) 55 | .await 56 | .expect("build web db"); 57 | 58 | // Seal after every batch so immutables are flushed deterministically in tests. 
59 | db.set_seal_policy(Arc::new(BatchesThreshold { batches: 1 })); 60 | 61 | let batch = RecordBatch::try_new( 62 | Arc::clone(&schema), 63 | vec![ 64 | Arc::new(StringArray::from(vec!["alpha", "beta"])) as _, 65 | Arc::new(Int32Array::from(vec![1, 2])) as _, 66 | ], 67 | ) 68 | .expect("batch"); 69 | 70 | db.ingest(batch.clone()).await.expect("ingest"); 71 | 72 | let wal_cfg = db.wal_config().cloned().expect("wal config present"); 73 | let mut wal_events = Replayer::new(wal_cfg.clone()) 74 | .scan() 75 | .await 76 | .expect("wal replay"); 77 | assert!( 78 | wal_events 79 | .iter() 80 | .any(|event| matches!(event, WalEvent::DynAppend { .. })), 81 | "wal append should be visible" 82 | ); 83 | 84 | // Flush immutables into an SST and read it back through the Parquet reader. 85 | let sst_root = Path::parse(format!("{}/sst", prefix)).expect("sst root path"); 86 | let sst_cfg = Arc::new(SsTableConfig::new( 87 | Arc::clone(&schema), 88 | Arc::clone(&wal_cfg.segment_backend), 89 | sst_root, 90 | )); 91 | let descriptor = SsTableDescriptor::new(SsTableId::new(1), 0); 92 | let sstable = db 93 | .flush_immutables_with_descriptor(Arc::clone(&sst_cfg), descriptor) 94 | .await 95 | .expect("flush to sst"); 96 | 97 | let reader = SsTableReader::open(Arc::clone(&sst_cfg), sstable.descriptor().clone()) 98 | .await 99 | .expect("open sstable reader"); 100 | 101 | let mut stream = reader 102 | .into_stream(Timestamp::MAX, None, Arc::clone(&exec)) 103 | .await 104 | .expect("open stream"); 105 | 106 | let mut rows = Vec::new(); 107 | while let Some(batch) = stream.next().await { 108 | let batch = batch.expect("stream batch"); 109 | assert!(batch.delete.is_none(), "no deletes expected"); 110 | if batch.data.num_rows() == 0 { 111 | continue; 112 | } 113 | let ids = batch 114 | .data 115 | .column(0) 116 | .as_any() 117 | .downcast_ref::<StringArray>() 118 | .expect("string ids"); 119 | let values = batch 120 | .data 121 | .column(1) 122 | .as_any() 123 | .downcast_ref::<Int32Array>() 124 | .expect("int values"); 125 | for idx in 0..ids.len() { 126 | rows.push((ids.value(idx).to_string(), values.value(idx))); 127 | } 128 | } 129 | 130 | rows.sort_by(|a, b| a.0.cmp(&b.0)); 131 | assert_eq!( 132 | rows, 133 | vec![("alpha".to_string(), 1), ("beta".to_string(), 2)] 134 | ); 135 | assert!( 136 | wal_events 137 | .drain(..) 138 | .any(|event| matches!(event, WalEvent::TxnCommit { .. })), 139 | "wal should contain commit" 140 | ); 141 | } 142 | -------------------------------------------------------------------------------- /src/mode/dyn_config.rs: -------------------------------------------------------------------------------- 1 | use arrow_schema::SchemaRef; 2 | 3 | use super::DynModeConfig; 4 | use crate::extractor::{self, KeyExtractError}; 5 | 6 | impl DynModeConfig { 7 | /// Build a config from a key column index within `schema`. 8 | pub fn from_key_col(schema: SchemaRef, key_col: usize) -> Result<Self, KeyExtractError> { 9 | let fields = schema.fields(); 10 | if key_col >= fields.len() { 11 | return Err(KeyExtractError::ColumnOutOfBounds(key_col, fields.len())); 12 | } 13 | let extractor = extractor::projection_for_field(schema.clone(), key_col)?; 14 | Self::new(schema, extractor) 15 | } 16 | 17 | /// Build a config from a key field name within `schema`. 
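    ///
    /// A sketch (not compiled, assuming the usual `arrow_schema` imports) that keys a
    /// two-column table on its `id` column, mirroring the internal flush tests:
    ///
    /// ```ignore
    /// let schema = Arc::new(Schema::new(vec![
    ///     Field::new("id", DataType::Utf8, false),
    ///     Field::new("v", DataType::Int32, false),
    /// ]));
    /// let config = DynModeConfig::from_key_name(schema, "id")?;
    /// ```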
18 | pub fn from_key_name(schema: SchemaRef, key_field: &str) -> Result<Self, KeyExtractError> { 19 | let fields = schema.fields(); 20 | let Some((idx, _)) = fields 21 | .iter() 22 | .enumerate() 23 | .find(|(_, f)| f.name() == key_field) 24 | else { 25 | return Err(KeyExtractError::NoSuchField { 26 | name: key_field.to_string(), 27 | }); 28 | }; 29 | Self::from_key_col(schema, idx) 30 | } 31 | 32 | /// Build a config from schema metadata (`tonbo.key` markers). 33 | pub fn from_metadata(schema: SchemaRef) -> Result<Self, KeyExtractError> { 34 | use std::collections::HashMap; 35 | 36 | fn is_truthy(s: &str) -> bool { 37 | matches!(s, "true" | "TRUE" | "True" | "yes" | "YES" | "Yes") 38 | } 39 | fn parse_names_list(s: &str) -> Vec<String> { 40 | let t = s.trim(); 41 | if t.starts_with('[') && t.ends_with(']') { 42 | let inner = &t[1..t.len() - 1]; 43 | inner 44 | .split(',') 45 | .map(|p| p.trim().trim_matches('"').to_string()) 46 | .filter(|p| !p.is_empty()) 47 | .collect() 48 | } else { 49 | vec![t.trim_matches('"').to_string()] 50 | } 51 | } 52 | 53 | let fields = schema.fields(); 54 | 55 | // 1) Field-level markers: collect (ord, idx) for any field with tonbo.key 56 | let mut marks: Vec<(Option<u32>, usize)> = Vec::new(); 57 | for (i, f) in fields.iter().enumerate() { 58 | let md: &HashMap<String, String> = f.metadata(); 59 | if let Some(v) = md.get("tonbo.key") { 60 | let v = v.trim(); 61 | if let Ok(ord) = v.parse::<u32>() { 62 | marks.push((Some(ord), i)); 63 | } else if is_truthy(v) { 64 | marks.push((None, i)); 65 | } 66 | } 67 | } 68 | if !marks.is_empty() { 69 | if marks.len() == 1 { 70 | let idx = marks[0].1; 71 | return Self::from_key_col(schema, idx); 72 | } 73 | if marks.iter().any(|(o, _)| o.is_none()) { 74 | return Err(KeyExtractError::NoSuchField { 75 | name: "multiple tonbo.key markers require numeric ordinals".to_string(), 76 | }); 77 | } 78 | let mut ordered: Vec<(u32, usize)> = marks 79 | .into_iter() 80 | .filter_map(|(ord, idx)| ord.map(|o| (o, idx))) 81 | .collect(); 82 | ordered.sort_by_key(|(ord, _)| *ord); 83 | let indices: Vec<usize> = ordered.into_iter().map(|(_, idx)| idx).collect(); 84 | let extractor = extractor::projection_for_columns(schema.clone(), indices)?; 85 | return Self::new(schema, extractor); 86 | } 87 | 88 | // 2) Schema-level fallback: tonbo.keys = "name" | "[\"a\",\"b\"]" 89 | let smd: &HashMap<String, String> = schema.metadata(); 90 | if let Some(namev) = smd.get("tonbo.keys") { 91 | let names = parse_names_list(namev); 92 | if names.is_empty() { 93 | return Err(KeyExtractError::NoSuchField { 94 | name: "tonbo.keys[]".to_string(), 95 | }); 96 | } 97 | if names.len() == 1 { 98 | return Self::from_key_name(schema, &names[0]); 99 | } 100 | let mut indices: Vec<usize> = Vec::with_capacity(names.len()); 101 | for n in names.iter() { 102 | let Some((idx, _)) = fields.iter().enumerate().find(|(_, f)| f.name() == n) else { 103 | return Err(KeyExtractError::NoSuchField { name: n.clone() }); 104 | }; 105 | indices.push(idx); 106 | } 107 | let extractor = extractor::projection_for_columns(schema.clone(), indices)?; 108 | return Self::new(schema, extractor); 109 | } 110 | 111 | Err(KeyExtractError::NoSuchField { 112 | name: "<tonbo.key|tonbo.keys>".to_string(), 113 | }) 114 | } 115 | 116 | /// Build a config by first inspecting metadata and bubbling a descriptive error when absent. 
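    ///
    /// A sketch (not compiled, assuming the usual `arrow_schema` imports) of the
    /// single-key, field-level marker form; composite keys use numeric ordinals such
    /// as "0" and "1" in place of "true":
    ///
    /// ```ignore
    /// let id = Field::new("id", DataType::Utf8, false).with_metadata(
    ///     std::collections::HashMap::from([("tonbo.key".to_string(), "true".to_string())]),
    /// );
    /// let schema = Arc::new(Schema::new(vec![id, Field::new("v", DataType::Int32, false)]));
    /// let config = DynModeConfig::from_schema(schema)?;
    /// ```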
117 | pub fn from_schema(schema: SchemaRef) -> Result<Self, KeyExtractError> { 118 | Self::from_metadata(schema) 119 | } 120 | } 121 | -------------------------------------------------------------------------------- /examples/06_composite_key.rs: -------------------------------------------------------------------------------- 1 | //! Composite Keys: multi-column primary keys for time-series and partitioned data 2 | //! 3 | //! Run: cargo run --example 06_composite_key 4 | 5 | use tonbo::prelude::*; 6 | 7 | // Define schema with composite key: (device_id, timestamp) 8 | // Use ordinal values in metadata for composite key ordering 9 | #[derive(Record)] 10 | struct SensorReading { 11 | #[metadata(k = "tonbo.key", v = "0")] 12 | device_id: String, 13 | #[metadata(k = "tonbo.key", v = "1")] 14 | timestamp: i64, 15 | temperature: Option<f64>, 16 | humidity: Option<f64>, 17 | } 18 | 19 | #[tokio::main] 20 | async fn main() -> Result<(), Box<dyn std::error::Error>> { 21 | // Create DB with composite key detected from schema metadata 22 | let db = DbBuilder::from_schema(SensorReading::schema())? 23 | .on_disk("/tmp/tonbo_composite_key")? 24 | .open() 25 | .await?; 26 | 27 | // Insert time-series data 28 | let readings = vec![ 29 | SensorReading { 30 | device_id: "sensor-1".into(), 31 | timestamp: 1000, 32 | temperature: Some(22.5), 33 | humidity: Some(45.0), 34 | }, 35 | SensorReading { 36 | device_id: "sensor-1".into(), 37 | timestamp: 2000, 38 | temperature: Some(23.0), 39 | humidity: Some(46.0), 40 | }, 41 | SensorReading { 42 | device_id: "sensor-1".into(), 43 | timestamp: 3000, 44 | temperature: Some(22.8), 45 | humidity: Some(44.5), 46 | }, 47 | SensorReading { 48 | device_id: "sensor-2".into(), 49 | timestamp: 1500, 50 | temperature: Some(25.0), 51 | humidity: Some(50.0), 52 | }, 53 | SensorReading { 54 | device_id: "sensor-2".into(), 55 | timestamp: 2500, 56 | temperature: Some(25.5), 57 | humidity: Some(51.0), 58 | }, 59 | ]; 60 | 61 | let mut builders = SensorReading::new_builders(readings.len()); 62 | builders.append_rows(readings); 63 | db.ingest(builders.finish().into_record_batch()).await?; 64 | println!("Inserted 5 sensor readings with composite key (device_id, timestamp)"); 65 | 66 | // Query all data - results are ordered by composite key 67 | println!("\nAll readings (ordered by device_id, then timestamp):"); 68 | let batches = db.scan().collect().await?; 69 | print_readings(&batches)?; 70 | 71 | // Filter by first key component: device_id = 'sensor-1' 72 | println!("\nReadings for sensor-1:"); 73 | let filter = Predicate::eq(ColumnRef::new("device_id"), ScalarValue::from("sensor-1")); 74 | let batches = db.scan().filter(filter).collect().await?; 75 | print_readings(&batches)?; 76 | 77 | // Filter by second key component: timestamp > 2000 78 | println!("\nReadings after timestamp 2000:"); 79 | let filter = Predicate::gt(ColumnRef::new("timestamp"), ScalarValue::from(2000_i64)); 80 | let batches = db.scan().filter(filter).collect().await?; 81 | print_readings(&batches)?; 82 | 83 | // Combined filter on both key components 84 | println!("\nSensor-1 readings after timestamp 1500:"); 85 | let filter = Predicate::and(vec![ 86 | Predicate::eq(ColumnRef::new("device_id"), ScalarValue::from("sensor-1")), 87 | Predicate::gt(ColumnRef::new("timestamp"), ScalarValue::from(1500_i64)), 88 | ]); 89 | let batches = db.scan().filter(filter).collect().await?; 90 | print_readings(&batches)?; 91 | 92 | // Upsert: update existing key, insert new key 93 | println!("\nUpserting (sensor-1, 2000) and new 
(sensor-3, 1000):"); 94 | let mut tx = db.begin_transaction().await?; 95 | 96 | let updates = vec![ 97 | SensorReading { 98 | device_id: "sensor-1".into(), 99 | timestamp: 2000, 100 | temperature: Some(99.9), 101 | humidity: Some(99.9), 102 | }, 103 | SensorReading { 104 | device_id: "sensor-3".into(), 105 | timestamp: 1000, 106 | temperature: Some(20.0), 107 | humidity: Some(40.0), 108 | }, 109 | ]; 110 | let mut builders = SensorReading::new_builders(updates.len()); 111 | builders.append_rows(updates); 112 | tx.upsert_batch(&builders.finish().into_record_batch())?; 113 | tx.commit().await?; 114 | 115 | let batches = db.scan().collect().await?; 116 | print_readings(&batches)?; 117 | 118 | Ok(()) 119 | } 120 | 121 | fn print_readings(batches: &[arrow_array::RecordBatch]) -> Result<(), Box<dyn std::error::Error>> { 122 | for batch in batches { 123 | for r in batch.iter_views::<SensorReading>()?.try_flatten()? { 124 | let temp = r 125 | .temperature 126 | .map(|t| format!("{:.1}", t)) 127 | .unwrap_or("N/A".into()); 128 | let hum = r 129 | .humidity 130 | .map(|h| format!("{:.1}", h)) 131 | .unwrap_or("N/A".into()); 132 | println!( 133 | " ({}, {}) -> temp={}, humidity={}", 134 | r.device_id, r.timestamp, temp, hum 135 | ); 136 | } 137 | } 138 | Ok(()) 139 | } 140 | -------------------------------------------------------------------------------- /src/compaction/minor.rs: -------------------------------------------------------------------------------- 1 | //! Naïve minor-compaction driver for flushing immutable memtables. 2 | 3 | use std::sync::{ 4 | Arc, 5 | atomic::{AtomicU64, Ordering}, 6 | }; 7 | 8 | use fusio::executor::{Executor, Timer}; 9 | 10 | use crate::{ 11 | db::DbInner, 12 | manifest::ManifestFs, 13 | ondisk::sstable::{SsTable, SsTableConfig, SsTableDescriptor, SsTableError, SsTableId}, 14 | }; 15 | 16 | /// Naïve minor-compaction driver that flushes once a segment threshold is hit. 17 | pub(crate) struct MinorCompactor { 18 | segment_threshold: usize, 19 | target_level: usize, 20 | 21 | next_id: AtomicU64, 22 | } 23 | 24 | impl MinorCompactor { 25 | /// Build a compactor that flushes after `segment_threshold` immutable runs. 26 | pub(crate) fn new(segment_threshold: usize, target_level: usize, start_id: u64) -> Self { 27 | Self { 28 | segment_threshold: segment_threshold.max(1), 29 | target_level, 30 | next_id: AtomicU64::new(start_id), 31 | } 32 | } 33 | 34 | fn next_descriptor(&self) -> SsTableDescriptor { 35 | let id = self.next_id.fetch_add(1, Ordering::Relaxed); 36 | SsTableDescriptor::new(SsTableId::new(id), self.target_level) 37 | } 38 | 39 | /// Flush immutables when the threshold is met, returning the new SST on success. 
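    ///
    /// A sketch (not compiled) mirroring the unit tests below: with a threshold of one
    /// segment the compactor flushes as soon as any immutable memtable exists, writing
    /// level-0 SSTs whose ids start at 9. The `db` and `sst_cfg` bindings are assumed
    /// to come from the surrounding setup.
    ///
    /// ```ignore
    /// let compactor = MinorCompactor::new(1, 0, 9);
    /// if let Some(sstable) = compactor.maybe_compact(&db, Arc::clone(&sst_cfg)).await? {
    ///     // One or more immutable memtables were flushed into `sstable`.
    /// }
    /// ```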
40 | pub(crate) async fn maybe_compact<FS, E>( 41 | &self, 42 | db: &DbInner<FS, E>, 43 | config: Arc<SsTableConfig>, 44 | ) -> Result<Option<SsTable>, SsTableError> 45 | where 46 | FS: ManifestFs<E>, 47 | E: Executor + Timer + Clone, 48 | <FS as fusio::fs::Fs>::File: fusio::durability::FileCommit, 49 | { 50 | if db.num_immutable_segments() < self.segment_threshold { 51 | return Ok(None); 52 | } 53 | let descriptor = self.next_descriptor(); 54 | db.flush_immutables_with_descriptor(config, descriptor) 55 | .await 56 | .map(Some) 57 | } 58 | } 59 | 60 | #[cfg(all(test, feature = "tokio"))] 61 | mod tests { 62 | use std::sync::Arc; 63 | 64 | use arrow_schema::{DataType, Field, Schema}; 65 | use fusio::{ 66 | disk::LocalFs, dynamic::DynFs, executor::NoopExecutor, mem::fs::InMemoryFs, path::Path, 67 | }; 68 | use typed_arrow_dyn::{DynCell, DynRow}; 69 | 70 | use super::MinorCompactor; 71 | use crate::{ 72 | db::{DB, DbInner}, 73 | ondisk::sstable::SsTableConfig, 74 | test::build_batch, 75 | }; 76 | 77 | async fn build_db() -> (Arc<SsTableConfig>, DbInner<InMemoryFs, NoopExecutor>) { 78 | let schema = std::sync::Arc::new(Schema::new(vec![ 79 | Field::new("id", DataType::Utf8, false), 80 | Field::new("v", DataType::Int32, false), 81 | ])); 82 | let config = crate::schema::SchemaBuilder::from_schema(schema) 83 | .primary_key("id") 84 | .with_metadata() 85 | .build() 86 | .expect("key field"); 87 | let schema = Arc::clone(&config.schema); 88 | let executor = Arc::new(NoopExecutor); 89 | let db = DB::<InMemoryFs, NoopExecutor>::builder(config) 90 | .in_memory("compaction-test") 91 | .expect("in_memory config") 92 | .open_with_executor(Arc::clone(&executor)) 93 | .await 94 | .expect("db init") 95 | .into_inner(); 96 | 97 | let fs: Arc<dyn DynFs> = Arc::new(LocalFs {}); 98 | let cfg = Arc::new(SsTableConfig::new( 99 | schema.clone(), 100 | fs, 101 | Path::from("/tmp/tonbo-compaction-test"), 102 | )); 103 | (cfg, db) 104 | } 105 | 106 | #[tokio::test(flavor = "multi_thread")] 107 | async fn below_threshold_noop() { 108 | let (cfg, db) = build_db().await; 109 | let compactor = MinorCompactor::new(2, 0, 7); 110 | let result = compactor.maybe_compact(&db, cfg).await; 111 | assert!(matches!(result, Ok(None))); 112 | assert_eq!(db.num_immutable_segments(), 0); 113 | } 114 | 115 | #[tokio::test(flavor = "multi_thread")] 116 | async fn threshold_met_invokes_flush() { 117 | let (cfg, mut db) = build_db().await; 118 | db.set_seal_policy(Arc::new(crate::inmem::policy::BatchesThreshold { 119 | batches: 1, 120 | })); 121 | let rows = vec![DynRow(vec![ 122 | Some(DynCell::Str("k".into())), 123 | Some(DynCell::I32(1)), 124 | ])]; 125 | let batch = build_batch(cfg.schema().clone(), rows).expect("batch"); 126 | db.ingest(batch).await.expect("ingest"); 127 | assert_eq!(db.num_immutable_segments(), 1); 128 | 129 | let compactor = MinorCompactor::new(1, 0, 9); 130 | let table = compactor 131 | .maybe_compact(&db, cfg) 132 | .await 133 | .expect("flush result") 134 | .expect("sstable"); 135 | assert_eq!(db.num_immutable_segments(), 0); 136 | let descriptor = table.descriptor(); 137 | assert_eq!(descriptor.id().raw(), 9); 138 | assert_eq!(descriptor.level(), 0); 139 | assert_eq!(descriptor.stats().map(|s| s.rows), Some(1)); 140 | let stats = descriptor.stats().expect("descriptor stats"); 141 | assert_eq!(stats.rows, 1); 142 | assert!(stats.bytes > 0); 143 | } 144 | } 145 | -------------------------------------------------------------------------------- /src/mode/mod.rs: 
-------------------------------------------------------------------------------- 1 | //! Dynamic mode configuration and tuning for the canonical layout. 2 | //! 3 | //! Tonbo previously abstracted storage layouts behind a `Mode` trait so that 4 | //! multiple implementations could plug into the same `DB` surface. We are 5 | //! committing to the dynamic, Arrow `RecordBatch` layout as the sole runtime 6 | //! representation, so this module now only contains the configuration and 7 | //! helper utilities needed to build that layout. 8 | 9 | use std::sync::Arc; 10 | 11 | use arrow_schema::{DataType, Field, Schema, SchemaRef}; 12 | use serde_json::Value; 13 | use sha2::{Digest, Sha256}; 14 | 15 | use crate::{ 16 | extractor::{KeyExtractError, KeyProjection, projection_for_columns}, 17 | inmem::{immutable::memtable::MVCC_COMMIT_COL, mutable::DynMem}, 18 | manifest::TableDefinition, 19 | transaction::CommitAckMode, 20 | }; 21 | 22 | mod dyn_config; 23 | 24 | /// Configuration bundle for constructing a `DynMode`. 25 | pub struct DynModeConfig { 26 | /// Arrow schema describing the dynamic table. 27 | pub(crate) schema: SchemaRef, 28 | /// Extractor used to derive logical keys from dynamic batches. 29 | pub(crate) extractor: Arc<dyn KeyProjection>, 30 | /// WAL acknowledgement mode for transactional commits. 31 | pub(crate) commit_ack_mode: CommitAckMode, 32 | } 33 | 34 | impl DynModeConfig { 35 | /// Validate the extractor against `schema` and construct the config bundle. 36 | pub(crate) fn new( 37 | schema: SchemaRef, 38 | extractor: Box<dyn KeyProjection>, 39 | ) -> Result<Self, KeyExtractError> { 40 | extractor.validate_schema(&schema)?; 41 | let extractor: Arc<dyn KeyProjection> = extractor.into(); 42 | Ok(Self { 43 | schema, 44 | extractor, 45 | commit_ack_mode: CommitAckMode::default(), 46 | }) 47 | } 48 | 49 | /// Override the commit acknowledgement mode for transactional writes. 50 | pub fn with_commit_ack_mode(mut self, mode: CommitAckMode) -> Self { 51 | self.commit_ack_mode = mode; 52 | self 53 | } 54 | 55 | /// Clone the schema associated with this configuration. 56 | pub fn schema(&self) -> SchemaRef { 57 | Arc::clone(&self.schema) 58 | } 59 | 60 | /// Build the dynamic storage parameters and mutable memtable backing the DB. 61 | /// 62 | /// Returns `(schema, delete_schema, commit_ack_mode, mutable)`. 63 | pub(crate) fn build( 64 | self, 65 | ) -> Result<(SchemaRef, SchemaRef, CommitAckMode, DynMem), KeyExtractError> { 66 | let DynModeConfig { 67 | schema, 68 | extractor, 69 | commit_ack_mode, 70 | } = self; 71 | extractor.validate_schema(&schema)?; 72 | let key_schema = extractor.key_schema(); 73 | let delete_schema = build_delete_schema(&key_schema); 74 | let key_columns = key_schema.fields().len(); 75 | let delete_projection = 76 | projection_for_columns(delete_schema.clone(), (0..key_columns).collect())?; 77 | let delete_projection: Arc<dyn KeyProjection> = delete_projection.into(); 78 | 79 | let mutable = DynMem::new(schema.clone(), extractor, delete_projection); 80 | Ok((schema, delete_schema, commit_ack_mode, mutable)) 81 | } 82 | } 83 | 84 | fn build_delete_schema(key_schema: &SchemaRef) -> SchemaRef { 85 | let mut fields = key_schema 86 | .fields() 87 | .iter() 88 | .map(|field| field.as_ref().clone()) 89 | .collect::<Vec<Field>>(); 90 | fields.push(Field::new(MVCC_COMMIT_COL, DataType::UInt64, false)); 91 | std::sync::Arc::new(Schema::new(fields)) 92 | } 93 | 94 | /// Derive the table definition used when registering a table in the manifest. 
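/// 
/// The resulting definition carries the table name, a SHA-256 fingerprint of the canonicalised
/// Arrow schema JSON, and the primary-key column names; `retention` defaults to `None` and
/// `schema_version` starts at 0 (see the construction below).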
95 | pub(crate) fn table_definition(config: &DynModeConfig, table_name: &str) -> TableDefinition { 96 | let key_columns = config 97 | .extractor 98 | .key_indices() 99 | .iter() 100 | .map(|idx| config.schema.field(*idx).name().clone()) 101 | .collect(); 102 | TableDefinition { 103 | name: table_name.to_string(), 104 | schema_fingerprint: fingerprint_schema(&config.schema), 105 | primary_key_columns: key_columns, 106 | retention: None, 107 | schema_version: 0, 108 | } 109 | } 110 | 111 | fn fingerprint_schema(schema: &SchemaRef) -> String { 112 | let mut hasher = Sha256::new(); 113 | let value = 114 | serde_json::to_value(schema.as_ref()).expect("arrow schema serialization should not fail"); 115 | let canonical = canonicalize_json(value); 116 | let bytes = 117 | serde_json::to_vec(&canonical).expect("canonical schema serialization should not fail"); 118 | hasher.update(bytes); 119 | format!("{:x}", hasher.finalize()) 120 | } 121 | 122 | fn canonicalize_json(value: Value) -> Value { 123 | match value { 124 | Value::Object(map) => { 125 | let mut entries: Vec<_> = map.into_iter().collect(); 126 | entries.sort_by(|a, b| a.0.cmp(&b.0)); 127 | let sorted = entries 128 | .into_iter() 129 | .map(|(key, value)| (key, canonicalize_json(value))) 130 | .collect(); 131 | Value::Object(sorted) 132 | } 133 | Value::Array(items) => Value::Array(items.into_iter().map(canonicalize_json).collect()), 134 | other => other, 135 | } 136 | } 137 | -------------------------------------------------------------------------------- /tests/s3_smoke.rs: -------------------------------------------------------------------------------- 1 | #![cfg(feature = "s3-smoke")] 2 | //! Integration smoke test that exercises the S3 object-store plumbing against a 3 | //! live endpoint. Enable via `cargo test --features s3-smoke --test s3_smoke` 4 | //! (requires the TONBO_S3_* environment variables). 
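//! 
//! The test reads its configuration from these variables and skips itself when a required one is
//! missing:
//! 
//! - `TONBO_S3_ENDPOINT`, `TONBO_S3_BUCKET`, `TONBO_S3_REGION`: target endpoint and bucket
//! - `TONBO_S3_ACCESS_KEY`, `TONBO_S3_SECRET_KEY`: credentials
//! - `TONBO_S3_SESSION_TOKEN`: optional session token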
5 | 6 | use std::{ 7 | sync::Arc, 8 | time::{SystemTime, UNIX_EPOCH}, 9 | }; 10 | 11 | use arrow_array::{Array, Int32Array, RecordBatch, StringArray}; 12 | use arrow_schema::{DataType, Field}; 13 | use fusio::{executor::tokio::TokioExecutor, impls::remotes::aws::fs::AmazonS3}; 14 | use tonbo::db::{AwsCreds, DB, ObjectSpec, S3Spec, WalSyncPolicy}; 15 | 16 | #[path = "common/mod.rs"] 17 | mod common; 18 | 19 | use common::config_with_pk; 20 | 21 | #[tokio::test(flavor = "multi_thread", worker_threads = 2)] 22 | async fn s3_smoke() -> Result<(), Box<dyn std::error::Error>> { 23 | let endpoint = match std::env::var("TONBO_S3_ENDPOINT") { 24 | Ok(v) => v, 25 | Err(_) => { 26 | eprintln!("skipping s3_smoke – TONBO_S3_ENDPOINT missing"); 27 | return Ok(()); 28 | } 29 | }; 30 | let bucket = match std::env::var("TONBO_S3_BUCKET") { 31 | Ok(v) => v, 32 | Err(_) => { 33 | eprintln!("skipping s3_smoke – TONBO_S3_BUCKET missing"); 34 | return Ok(()); 35 | } 36 | }; 37 | let region = match std::env::var("TONBO_S3_REGION") { 38 | Ok(v) => v, 39 | Err(_) => { 40 | eprintln!("skipping s3_smoke – TONBO_S3_REGION missing"); 41 | return Ok(()); 42 | } 43 | }; 44 | let access = match std::env::var("TONBO_S3_ACCESS_KEY") { 45 | Ok(v) => v, 46 | Err(_) => { 47 | eprintln!("skipping s3_smoke – TONBO_S3_ACCESS_KEY missing"); 48 | return Ok(()); 49 | } 50 | }; 51 | let secret = match std::env::var("TONBO_S3_SECRET_KEY") { 52 | Ok(v) => v, 53 | Err(_) => { 54 | eprintln!("skipping s3_smoke – TONBO_S3_SECRET_KEY missing"); 55 | return Ok(()); 56 | } 57 | }; 58 | let session_token = std::env::var("TONBO_S3_SESSION_TOKEN").ok(); 59 | 60 | let config = config_with_pk( 61 | vec![ 62 | Field::new("id", DataType::Utf8, false), 63 | Field::new("value", DataType::Int32, false), 64 | ], 65 | &["id"], 66 | ); 67 | let schema = config.schema(); 68 | 69 | let label_millis = SystemTime::now() 70 | .duration_since(UNIX_EPOCH) 71 | .map_err(|err| format!("system clock before unix epoch: {err}"))? 72 | .as_millis(); 73 | let label = format!("smoke-{label_millis}"); 74 | 75 | let credentials = match session_token { 76 | Some(token) => AwsCreds::with_session_token(access, secret, token), 77 | None => AwsCreds::new(access, secret), 78 | }; 79 | 80 | let mut s3 = S3Spec::new(bucket.clone(), label.clone(), credentials); 81 | s3.endpoint = Some(endpoint); 82 | s3.region = Some(region); 83 | s3.sign_payload = Some(true); 84 | 85 | let db: DB<AmazonS3, TokioExecutor> = DB::<AmazonS3, TokioExecutor>::builder(config) 86 | .object_store(ObjectSpec::s3(s3)) 87 | .map_err(|err| format!("object_store config: {err}"))? 
88 | .wal_sync_policy(WalSyncPolicy::Always) 89 | .wal_retention_bytes(Some(1 << 20)) 90 | .build() 91 | .await 92 | .map_err(|err| format!("failed to build S3-backed DB: {err}"))?; 93 | 94 | let batch = RecordBatch::try_new( 95 | schema.clone(), 96 | vec![ 97 | Arc::new(StringArray::from(vec!["alice", "bob"])) as _, 98 | Arc::new(Int32Array::from(vec![10, 20])) as _, 99 | ], 100 | )?; 101 | 102 | db.ingest(batch).await?; 103 | 104 | // Verify we can read the data back via scan 105 | let results: Vec<RecordBatch> = db.scan().collect().await?; 106 | 107 | let total_rows: usize = results.iter().map(|b| b.num_rows()).sum(); 108 | if total_rows < 2 { 109 | return Err(format!("expected at least 2 rows, got {total_rows}").into()); 110 | } 111 | 112 | // Verify the data content 113 | let mut found_alice = false; 114 | let mut found_bob = false; 115 | 116 | for batch in &results { 117 | let ids = batch 118 | .column(0) 119 | .as_any() 120 | .downcast_ref::<StringArray>() 121 | .expect("id column should be StringArray"); 122 | let values = batch 123 | .column(1) 124 | .as_any() 125 | .downcast_ref::<Int32Array>() 126 | .expect("value column should be Int32Array"); 127 | 128 | for i in 0..batch.num_rows() { 129 | match ids.value(i) { 130 | "alice" => { 131 | assert_eq!(values.value(i), 10, "alice should have value 10"); 132 | found_alice = true; 133 | } 134 | "bob" => { 135 | assert_eq!(values.value(i), 20, "bob should have value 20"); 136 | found_bob = true; 137 | } 138 | other => { 139 | return Err(format!("unexpected id: {other}").into()); 140 | } 141 | } 142 | } 143 | } 144 | 145 | if !found_alice { 146 | return Err("expected to find alice in results".into()); 147 | } 148 | if !found_bob { 149 | return Err("expected to find bob in results".into()); 150 | } 151 | 152 | Ok(()) 153 | } 154 | -------------------------------------------------------------------------------- /examples/08_nested_types.rs: -------------------------------------------------------------------------------- 1 | //! Nested Types: Deep struct nesting, List, and complex compositions 2 | //! 3 | //! Run: cargo run --example 08_nested_types 4 | 5 | use tonbo::prelude::*; 6 | use typed_arrow::bridge::List; 7 | 8 | // Level 1: Geo coordinates (innermost) 9 | #[derive(Record, Clone)] 10 | struct Geo { 11 | lat: f64, 12 | lon: f64, 13 | } 14 | 15 | // Level 2: Address contains optional Geo 16 | #[derive(Record, Clone)] 17 | struct Address { 18 | city: String, 19 | zip: Option<i32>, 20 | geo: Option<Geo>, 21 | } 22 | 23 | // Level 3: Company contains optional Address (headquarters) 24 | #[derive(Record, Clone)] 25 | struct Company { 26 | name: String, 27 | hq: Option<Address>, 28 | } 29 | 30 | // Level 4: Person with deep nesting + List 31 | #[derive(Record)] 32 | struct Person { 33 | #[metadata(k = "tonbo.key", v = "true")] 34 | id: i64, 35 | name: String, 36 | company: Option<Company>, 37 | home: Option<Address>, 38 | tags: Option<List<String>>, 39 | } 40 | 41 | #[tokio::main] 42 | async fn main() -> Result<(), Box<dyn std::error::Error>> { 43 | let db = DbBuilder::from_schema(Person::schema())? 44 | .on_disk("/tmp/tonbo_nested_types")? 
45 | .open() 46 | .await?; 47 | 48 | // Create people with varying levels of nested data 49 | let people = vec![ 50 | // Full deep nesting: Person -> Company -> Address -> Geo 51 | Person { 52 | id: 1, 53 | name: "Alice".into(), 54 | company: Some(Company { 55 | name: "TechCorp".into(), 56 | hq: Some(Address { 57 | city: "Seattle".into(), 58 | zip: Some(98101), 59 | geo: Some(Geo { 60 | lat: 47.6062, 61 | lon: -122.3321, 62 | }), 63 | }), 64 | }), 65 | home: Some(Address { 66 | city: "Bellevue".into(), 67 | zip: Some(98004), 68 | geo: None, 69 | }), 70 | tags: Some(List::new(vec!["engineer".into(), "rust".into()])), 71 | }, 72 | // Partial nesting: company without HQ address 73 | Person { 74 | id: 2, 75 | name: "Bob".into(), 76 | company: Some(Company { 77 | name: "StartupInc".into(), 78 | hq: None, // Remote-first, no HQ 79 | }), 80 | home: Some(Address { 81 | city: "Portland".into(), 82 | zip: None, 83 | geo: Some(Geo { 84 | lat: 45.5152, 85 | lon: -122.6784, 86 | }), 87 | }), 88 | tags: Some(List::new(vec!["founder".into()])), 89 | }, 90 | // Minimal data: no company, no home 91 | Person { 92 | id: 3, 93 | name: "Carol".into(), 94 | company: None, 95 | home: None, 96 | tags: None, 97 | }, 98 | ]; 99 | 100 | let mut builders = Person::new_builders(people.len()); 101 | builders.append_rows(people); 102 | db.ingest(builders.finish().into_record_batch()).await?; 103 | 104 | println!("Inserted 3 people with deep nested data\n"); 105 | 106 | // Query and traverse the nested structure 107 | let batches = db.scan().collect().await?; 108 | 109 | println!("=== Deep Nested Data ===\n"); 110 | for batch in &batches { 111 | for person in batch.iter_views::<Person>()?.try_flatten()? { 112 | println!("Person {} - {}", person.id, person.name); 113 | 114 | // Traverse: company -> hq -> geo (3 levels deep) 115 | match person.company { 116 | Some(company) => { 117 | println!(" Company: {}", company.name); 118 | match company.hq { 119 | Some(hq) => { 120 | print!(" HQ: {}", hq.city); 121 | if let Some(zip) = hq.zip { 122 | print!(", {}", zip); 123 | } 124 | if let Some(geo) = hq.geo { 125 | print!(" ({:.4}, {:.4})", geo.lat, geo.lon); 126 | } 127 | println!(); 128 | } 129 | None => println!(" HQ: (remote)"), 130 | } 131 | } 132 | None => println!(" Company: (none)"), 133 | } 134 | 135 | // Home address with optional geo 136 | match person.home { 137 | Some(home) => { 138 | print!(" Home: {}", home.city); 139 | if let Some(geo) = home.geo { 140 | print!(" ({:.4}, {:.4})", geo.lat, geo.lon); 141 | } 142 | println!(); 143 | } 144 | None => println!(" Home: (none)"), 145 | } 146 | 147 | // Tags list 148 | match person.tags { 149 | Some(tags) => { 150 | let vals: Vec<String> = tags 151 | .map(|r| r.map(|s| s.to_string())) 152 | .collect::<Result<_, _>>()?; 153 | println!(" Tags: {:?}", vals); 154 | } 155 | None => println!(" Tags: (none)"), 156 | } 157 | 158 | println!(); 159 | } 160 | } 161 | 162 | Ok(()) 163 | } 164 | -------------------------------------------------------------------------------- /examples/09_time_travel.rs: -------------------------------------------------------------------------------- 1 | //! Time Travel: list versions and create snapshots at specific timestamps 2 | //! 3 | //! This example demonstrates the time travel API: 4 | //! - `db.list_versions(limit)` - enumerate committed versions 5 | //! - `db.snapshot_at(timestamp)` - create a snapshot at a specific timestamp 6 | //! 7 | //! Tonbo supports two levels of time travel: 8 | //! 1. 
**MVCC timestamps** - every commit gets a logical timestamp for visibility control 9 | //! 2. **Manifest versions** - when data is flushed to SST files, a version snapshot is recorded in 10 | //! the manifest, enabling queries against historical file sets 11 | //! 12 | //! Run: cargo run --example 09_time_travel 13 | 14 | use tonbo::prelude::*; 15 | 16 | #[derive(Record)] 17 | struct Product { 18 | #[metadata(k = "tonbo.key", v = "true")] 19 | id: i64, 20 | name: String, 21 | price: i64, 22 | } 23 | 24 | #[tokio::main] 25 | async fn main() -> Result<(), Box<dyn std::error::Error>> { 26 | let db = DbBuilder::from_schema(Product::schema())? 27 | .on_disk("/tmp/tonbo_time_travel")? 28 | .open() 29 | .await?; 30 | 31 | // === Insert data in multiple transactions === 32 | // Each transaction gets a unique MVCC timestamp 33 | 34 | // Transaction 1: Initial products 35 | let products = vec![ 36 | Product { 37 | id: 1, 38 | name: "Laptop".into(), 39 | price: 999, 40 | }, 41 | Product { 42 | id: 2, 43 | name: "Mouse".into(), 44 | price: 29, 45 | }, 46 | ]; 47 | let mut builders = Product::new_builders(products.len()); 48 | builders.append_rows(products); 49 | db.ingest(builders.finish().into_record_batch()).await?; 50 | println!("Tx1: Inserted Laptop ($999), Mouse ($29)"); 51 | 52 | // Transaction 2: Price update 53 | let mut tx = db.begin_transaction().await?; 54 | let update = vec![Product { 55 | id: 1, 56 | name: "Laptop".into(), 57 | price: 899, 58 | }]; 59 | let mut builders = Product::new_builders(update.len()); 60 | builders.append_rows(update); 61 | tx.upsert_batch(&builders.finish().into_record_batch())?; 62 | tx.commit().await?; 63 | println!("Tx2: Laptop price reduced to $899"); 64 | 65 | // Transaction 3: New product 66 | let mut tx = db.begin_transaction().await?; 67 | let update = vec![Product { 68 | id: 3, 69 | name: "Keyboard".into(), 70 | price: 79, 71 | }]; 72 | let mut builders = Product::new_builders(update.len()); 73 | builders.append_rows(update); 74 | tx.upsert_batch(&builders.finish().into_record_batch())?; 75 | tx.commit().await?; 76 | println!("Tx3: Added Keyboard ($79)"); 77 | 78 | // === List persisted versions === 79 | // Versions are created when data is flushed to SST files 80 | println!("\n=== Persisted Versions (from manifest) ==="); 81 | let versions = db.list_versions(10).await?; 82 | if versions.is_empty() { 83 | println!(" (no SST versions yet - data is in memory)"); 84 | } else { 85 | for (i, v) in versions.iter().enumerate() { 86 | println!( 87 | " Version {}: timestamp={}, ssts={}, levels={}", 88 | versions.len() - i, 89 | v.timestamp.get(), 90 | v.sst_count, 91 | v.level_count 92 | ); 93 | } 94 | } 95 | 96 | // === Query current state === 97 | println!("\n=== Current State ==="); 98 | let batches = db.scan().collect().await?; 99 | for batch in &batches { 100 | for product in batch.iter_views::<Product>()?.try_flatten()? { 101 | println!(" {} - {} (${})", product.id, product.name, product.price); 102 | } 103 | } 104 | 105 | // === Snapshot at specific MVCC timestamp === 106 | // Query using timestamps from list_versions for reliable time travel 107 | if let Some(first_version) = versions.last() { 108 | println!( 109 | "\n=== Snapshot at timestamp={} (first version) ===", 110 | first_version.timestamp.get() 111 | ); 112 | let snapshot = db.snapshot_at(first_version.timestamp).await?; 113 | let batches = snapshot.scan(&db).collect().await?; 114 | for batch in &batches { 115 | for product in batch.iter_views::<Product>()?.try_flatten()? 
{ 116 | println!(" {} - {} (${})", product.id, product.name, product.price); 117 | } 118 | } 119 | } 120 | 121 | if let Some(latest_version) = versions.first() { 122 | println!( 123 | "\n=== Snapshot at timestamp={} (latest version) ===", 124 | latest_version.timestamp.get() 125 | ); 126 | let snapshot = db.snapshot_at(latest_version.timestamp).await?; 127 | let batches = snapshot.scan(&db).collect().await?; 128 | for batch in &batches { 129 | for product in batch.iter_views::<Product>()?.try_flatten()? { 130 | println!(" {} - {} (${})", product.id, product.name, product.price); 131 | } 132 | } 133 | } 134 | 135 | // Current snapshot (includes in-memory data) 136 | println!("\n=== Current State (begin_snapshot) ==="); 137 | let snapshot = db.begin_snapshot().await?; 138 | let batches = snapshot.scan(&db).collect().await?; 139 | for batch in &batches { 140 | for product in batch.iter_views::<Product>()?.try_flatten()? { 141 | println!(" {} - {} (${})", product.id, product.name, product.price); 142 | } 143 | } 144 | 145 | Ok(()) 146 | } 147 | -------------------------------------------------------------------------------- /examples/03_filter.rs: -------------------------------------------------------------------------------- 1 | //! Query filtering: predicates for eq, gt, lt, and, or, in, is_null 2 | //! 3 | //! Run: cargo run --example 03_filter 4 | 5 | use fusio::{disk::LocalFs, executor::tokio::TokioExecutor}; 6 | use tonbo::prelude::*; 7 | 8 | #[derive(Record)] 9 | struct Product { 10 | #[metadata(k = "tonbo.key", v = "true")] 11 | id: String, 12 | name: String, 13 | price: i64, 14 | category: Option<String>, 15 | } 16 | 17 | #[tokio::main] 18 | async fn main() -> Result<(), Box<dyn std::error::Error>> { 19 | let db = DbBuilder::from_schema(Product::schema())? 20 | .on_disk("/tmp/tonbo_filter_example")? 21 | .open() 22 | .await?; 23 | 24 | // Insert sample data 25 | let products = vec![ 26 | Product { 27 | id: "p1".into(), 28 | name: "Laptop".into(), 29 | price: 999, 30 | category: Some("Electronics".into()), 31 | }, 32 | Product { 33 | id: "p2".into(), 34 | name: "Mouse".into(), 35 | price: 29, 36 | category: Some("Electronics".into()), 37 | }, 38 | Product { 39 | id: "p3".into(), 40 | name: "Desk".into(), 41 | price: 299, 42 | category: Some("Furniture".into()), 43 | }, 44 | Product { 45 | id: "p4".into(), 46 | name: "Chair".into(), 47 | price: 199, 48 | category: Some("Furniture".into()), 49 | }, 50 | Product { 51 | id: "p5".into(), 52 | name: "Notebook".into(), 53 | price: 5, 54 | category: Some("Office".into()), 55 | }, 56 | Product { 57 | id: "p6".into(), 58 | name: "Mystery Box".into(), 59 | price: 50, 60 | category: None, 61 | }, 62 | ]; 63 | let mut builders = Product::new_builders(products.len()); 64 | builders.append_rows(products); 65 | db.ingest(builders.finish().into_record_batch()).await?; 66 | 67 | // 1. Equality: price == 29 68 | println!("1. price == 29:"); 69 | let filter = Predicate::eq(ColumnRef::new("price"), ScalarValue::from(29_i64)); 70 | print_products(&db, filter).await?; 71 | 72 | // 2. Comparison: price > 100 73 | println!("\n2. price > 100:"); 74 | let filter = Predicate::gt(ColumnRef::new("price"), ScalarValue::from(100_i64)); 75 | print_products(&db, filter).await?; 76 | 77 | // 3. Range: 50 <= price <= 300 78 | println!("\n3. 
50 <= price <= 300:"); 79 | let filter = Predicate::and(vec![ 80 | Predicate::gte(ColumnRef::new("price"), ScalarValue::from(50_i64)), 81 | Predicate::lte(ColumnRef::new("price"), ScalarValue::from(300_i64)), 82 | ]); 83 | print_products(&db, filter).await?; 84 | 85 | // 4. IN list: category in ["Electronics", "Office"] 86 | println!("\n4. category IN ['Electronics', 'Office']:"); 87 | let filter = Predicate::in_list( 88 | ColumnRef::new("category"), 89 | vec![ 90 | ScalarValue::from("Electronics"), 91 | ScalarValue::from("Office"), 92 | ], 93 | ); 94 | print_products(&db, filter).await?; 95 | 96 | // 5. IS NULL: category is null 97 | println!("\n5. category IS NULL:"); 98 | let filter = Predicate::is_null(ColumnRef::new("category")); 99 | print_products(&db, filter).await?; 100 | 101 | // 6. IS NOT NULL: category is not null 102 | println!("\n6. category IS NOT NULL:"); 103 | let filter = Predicate::is_not_null(ColumnRef::new("category")); 104 | print_products(&db, filter).await?; 105 | 106 | // 7. AND: Electronics AND price < 100 107 | println!("\n7. category == 'Electronics' AND price < 100:"); 108 | let filter = Predicate::and(vec![ 109 | Predicate::eq(ColumnRef::new("category"), ScalarValue::from("Electronics")), 110 | Predicate::lt(ColumnRef::new("price"), ScalarValue::from(100_i64)), 111 | ]); 112 | print_products(&db, filter).await?; 113 | 114 | // 8. OR: Furniture OR price < 10 115 | println!("\n8. category == 'Furniture' OR price < 10:"); 116 | let filter = Predicate::or(vec![ 117 | Predicate::eq(ColumnRef::new("category"), ScalarValue::from("Furniture")), 118 | Predicate::lt(ColumnRef::new("price"), ScalarValue::from(10_i64)), 119 | ]); 120 | print_products(&db, filter).await?; 121 | 122 | // 9. NOT: NOT category == 'Electronics' 123 | println!("\n9. NOT category == 'Electronics':"); 124 | let filter = Predicate::eq(ColumnRef::new("category"), ScalarValue::from("Electronics")).not(); 125 | print_products(&db, filter).await?; 126 | 127 | // 10. Complex: (Electronics OR Furniture) AND price > 100 128 | println!("\n10. (Electronics OR Furniture) AND price > 100:"); 129 | let filter = Predicate::and(vec![ 130 | Predicate::or(vec![ 131 | Predicate::eq(ColumnRef::new("category"), ScalarValue::from("Electronics")), 132 | Predicate::eq(ColumnRef::new("category"), ScalarValue::from("Furniture")), 133 | ]), 134 | Predicate::gt(ColumnRef::new("price"), ScalarValue::from(100_i64)), 135 | ]); 136 | print_products(&db, filter).await?; 137 | 138 | Ok(()) 139 | } 140 | 141 | async fn print_products( 142 | db: &DB<LocalFs, TokioExecutor>, 143 | filter: Predicate, 144 | ) -> Result<(), Box<dyn std::error::Error>> { 145 | let batches = db.scan().filter(filter).collect().await?; 146 | let mut found = false; 147 | for batch in &batches { 148 | for p in batch.iter_views::<Product>()?.try_flatten()? { 149 | let cat = p.category.unwrap_or("NULL"); 150 | println!(" {} - {} (${}) [{}]", p.id, p.name, p.price, cat); 151 | found = true; 152 | } 153 | } 154 | if !found { 155 | println!(" (no results)"); 156 | } 157 | Ok(()) 158 | } 159 | -------------------------------------------------------------------------------- /src/schema.rs: -------------------------------------------------------------------------------- 1 | //! Declarative schema utilities for defining primary keys and runtime layouts. 2 | //! 3 | //! Use `SchemaBuilder` to wrap an Arrow `Schema`, declare single or composite primary keys, 4 | //! and optionally backfill `tonbo.keys` metadata so downstream tools see the same intent. 5 | //! 
The builder validates key columns and produces the `DynModeConfig` used by `DbBuilder`, 6 | //! preferring Arrow metadata over ad hoc extractors for the primary key path. 7 | 8 | use std::sync::Arc; 9 | 10 | use arrow_schema::{Schema, SchemaRef}; 11 | use serde_json::json; 12 | 13 | use crate::{ 14 | extractor::{self, KeyExtractError}, 15 | mode::DynModeConfig, 16 | }; 17 | 18 | /// Builder for declaring primary keys against an Arrow schema. 19 | /// 20 | /// The builder lets callers specify key columns programmatically while reusing 21 | /// the same validation and extractor logic exercised by metadata-driven flows. 22 | /// Optionally it can back-fill schema metadata (`tonbo.keys`) so downstream 23 | /// tooling observes the same declaration. 24 | #[derive(Clone)] 25 | pub struct SchemaBuilder { 26 | schema: SchemaRef, 27 | key_parts: Vec<String>, 28 | write_metadata: bool, 29 | } 30 | 31 | impl SchemaBuilder { 32 | /// Start a builder from an Arrow schema reference. 33 | pub fn from_schema(schema: SchemaRef) -> Self { 34 | Self { 35 | schema, 36 | key_parts: Vec::new(), 37 | write_metadata: false, 38 | } 39 | } 40 | 41 | /// Declare a single-column primary key, replacing any prior selection. 42 | pub fn primary_key(mut self, field: impl Into<String>) -> Self { 43 | self.key_parts = vec![field.into()]; 44 | self 45 | } 46 | 47 | /// Declare a composite key with fields in the provided order, replacing any prior selection. 48 | pub fn composite_key<I, S>(mut self, fields: I) -> Self 49 | where 50 | I: IntoIterator<Item = S>, 51 | S: Into<String>, 52 | { 53 | self.key_parts = fields.into_iter().map(Into::into).collect(); 54 | self 55 | } 56 | 57 | /// Append a field to the key definition (useful for incremental configuration). 58 | pub fn add_key_part(mut self, field: impl Into<String>) -> Self { 59 | self.key_parts.push(field.into()); 60 | self 61 | } 62 | 63 | /// Request that the builder writes the resulting key declaration back into schema metadata. 64 | pub fn with_metadata(mut self) -> Self { 65 | self.write_metadata = true; 66 | self 67 | } 68 | 69 | /// Finalise the builder, producing a `DynModeConfig` and optionally updated schema metadata. 
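    ///
    /// Usage sketch (mirrors the unit tests below; assumes the schema contains `pk` and `ts`
    /// columns):
    ///
    /// ```ignore
    /// let config = SchemaBuilder::from_schema(schema)
    ///     .composite_key(["pk", "ts"])
    ///     .with_metadata()
    ///     .build()?;
    /// let keys = config.schema().metadata().get("tonbo.keys").cloned();
    /// assert_eq!(keys.as_deref(), Some("[\"pk\",\"ts\"]"));
    /// ```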
70 | pub fn build(self) -> Result<DynModeConfig, KeyExtractError> { 71 | if self.key_parts.is_empty() { 72 | return Err(KeyExtractError::NoSuchField { 73 | name: "schema builder requires at least one key field".to_string(), 74 | }); 75 | } 76 | 77 | let fields = self.schema.fields(); 78 | let mut indices = Vec::with_capacity(self.key_parts.len()); 79 | for name in &self.key_parts { 80 | let Some((idx, _)) = fields.iter().enumerate().find(|(_, f)| f.name() == name) else { 81 | return Err(KeyExtractError::NoSuchField { name: name.clone() }); 82 | }; 83 | indices.push(idx); 84 | } 85 | 86 | let schema = if self.write_metadata { 87 | let mut metadata = self.schema.metadata().clone(); 88 | metadata.insert("tonbo.keys".to_string(), json!(self.key_parts).to_string()); 89 | let field_refs = fields.iter().cloned().collect::<Vec<_>>(); 90 | Arc::new(Schema::new_with_metadata(field_refs, metadata)) 91 | } else { 92 | Arc::clone(&self.schema) 93 | }; 94 | 95 | let extractor = extractor::projection_for_columns(Arc::clone(&schema), indices)?; 96 | 97 | DynModeConfig::new(schema, extractor) 98 | } 99 | } 100 | 101 | #[cfg(test)] 102 | mod tests { 103 | use std::sync::Arc; 104 | 105 | use arrow_schema::{DataType, Field, Schema}; 106 | 107 | use super::SchemaBuilder; 108 | 109 | #[test] 110 | fn primary_key_builder() { 111 | let schema = Arc::new(Schema::new(vec![ 112 | Field::new("id", DataType::Utf8, false), 113 | Field::new("value", DataType::Int32, false), 114 | ])); 115 | 116 | let config = SchemaBuilder::from_schema(Arc::clone(&schema)) 117 | .primary_key("id") 118 | .build() 119 | .expect("builder should succeed"); 120 | 121 | assert_eq!(config.schema.fields()[0].name(), "id"); 122 | } 123 | 124 | #[test] 125 | fn composite_key_builder_sets_metadata() { 126 | let schema = Arc::new(Schema::new(vec![ 127 | Field::new("pk", DataType::Utf8, false), 128 | Field::new("ts", DataType::Int64, false), 129 | ])); 130 | 131 | let config = SchemaBuilder::from_schema(Arc::clone(&schema)) 132 | .composite_key(["pk", "ts"]) 133 | .with_metadata() 134 | .build() 135 | .expect("builder should succeed"); 136 | 137 | let md = config.schema.metadata(); 138 | assert_eq!(md.get("tonbo.keys"), Some(&String::from("[\"pk\",\"ts\"]"))); 139 | } 140 | 141 | #[test] 142 | fn missing_field_errors() { 143 | let schema = Arc::new(Schema::new(vec![Field::new("id", DataType::Utf8, false)])); 144 | 145 | let err = match SchemaBuilder::from_schema(schema) 146 | .primary_key("missing") 147 | .build() 148 | { 149 | Ok(_) => panic!("builder should fail"), 150 | Err(err) => err, 151 | }; 152 | assert!(matches!( 153 | err, 154 | crate::extractor::KeyExtractError::NoSuchField { .. } 155 | )); 156 | } 157 | } 158 | -------------------------------------------------------------------------------- /docs/rfcs/0006-mvcc-sidecar.md: -------------------------------------------------------------------------------- 1 | # RFC: MVCC Storage Layout for Immutable Segments 2 | 3 | - Status: Accepted 4 | - Authors: Tonbo storage team 5 | - Created: 2025-10-28 6 | - Updated: 2025-11 7 | - Area: Storage engine, MVCC, SSTable 8 | 9 | ## Summary 10 | 11 | Define the on-disk layout for MVCC metadata in immutable segments and SSTables. Data files embed `_commit_ts` as an appended column, while tombstones are stored in a separate key-only delete sidecar. 
12 | 13 | ## Motivation 14 | 15 | - Keep user schema columns cleanly separated from MVCC system columns 16 | - Avoid materializing tombstoned rows as null-filled payloads in the data file 17 | - Enable efficient column pruning—scans that don't need visibility checks can skip `_commit_ts` 18 | - Support delete-heavy workloads without bloating data files 19 | 20 | ## Goals 21 | 22 | - Immutable segments and SST files store `_commit_ts` alongside user data for upserts 23 | - Tombstones live in a dedicated key-only sidecar, not in the data file 24 | - WAL payloads remain Arrow-native and replayable 25 | - Compaction, recovery, and GC operate over append-only objects via Fusio 26 | - Minimal read amplification for scans and range lookups 27 | 28 | ## Non-Goals 29 | 30 | - Changing timestamp assignment strategy (remains monotonic per commit) 31 | - Introducing new transaction semantics or cross-table coordination 32 | - Typed/compile-time ingestion pathways (future work per RFC 0001) 33 | 34 | ## Design 35 | 36 | ### Storage Layout 37 | 38 | Each SSTable consists of up to two files: 39 | 40 | ``` 41 | /sst/L{level}/{id}.parquet # data + _commit_ts column 42 | /sst/L{level}/{id}.delete.parquet # key-only delete sidecar (when tombstones exist) 43 | ``` 44 | 45 | ### Data File 46 | 47 | The data Parquet contains user schema columns with `_commit_ts: UInt64` appended as the last column. This enables: 48 | 49 | - Column pruning to skip `_commit_ts` when visibility filtering is not needed 50 | - Single-file I/O for upsert-only segments 51 | - Alignment with WAL frame layout 52 | 53 | ### Delete Sidecar 54 | 55 | Schema: `<primary-key columns>, _commit_ts: UInt64 (non-null)` 56 | 57 | - Only emitted when the segment contains tombstones 58 | - Key-only format avoids storing null value columns for deleted rows 59 | - Enables efficient tombstone lookup during scans without polluting the data file 60 | 61 | ### WAL Integration 62 | 63 | - Upsert frames carry batches with `_commit_ts` column 64 | - Delete frames carry key-only batches for tombstones 65 | - Replay reconstructs both upsert data and tombstone metadata 66 | 67 | ### Read Path 68 | 69 | 1. Load data Parquet, extracting `_commit_ts` for visibility checks 70 | 2. Load delete sidecar if present 71 | 3. Range scans consult both for MVCC filtering 72 | 4. Callers never see `_commit_ts` in query results—projection excludes it 73 | 74 | ### Compaction 75 | 76 | - Merge data files with latest-wins semantics based on `_commit_ts` 77 | - Merge delete sidecars alongside data 78 | - Tombstones may be pruned when `commit_ts <= tombstone_watermark` and no live versions exist 79 | 80 | ### GC 81 | 82 | Manifest-driven GC treats data and delete files atomically—both are retained or removed together based on version visibility. 83 | 84 | ## Alternatives Considered 85 | 86 | 1. **Three-file layout** (data + mvcc sidecar + delete sidecar): More files increase I/O; Parquet column pruning already handles skipping `_commit_ts`. 87 | 88 | 2. **Embed tombstones in data file as null rows**: Wastes space, forces nullable schemas, complicates scans. 89 | 90 | 3. **Store MVCC in Parquet metadata**: Lacks per-row fidelity, complicates streaming reads. 91 | 92 | 4. **Tombstones in WAL only**: Makes SST reconstruction expensive and breaks crash recovery. 
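To make the chosen layout concrete, the sketch below assembles the key-only delete sidecar schema for a hypothetical table whose primary key is a single `id: Utf8` column (the function name and key column are illustrative; the engine derives the key fields from the table's key schema and appends the MVCC commit column to them):

```rust
// Sketch only: key-only delete sidecar schema for a table keyed by `id: Utf8`.
// `_commit_ts` is appended as a non-null UInt64, matching the MVCC column that
// the data file carries as its last column.
use std::sync::Arc;

use arrow_schema::{DataType, Field, Schema, SchemaRef};

fn delete_sidecar_schema() -> SchemaRef {
    Arc::new(Schema::new(vec![
        Field::new("id", DataType::Utf8, false),
        Field::new("_commit_ts", DataType::UInt64, false),
    ]))
}
```

A segment without tombstones never writes this file, so upsert-only workloads pay no extra I/O.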
93 | 94 | ## Comparison with Other Systems 95 | 96 | ### Apache Iceberg 97 | 98 | Iceberg has three delete mechanisms: 99 | 100 | | Type | How it works | Pros | Cons | 101 | |------|--------------|------|------| 102 | | Position deletes | file_path + row position | Fast read (exact location) | Requires knowing physical position; deprecated in v3 | 103 | | Equality deletes | Column values that identify rows | Write-friendly (no position needed) | Query penalty—must scan and filter | 104 | | Deletion vectors (v3) | Bitmap per data file | Compact, fast read | Requires file-level tracking | 105 | 106 | Tonbo's key-only delete sidecar is similar to Iceberg's **equality deletes**—identifying rows by key values rather than physical position. Iceberg's experience shows equality deletes accumulate and hurt read performance until merged, which informed our decision to track tombstone watermarks for pruning. 107 | 108 | ### RocksDB 109 | 110 | RocksDB takes a different approach: 111 | 112 | - **Point tombstones**: Stored inline with data in SST files (key + sequence number) 113 | - **Range tombstones**: Dedicated meta-block within each SST file 114 | - **Compaction**: Tombstones drop only at bottom level when no snapshot references them 115 | - **Trigger heuristics**: Compaction triggered when tombstone ratio exceeds 50% 116 | 117 | RocksDB's inline storage avoids extra files but complicates the data format. Their range tombstone support is something Tonbo currently lacks. 118 | 119 | ### Design Trade-offs 120 | 121 | Tonbo's current design optimizes for: 122 | - **Simplicity**: Separate file is easier to reason about than inline storage 123 | - **Write path**: No need to know physical row positions (unlike position deletes) 124 | - **Schema purity**: User data files remain uncontaminated by tombstone markers 125 | 126 | Known limitations for future consideration: 127 | - **File proliferation**: Every segment with tombstones creates an extra file; deletion vectors (bitmaps) would be more compact 128 | - **No range deletes**: Point-delete-only; range tombstone support may be needed for bulk delete workloads 129 | - **Watermark semantics**: Tombstone retention must coordinate with snapshot/reader registry to avoid premature pruning 130 | -------------------------------------------------------------------------------- /src/tests_internal/wal_policy_e2e.rs: -------------------------------------------------------------------------------- 1 | #![cfg(feature = "tokio")] 2 | 3 | use std::{fs, path::PathBuf, sync::Arc, time::Duration}; 4 | 5 | use arrow_array::{Int32Array, RecordBatch, StringArray}; 6 | use arrow_schema::{DataType, Field}; 7 | use fusio::{DynFs, disk::LocalFs, executor::tokio::TokioExecutor, path::Path as FusioPath}; 8 | use tokio::time::sleep; 9 | 10 | use crate::{ 11 | db::{BatchesThreshold, WalSyncPolicy}, 12 | test_support::{TestFsWalStateStore, TestWalExt as WalExt}, 13 | }; 14 | 15 | #[path = "common/mod.rs"] 16 | mod common; 17 | use common::config_with_pk; 18 | 19 | fn workspace_temp_dir(prefix: &str) -> PathBuf { 20 | let base = std::env::current_dir().expect("cwd"); 21 | let dir = base.join("target").join("tmp").join(format!( 22 | "{prefix}-{}", 23 | std::time::SystemTime::now() 24 | .duration_since(std::time::UNIX_EPOCH) 25 | .expect("time") 26 | .as_nanos() 27 | )); 28 | fs::create_dir_all(&dir).expect("create workspace temp dir"); 29 | dir 30 | } 31 | 32 | fn wal_cfg_with_backend(wal_dir: &PathBuf, policy: WalSyncPolicy) -> crate::db::WalConfig { 33 | 
fs::create_dir_all(wal_dir).expect("wal dir"); 34 | let wal_path = FusioPath::from_filesystem_path(wal_dir).expect("wal path"); 35 | let wal_fs = Arc::new(LocalFs {}); 36 | let wal_backend: Arc<dyn DynFs> = wal_fs.clone(); 37 | let wal_state = Arc::new(TestFsWalStateStore::new(wal_fs)); 38 | crate::db::WalConfig::default() 39 | .wal_dir(wal_path) 40 | .segment_backend(wal_backend) 41 | .state_store(Some(wal_state)) 42 | .segment_max_bytes(256) 43 | .flush_interval(Duration::from_millis(1)) 44 | .sync_policy(policy) 45 | } 46 | 47 | async fn write_rows( 48 | db: &mut crate::db::DbInner<LocalFs, TokioExecutor>, 49 | schema: &arrow_schema::SchemaRef, 50 | offset: i32, 51 | ) -> Result<(), Box<dyn std::error::Error>> { 52 | let ids: Vec<String> = (0..32).map(|n| format!("row-{offset}-{n:02}")).collect(); 53 | let vals: Vec<i32> = (0..32).map(|n| offset + n as i32).collect(); 54 | let batch = RecordBatch::try_new( 55 | schema.clone(), 56 | vec![ 57 | Arc::new(StringArray::from(ids)) as _, 58 | Arc::new(Int32Array::from(vals)) as _, 59 | ], 60 | )?; 61 | db.ingest(batch).await?; 62 | Ok(()) 63 | } 64 | 65 | /// IntervalBytes policy should trigger syncs after crossing byte threshold. 66 | #[tokio::test(flavor = "multi_thread", worker_threads = 2)] 67 | async fn wal_interval_bytes_syncs() -> Result<(), Box<dyn std::error::Error>> { 68 | let temp_root = workspace_temp_dir("wal-policy-bytes"); 69 | let wal_dir = temp_root.join("wal"); 70 | let config = config_with_pk( 71 | vec![ 72 | Field::new("id", DataType::Utf8, false), 73 | Field::new("v", DataType::Int32, false), 74 | ], 75 | &["id"], 76 | ); 77 | let schema = config.schema(); 78 | let wal_cfg = wal_cfg_with_backend(&wal_dir, WalSyncPolicy::IntervalBytes(1)); 79 | let executor = Arc::new(TokioExecutor::default()); 80 | 81 | let mut db = crate::db::DB::<LocalFs, TokioExecutor>::builder(config) 82 | .on_disk(temp_root.to_string_lossy().into_owned())? 83 | .wal_config(wal_cfg) 84 | .with_minor_compaction(1, 0, 1) 85 | .open_with_executor(Arc::clone(&executor)) 86 | .await? 87 | .into_inner(); 88 | db.set_seal_policy(Arc::new(BatchesThreshold { batches: 1 })); 89 | 90 | write_rows(&mut db, &schema, 0).await?; 91 | write_rows(&mut db, &schema, 100).await?; 92 | 93 | let wal_handle = db 94 | .wal() 95 | .cloned() 96 | .expect("wal handle available before shutdown"); 97 | 98 | db.disable_wal().await?; 99 | let metrics = wal_handle.metrics(); 100 | let guard = metrics.read().await; 101 | assert!( 102 | guard.sync_operations > 0, 103 | "expected sync operations to be recorded" 104 | ); 105 | 106 | if let Err(err) = fs::remove_dir_all(&temp_root) { 107 | eprintln!("cleanup failed: {err}"); 108 | } 109 | Ok(()) 110 | } 111 | 112 | /// IntervalTime policy should also emit syncs even with small batches. 
113 | #[tokio::test(flavor = "multi_thread", worker_threads = 2)] 114 | async fn wal_interval_time_syncs() -> Result<(), Box<dyn std::error::Error>> { 115 | let temp_root = workspace_temp_dir("wal-policy-time"); 116 | let wal_dir = temp_root.join("wal"); 117 | let config = config_with_pk( 118 | vec![ 119 | Field::new("id", DataType::Utf8, false), 120 | Field::new("v", DataType::Int32, false), 121 | ], 122 | &["id"], 123 | ); 124 | let schema = config.schema(); 125 | let wal_cfg = wal_cfg_with_backend( 126 | &wal_dir, 127 | WalSyncPolicy::IntervalTime(Duration::from_millis(0)), 128 | ); 129 | let executor = Arc::new(TokioExecutor::default()); 130 | 131 | let mut db = crate::db::DB::<LocalFs, TokioExecutor>::builder(config) 132 | .on_disk(temp_root.to_string_lossy().into_owned())? 133 | .wal_config(wal_cfg) 134 | .with_minor_compaction(1, 0, 1) 135 | .open_with_executor(Arc::clone(&executor)) 136 | .await? 137 | .into_inner(); 138 | db.set_seal_policy(Arc::new(BatchesThreshold { batches: 1 })); 139 | 140 | write_rows(&mut db, &schema, 0).await?; 141 | 142 | let wal_handle = db 143 | .wal() 144 | .cloned() 145 | .expect("wal handle available before shutdown"); 146 | 147 | // allow timer to tick 148 | sleep(Duration::from_millis(5)).await; 149 | 150 | db.disable_wal().await?; 151 | let metrics = wal_handle.metrics(); 152 | let guard = metrics.read().await; 153 | assert!( 154 | guard.sync_operations > 0, 155 | "expected sync operations to be recorded" 156 | ); 157 | 158 | if let Err(err) = fs::remove_dir_all(&temp_root) { 159 | eprintln!("cleanup failed: {err}"); 160 | } 161 | Ok(()) 162 | } 163 | --------------------------------------------------------------------------------