├── src
├── function
│   ├── scalar
│   │   └── mod.rs
│   ├── mod.rs
│   └── aggregate
│   │   ├── count.rs
│   │   ├── mod.rs
│   │   └── avg.rs
├── sql
│   ├── mod.rs
│   ├── ast
│   │   └── mod.rs
│   └── parser
│   │   └── mod.rs
├── plan
│   ├── physical_planner
│   │   └── mod.rs
│   ├── logical_plan
│   │   ├── analyze.rs
│   │   ├── values.rs
│   │   ├── create_table.rs
│   │   ├── empty_relation.rs
│   │   ├── drop_table.rs
│   │   ├── filter.rs
│   │   ├── create_index.rs
│   │   ├── update.rs
│   │   ├── limit.rs
│   │   ├── delete.rs
│   │   ├── table_scan.rs
│   │   ├── aggregate.rs
│   │   ├── project.rs
│   │   ├── drop_index.rs
│   │   ├── insert.rs
│   │   ├── join.rs
│   │   ├── sort.rs
│   │   └── util.rs
│   ├── mod.rs
│   └── logical_planner
│   │   ├── mod.rs
│   │   ├── plan_create_index.rs
│   │   ├── plan_explain.rs
│   │   ├── plan_delete.rs
│   │   ├── plan_insert.rs
│   │   ├── plan_update.rs
│   │   ├── plan_drop.rs
│   │   ├── plan_create_table.rs
│   │   └── plan_query.rs
├── storage
│   ├── index
│   │   └── mod.rs
│   ├── io
│   │   └── mod.rs
│   ├── heap
│   │   └── mod.rs
│   ├── page
│   │   ├── mod.rs
│   │   └── freelist_page.rs
│   ├── codec
│   │   ├── mod.rs
│   │   ├── tuple.rs
│   │   ├── meta_page.rs
│   │   └── freelist_page.rs
│   └── mod.rs
├── tests
│   ├── mod.rs
│   ├── sql_example
│   │   ├── create_index.slt
│   │   ├── create_table.slt
│   │   ├── analyze.slt
│   │   ├── delete.slt
│   │   ├── update.slt
│   │   ├── drop.slt
│   │   ├── insert.slt
│   │   ├── show_explain.slt
│   │   └── transaction.slt
│   └── sql_test.rs
├── optimizer
│   ├── mod.rs
│   └── rule
│   │   ├── mod.rs
│   │   ├── push_down_filter.rs
│   │   ├── eliminate_limit.rs
│   │   ├── push_down_limit.rs
│   │   └── merge_limit.rs
├── utils
│   ├── mod.rs
│   ├── cache
│   │   ├── mod.rs
│   │   └── tiny_lfu.rs
│   ├── bitmap.rs
│   └── table_ref.rs
├── buffer
│   └── mod.rs
├── catalog
│   ├── mod.rs
│   ├── registry.rs
│   └── column.rs
├── recovery
│   ├── wal
│   │   ├── record.rs
│   │   ├── codec
│   │   │   ├── clr.rs
│   │   │   ├── txn.rs
│   │   │   ├── page.rs
│   │   │   └── checkpoint.rs
│   │   ├── io.rs
│   │   ├── writer.rs
│   │   └── buffer.rs
│   ├── mod.rs
│   ├── redo.rs
│   ├── analysis.rs
│   ├── wal_record.rs
│   └── resource_manager.rs
├── lib.rs
├── transaction
│   ├── mod.rs
│   └── lock_guard.rs
├── error.rs
├── expression
│   ├── util.rs
│   ├── literal.rs
│   ├── alias.rs
│   ├── cast.rs
│   ├── column.rs
│   └── aggregate.rs
└── execution
│   ├── physical_plan
│   ├── create_table.rs
│   ├── analyze.rs
│   ├── filter.rs
│   ├── project.rs
│   ├── empty.rs
│   ├── create_index.rs
│   ├── values.rs
│   ├── scan.rs
│   ├── limit.rs
│   ├── drop_table.rs
│   ├── drop_index.rs
│   ├── nested_loop_join.rs
│   └── seq_scan.rs
│   └── mod.rs
├── public
├── rust-db.png
└── terminal-preview.svg
├── typos.toml
├── docs
├── src
│   ├── assets
│   │   └── rust-db.png
│   ├── SUMMARY.md
│   ├── modules
│   │   ├── config.md
│   │   ├── bin.md
│   │   ├── tests.md
│   │   ├── background.md
│   │   ├── expression.md
│   │   ├── execution.md
│   │   ├── sql.md
│   │   ├── buffer.md
│   │   ├── plan.md
│   │   ├── optimizer.md
│   │   ├── index.md
│   │   ├── catalog.md
│   │   └── storage.md
│   ├── introduction.md
│   ├── buffer
│   │   └── page.md
│   ├── contributing.md
│   └── storage
│   │   └── disk_io.md
├── book.toml
└── mermaid-init.js
├── .dockerignore
├── .gitignore
├── fly.toml
├── .github
└── workflows
│   ├── ci_typos.yml
│   ├── mdbook.yml
│   └── ci.yml
├── Dockerfile
├── LICENSE
└── Cargo.toml

/src/function/scalar/mod.rs:
--------------------------------------------------------------------------------
1 | 
2 | 
--------------------------------------------------------------------------------

/src/sql/mod.rs:
--------------------------------------------------------------------------------
1 | pub mod ast;
2 | pub mod parser;
--------------------------------------------------------------------------------
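Note: the `sql` module above is a thin façade over the `sqlparser` crate (`/src/sql/ast/mod.rs` later in this dump simply re-exports `sqlparser::ast` types). As a rough sketch of what entering the pipeline looks like — using only public `sqlparser` APIs; the `parse` helper name and the error handling here are hypothetical, not QuillSQL's actual entry point:

```rust
use sqlparser::ast::Statement;
use sqlparser::dialect::GenericDialect;
use sqlparser::parser::Parser;

// Hypothetical helper: turn a SQL string into the sqlparser AST that the
// planner layers shown below consume. Error type simplified to String.
fn parse(sql: &str) -> Result<Vec<Statement>, String> {
    Parser::parse_sql(&GenericDialect {}, sql).map_err(|e| e.to_string())
}

fn main() {
    let stmts = parse("SELECT id, v FROM t WHERE v > 10").unwrap();
    assert_eq!(stmts.len(), 1); // one Statement per top-level SQL statement
}
```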
/src/function/mod.rs:
--------------------------------------------------------------------------------
1 | mod aggregate;
2 | mod scalar;
3 | 
4 | pub use aggregate::*;
5 | 
--------------------------------------------------------------------------------

/public/rust-db.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/feichai0017/QuillSQL/HEAD/public/rust-db.png
--------------------------------------------------------------------------------

/typos.toml:
--------------------------------------------------------------------------------
1 | [files]
2 | extend-exclude = ["docs/mermaid.min.js", "docs/mermaid-init.js"]
3 | 
--------------------------------------------------------------------------------

/docs/src/assets/rust-db.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/feichai0017/QuillSQL/HEAD/docs/src/assets/rust-db.png
--------------------------------------------------------------------------------

/src/plan/physical_planner/mod.rs:
--------------------------------------------------------------------------------
1 | mod physical_planner;
2 | 
3 | pub use physical_planner::PhysicalPlanner;
4 | 
--------------------------------------------------------------------------------

/src/storage/index/mod.rs:
--------------------------------------------------------------------------------
1 | pub mod btree_index;
2 | pub mod btree_iterator;
3 | pub mod index_recovery;
4 | pub mod wal_codec;
5 | 
--------------------------------------------------------------------------------

/src/tests/mod.rs:
--------------------------------------------------------------------------------
1 | #[cfg(test)]
2 | pub mod recovery_tests;
3 | pub mod sql_test;
4 | #[cfg(test)]
5 | pub mod transaction_tests;
6 | 
--------------------------------------------------------------------------------

/src/optimizer/mod.rs:
--------------------------------------------------------------------------------
1 | mod logical_optimizer;
2 | pub mod rule;
3 | 
4 | pub use logical_optimizer::{LogicalOptimizer, LogicalOptimizerRule};
5 | 
--------------------------------------------------------------------------------

/src/storage/io/mod.rs:
--------------------------------------------------------------------------------
1 | #[cfg(target_os = "linux")]
2 | pub mod io_uring;
3 | 
4 | #[cfg(not(target_os = "linux"))]
5 | pub mod block_io;
6 | 
--------------------------------------------------------------------------------

/src/tests/sql_example/create_index.slt:
--------------------------------------------------------------------------------
1 | statement ok
2 | create table t1 (a int, b int)
3 | 
4 | statement ok
5 | create index idx1 on t1 (a)
--------------------------------------------------------------------------------

/src/tests/sql_example/create_table.slt:
--------------------------------------------------------------------------------
1 | statement ok
2 | create table t(v1 int, v2 int, v3 int)
3 | 
4 | statement ok
5 | create table if not exists t
--------------------------------------------------------------------------------

/src/utils/mod.rs:
--------------------------------------------------------------------------------
1 | pub mod bitmap;
2 | pub mod cache;
3 | pub mod scalar;
4 | pub mod table_ref;
5 | pub mod util;
6 | 
7 | pub mod ring_buffer;
8 | 
--------------------------------------------------------------------------------

/.dockerignore:
--------------------------------------------------------------------------------
1 | target
2 | .git
3 | .github
4 | **/*.rs.bk
5 | **/*.swp
6 | **/*.swo
7 | **/.DS_Store
8 | **/node_modules
9 | **/tmp
10 | **/target
11 | 
12 | 
13 | 
--------------------------------------------------------------------------------

/src/plan/logical_plan/analyze.rs:
--------------------------------------------------------------------------------
1 | use crate::utils::table_ref::TableReference;
2 | 
3 | #[derive(Debug, Clone)]
4 | pub struct Analyze {
5 |     pub table: TableReference,
6 | }
7 | 
--------------------------------------------------------------------------------

/src/plan/mod.rs:
--------------------------------------------------------------------------------
1 | pub mod logical_plan;
2 | mod logical_planner;
3 | mod physical_planner;
4 | 
5 | pub use logical_planner::{LogicalPlanner, PlannerContext};
6 | pub use physical_planner::PhysicalPlanner;
7 | 
--------------------------------------------------------------------------------

/src/storage/heap/mod.rs:
--------------------------------------------------------------------------------
1 | pub mod heap_recovery;
2 | pub mod mvcc_heap;
3 | pub mod table_heap;
4 | pub mod wal_codec;
5 | 
6 | pub use mvcc_heap::MvccHeap;
7 | pub use table_heap::{TableHeap, TableIterator};
8 | 
--------------------------------------------------------------------------------

/src/storage/page/mod.rs:
--------------------------------------------------------------------------------
1 | mod btree_page;
2 | mod freelist_page;
3 | mod meta_page;
4 | mod table_page;
5 | 
6 | pub use btree_page::*;
7 | pub use freelist_page::*;
8 | pub use meta_page::*;
9 | pub use table_page::*;
10 | 
--------------------------------------------------------------------------------

/src/tests/sql_example/analyze.slt:
--------------------------------------------------------------------------------
1 | statement ok
2 | create table analyze_t (v int)
3 | 
4 | statement ok
5 | insert into analyze_t values (1), (2), (3)
6 | 
7 | statement ok
8 | analyze table analyze_t
9 | 
10 | query I
11 | select count(*) from analyze_t
12 | ----
13 | 3
14 | 
--------------------------------------------------------------------------------

/docs/book.toml:
--------------------------------------------------------------------------------
1 | [book]
2 | authors = ["The QuillSQL Team"]
3 | language = "en"
4 | src = "src"
5 | title = "QuillSQL Internals"
6 | 
7 | [preprocessor.mermaid]
8 | command = "mdbook-mermaid"
9 | 
10 | [output.html]
11 | site-url = "/QuillSQL/"
12 | 
13 | additional-js = ["mermaid.min.js", "mermaid-init.js"]
--------------------------------------------------------------------------------

/src/buffer/mod.rs:
--------------------------------------------------------------------------------
1 | mod buffer_manager;
2 | mod buffer_pool;
3 | mod page;
4 | 
5 | pub use buffer_manager::BufferManager;
6 | pub use buffer_pool::{BufferPool, FrameId, BUFFER_POOL_SIZE};
7 | pub use page::{
8 |     AtomicPageId, PageId, PageMeta, ReadPageGuard, WritePageGuard, INVALID_PAGE_ID, PAGE_SIZE,
9 | };
10 | 
--------------------------------------------------------------------------------

/src/plan/logical_planner/mod.rs:
--------------------------------------------------------------------------------
1 | mod bind_expr;
2 | mod logical_planner;
3 | mod plan_create_index;
4 | mod plan_create_table;
5 | mod plan_delete;
6 | mod plan_drop;
7 | mod plan_explain;
8 | mod plan_insert;
9 | mod plan_query;
10 | mod plan_set_expr;
11 | mod plan_update;
12 | 
13 | pub use logical_planner::{LogicalPlanner, PlannerContext};
14 | 
--------------------------------------------------------------------------------

/src/tests/sql_example/delete.slt:
--------------------------------------------------------------------------------
1 | statement ok
2 | create table d1 (
3 |     id int primary key,
4 |     value int
5 | )
6 | 
7 | statement ok
8 | insert into d1 values (1, 10), (2, 20), (3, 30)
9 | 
10 | statement ok
11 | delete from d1 where id = 2
12 | 
13 | query
14 | select * from d1 order by id
15 | ----
16 | 1 10
17 | 3 30
18 | 
19 | 
20 | 
--------------------------------------------------------------------------------

/src/catalog/mod.rs:
--------------------------------------------------------------------------------
1 | mod catalog;
2 | mod column;
3 | mod data_type;
4 | mod information;
5 | pub mod registry;
6 | mod schema;
7 | mod stats;
8 | 
9 | pub use catalog::*;
10 | pub use column::{Column, ColumnRef};
11 | pub use data_type::DataType;
12 | pub use information::*;
13 | pub use registry::*;
14 | pub use schema::*;
15 | pub use stats::*;
16 | 
--------------------------------------------------------------------------------

/src/optimizer/rule/mod.rs:
--------------------------------------------------------------------------------
1 | mod eliminate_limit;
2 | mod merge_limit;
3 | mod push_down_filter;
4 | mod push_down_limit;
5 | mod push_limit_to_scan;
6 | 
7 | pub use eliminate_limit::EliminateLimit;
8 | pub use merge_limit::MergeLimit;
9 | pub use push_down_filter::PushDownFilterToScan;
10 | pub use push_down_limit::PushDownLimit;
11 | pub use push_limit_to_scan::PushLimitIntoScan;
12 | 
--------------------------------------------------------------------------------

/src/recovery/wal/record.rs:
--------------------------------------------------------------------------------
1 | use bytes::Bytes;
2 | 
3 | use crate::recovery::wal::Lsn;
4 | 
5 | #[derive(Clone, Debug)]
6 | pub struct WalRecord {
7 |     pub start_lsn: Lsn,
8 |     pub end_lsn: Lsn,
9 |     pub payload: Bytes,
10 | }
11 | 
12 | impl WalRecord {
13 |     pub fn encoded_len(&self) -> u64 {
14 |         self.end_lsn.saturating_sub(self.start_lsn)
15 |     }
16 | }
17 | 
--------------------------------------------------------------------------------

/src/lib.rs:
--------------------------------------------------------------------------------
1 | pub mod background;
2 | pub mod buffer;
3 | pub mod catalog;
4 | pub mod config;
5 | pub mod database;
6 | pub mod error;
7 | pub mod execution;
8 | pub mod expression;
9 | pub mod function;
10 | pub mod optimizer;
11 | pub mod plan;
12 | pub mod recovery;
13 | pub mod session;
14 | pub mod sql;
15 | pub mod storage;
16 | pub mod tests;
17 | pub mod transaction;
18 | pub mod utils;
19 | 
--------------------------------------------------------------------------------

/.gitignore:
--------------------------------------------------------------------------------
1 | # Generated by Cargo
2 | # will have compiled files and executables
3 | debug/
4 | target/
5 | 
6 | 
7 | # These are backup files generated by rustfmt
8 | **/*.rs.bk
9 | 
10 | # MSVC Windows builds of rustc generate these, which store debugging information
11 | *.pdb
12 | 
13 | 
14 | # Added by cargo
15 | 
16 | /target
17 | .vscode
18 | .github
19 | .history
20 | AGENTS.md
21 | GEMINI.md
22 | /docs/book
23 | 
--------------------------------------------------------------------------------

/src/plan/logical_plan/values.rs:
--------------------------------------------------------------------------------
1 | use crate::catalog::SchemaRef;
2 | use crate::expression::Expr;
3 | 
4 | #[derive(derive_new::new, Debug, Clone)]
5 | pub struct Values {
6 |     pub schema: SchemaRef,
7 |     pub values: Vec<Vec<Expr>>,
8 | }
9 | 
10 | impl std::fmt::Display for Values {
11 |     fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
12 |         write!(f, "Values")
13 |     }
14 | }
15 | 
--------------------------------------------------------------------------------

/src/plan/logical_plan/create_table.rs:
--------------------------------------------------------------------------------
1 | use crate::catalog::Column;
2 | use crate::utils::table_ref::TableReference;
3 | 
4 | #[derive(Debug, Clone)]
5 | pub struct CreateTable {
6 |     pub name: TableReference,
7 |     pub columns: Vec<Column>,
8 |     pub if_not_exists: bool,
9 | }
10 | 
11 | impl std::fmt::Display for CreateTable {
12 |     fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
13 |         write!(f, "CreateTable: {}", self.name)
14 |     }
15 | }
16 | 
--------------------------------------------------------------------------------

/src/tests/sql_example/update.slt:
--------------------------------------------------------------------------------
1 | statement ok
2 | create table t_update (
3 |     id int,
4 |     v int
5 | )
6 | 
7 | statement ok
8 | insert into t_update values (1, 5), (2, 10)
9 | 
10 | statement ok
11 | update t_update set v = v + 1
12 | 
13 | query
14 | select id, v from t_update order by id
15 | ----
16 | 1 6
17 | 2 11
18 | 
19 | statement ok
20 | update t_update set v = v + 2 where id = 1
21 | 
22 | query
23 | select id, v from t_update order by id
24 | ----
25 | 1 8
26 | 2 11
--------------------------------------------------------------------------------

/src/plan/logical_plan/empty_relation.rs:
--------------------------------------------------------------------------------
1 | use crate::catalog::SchemaRef;
2 | 
3 | #[derive(Debug, Clone)]
4 | pub struct EmptyRelation {
5 |     /// Whether to produce a placeholder row
6 |     pub produce_one_row: bool,
7 |     /// The schema description of the output
8 |     pub schema: SchemaRef,
9 | }
10 | 
11 | impl std::fmt::Display for EmptyRelation {
12 |     fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
13 |         write!(f, "EmptyRelation")
14 |     }
15 | }
16 | 
--------------------------------------------------------------------------------

/src/storage/codec/mod.rs:
--------------------------------------------------------------------------------
1 | mod btree_page;
2 | mod common;
3 | mod freelist_page;
4 | mod meta_page;
5 | mod scalar;
6 | mod table_page;
7 | mod tuple;
8 | 
9 | pub use btree_page::*;
10 | pub use common::CommonCodec;
11 | pub use freelist_page::{FreelistPageCodec, FreelistPageHeaderCodec};
12 | pub use meta_page::MetaPageCodec;
13 | pub use scalar::ScalarValueCodec;
14 | pub use table_page::*;
15 | pub use tuple::TupleCodec;
16 | 
17 | // data + consumed offset
18 | pub type DecodedData<T> = (T, usize);
19 | 
--------------------------------------------------------------------------------
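Note: `DecodedData` pairs a decoded value with the number of bytes consumed, which is what lets the page and tuple codecs above be chained over a single buffer. A self-contained sketch of that pattern — the decode functions here are illustrative stand-ins, not QuillSQL's actual `CommonCodec` API:

```rust
// data + consumed offset, mirroring the alias above.
type DecodedData<T> = (T, usize);

// Decode a little-endian u32, then a length-prefixed string, advancing an
// offset through the buffer the way chained codecs would.
fn decode_u32(buf: &[u8]) -> Result<DecodedData<u32>, String> {
    let raw: [u8; 4] = buf.get(..4).ok_or("buffer too short")?.try_into().unwrap();
    Ok((u32::from_le_bytes(raw), 4))
}

fn decode_string(buf: &[u8]) -> Result<DecodedData<String>, String> {
    let (len, header) = decode_u32(buf)?; // length prefix consumed first
    let end = header + len as usize;
    let bytes = buf.get(header..end).ok_or("buffer too short")?;
    let s = String::from_utf8(bytes.to_vec()).map_err(|e| e.to_string())?;
    Ok((s, end)) // total bytes consumed = prefix + payload
}

fn main() -> Result<(), String> {
    let mut buf = 2u32.to_le_bytes().to_vec(); // length prefix
    buf.extend_from_slice(b"hi");
    let (s, consumed) = decode_string(&buf)?;
    assert_eq!((s.as_str(), consumed), ("hi", 6));
    Ok(())
}
```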
/src/storage/mod.rs:
--------------------------------------------------------------------------------
1 | pub mod codec;
2 | pub mod disk_manager;
3 | pub mod disk_scheduler;
4 | pub mod engine;
5 | pub mod heap;
6 | pub mod index;
7 | pub mod io;
8 | pub mod page;
9 | pub mod tuple;
10 | 
11 | pub use engine::{
12 |     DefaultStorageEngine, IndexHandle, IndexScanRequest, StorageEngine, TableBinding, TableHandle,
13 |     TupleStream,
14 | };
15 | 
16 | pub use heap::heap_recovery;
17 | pub use heap::mvcc_heap::{self, MvccHeap};
18 | pub use heap::table_heap;
19 | pub use heap::table_heap::{TableHeap, TableIterator};
20 | 
--------------------------------------------------------------------------------

/src/tests/sql_example/drop.slt:
--------------------------------------------------------------------------------
1 | statement ok
2 | create table drop_t (id int primary key)
3 | 
4 | statement ok
5 | create index drop_t_idx on drop_t(id)
6 | 
7 | statement ok
8 | drop index drop_t_idx
9 | 
10 | statement ok
11 | drop index if exists drop_t_idx
12 | 
13 | statement ok
14 | create index drop_t_idx on drop_t(id)
15 | 
16 | statement ok
17 | drop table drop_t
18 | 
19 | statement ok
20 | drop table if exists drop_t
21 | 
22 | statement error
23 | drop table drop_t
24 | 
25 | statement error
26 | drop index drop_t_idx
27 | 
--------------------------------------------------------------------------------

/src/plan/logical_plan/drop_table.rs:
--------------------------------------------------------------------------------
1 | use crate::utils::table_ref::TableReference;
2 | 
3 | #[derive(Debug, Clone)]
4 | pub struct DropTable {
5 |     pub name: TableReference,
6 |     pub if_exists: bool,
7 | }
8 | 
9 | impl std::fmt::Display for DropTable {
10 |     fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
11 |         if self.if_exists {
12 |             write!(f, "DropTable IF EXISTS: {}", self.name)
13 |         } else {
14 |             write!(f, "DropTable: {}", self.name)
15 |         }
16 |     }
17 | }
18 | 
--------------------------------------------------------------------------------

/src/plan/logical_plan/filter.rs:
--------------------------------------------------------------------------------
1 | use crate::expression::Expr;
2 | use crate::plan::logical_plan::LogicalPlan;
3 | use std::sync::Arc;
4 | 
5 | #[derive(derive_new::new, Debug, Clone)]
6 | pub struct Filter {
7 |     /// The predicate expression, which must have Boolean type.
8 |     pub predicate: Expr,
9 |     /// The incoming logical plan
10 |     pub input: Arc<LogicalPlan>,
11 | }
12 | 
13 | impl std::fmt::Display for Filter {
14 |     fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
15 |         write!(f, "Filter: {}", self.predicate)
16 |     }
17 | }
18 | 
--------------------------------------------------------------------------------

/src/transaction/mod.rs:
--------------------------------------------------------------------------------
1 | mod lock_guard;
2 | mod lock_manager;
3 | mod mvcc;
4 | mod transaction;
5 | mod transaction_manager;
6 | mod txn_context;
7 | 
8 | pub use lock_guard::{RowLockGuard, TxnReadGuard};
9 | pub use lock_manager::{LockDebugSnapshot, LockManager, LockMode};
10 | pub use mvcc::{TransactionSnapshot, TransactionStatus};
11 | pub use transaction::{
12 |     CommandId, IsolationLevel, Transaction, TransactionId, TransactionState, INVALID_COMMAND_ID,
13 | };
14 | pub use transaction_manager::{TransactionManager, TxnDebugSnapshot};
15 | pub use txn_context::TxnContext;
16 | 
--------------------------------------------------------------------------------

/fly.toml:
--------------------------------------------------------------------------------
1 | app = "quillsql"
2 | primary_region = "sin"
3 | 
4 | [build]
5 | dockerfile = "Dockerfile"
6 | 
7 | [env]
8 | PORT = "8080"
9 | RUST_LOG = "info"
10 | # QUILL_DB_FILE = "/data/quill.db"
11 | QUILL_DISABLE_DIRECT_IO = "1"
12 | 
13 | [http_service]
14 |   internal_port = 8080
15 |   force_https = true
16 |   auto_start_machines = true
17 |   auto_stop_machines = "off"
18 |   min_machines_running = 1
19 | 
20 | [[vm]]
21 |   cpu_kind = "shared"
22 |   cpus = 1
23 |   memory = "512mb"
24 | 
25 | # [[mounts]]
26 | # source = "data"
27 | # destination = "/data"
--------------------------------------------------------------------------------

/src/plan/logical_plan/create_index.rs:
--------------------------------------------------------------------------------
1 | use crate::catalog::SchemaRef;
2 | use crate::plan::logical_plan::OrderByExpr;
3 | use crate::utils::table_ref::TableReference;
4 | 
5 | #[derive(derive_new::new, Debug, Clone)]
6 | pub struct CreateIndex {
7 |     pub index_name: String,
8 |     pub table: TableReference,
9 |     pub table_schema: SchemaRef,
10 |     pub columns: Vec<OrderByExpr>,
11 | }
12 | 
13 | impl std::fmt::Display for CreateIndex {
14 |     fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
15 |         write!(f, "CreateIndex: {}", self.index_name)
16 |     }
17 | }
18 | 
--------------------------------------------------------------------------------

/src/plan/logical_plan/update.rs:
--------------------------------------------------------------------------------
1 | use crate::catalog::SchemaRef;
2 | use crate::expression::Expr;
3 | use crate::utils::table_ref::TableReference;
4 | use std::collections::HashMap;
5 | 
6 | #[derive(derive_new::new, Debug, Clone)]
7 | pub struct Update {
8 |     pub table: TableReference,
9 |     pub table_schema: SchemaRef,
10 |     pub assignments: HashMap<String, Expr>,
11 |     pub selection: Option<Expr>,
12 | }
13 | 
14 | impl std::fmt::Display for Update {
15 |     fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
16 |         write!(f, "Update: {}", self.table)
17 |     }
18 | }
19 | 
--------------------------------------------------------------------------------

/src/plan/logical_plan/limit.rs:
--------------------------------------------------------------------------------
1 | use crate::plan::logical_plan::LogicalPlan;
2 | use std::sync::Arc;
3 | 
4 | #[derive(derive_new::new, Debug, Clone)]
5 | pub struct Limit {
6 |     pub limit: Option<usize>,
7 |     pub offset: usize,
8 |     pub input: Arc<LogicalPlan>,
9 | }
10 | 
11 | impl std::fmt::Display for Limit {
12 |     fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
13 |         write!(
14 |             f,
15 |             "Limit: {}, offset: {}",
16 |             self.limit.map_or("None".to_string(), |v| v.to_string()),
17 |             self.offset
18 |         )
19 |     }
20 | }
21 | 
--------------------------------------------------------------------------------

/.github/workflows/ci_typos.yml:
--------------------------------------------------------------------------------
1 | name: Typos Check
2 | 
3 | on:
4 |   push:
5 |     branches:
6 |       - main
7 |   pull_request:
8 |     branches:
9 |       - main
10 | 
11 | concurrency:
12 |   group: ${{ github.workflow }}-${{ github.ref }}-${{ github.event_name }}
13 |   cancel-in-progress: true
14 | 
15 | env:
16 |   RUST_BACKTRACE: 1
17 | 
18 | jobs:
19 |   typos-check:
20 |     name: typos check
21 |     runs-on: ubuntu-latest
22 |     timeout-minutes: 10
23 |     env:
24 |       FORCE_COLOR: 1
25 |     steps:
26 |       - uses: actions/checkout@v5
27 |       - name: Check typos
28 |         uses: crate-ci/typos@v1.37.2
29 | 
--------------------------------------------------------------------------------

/src/recovery/mod.rs:
--------------------------------------------------------------------------------
1 | pub mod analysis;
2 | pub mod control_file;
3 | pub mod recovery_manager;
4 | pub mod redo;
5 | pub mod resource_manager;
6 | pub mod undo;
7 | pub mod wal;
8 | pub mod wal_record;
9 | 
10 | pub use control_file::{ControlFileManager, ControlFileSnapshot, WalInitState};
11 | pub use recovery_manager::RecoveryManager;
12 | pub use wal::{Lsn, WalAppendContext, WalAppendResult, WalManager, WalReader, WalWriterHandle};
13 | pub use wal_record::{
14 |     decode_frame, CheckpointPayload, PageWritePayload, ResourceManagerId, TransactionPayload,
15 |     TransactionRecordKind, WalFrame, WalRecordPayload,
16 | };
17 | 
--------------------------------------------------------------------------------

/src/utils/cache/mod.rs:
--------------------------------------------------------------------------------
1 | use crate::buffer::FrameId;
2 | use crate::error::QuillSQLResult;
3 | 
4 | pub mod clock_lru;
5 | pub mod lru_k;
6 | pub mod tiny_lfu;
7 | pub mod window_lfu;
8 | 
9 | pub trait Replacer {
10 |     fn new(capacity: usize) -> Self
11 |     where
12 |         Self: Sized;
13 | 
14 |     fn record_access(&mut self, frame_id: FrameId) -> QuillSQLResult<()>;
15 | 
16 |     fn evict(&mut self) -> Option<FrameId>;
17 | 
18 |     fn set_evictable(&mut self, frame_id: FrameId, set_evictable: bool) -> QuillSQLResult<()>;
19 | 
20 |     fn remove(&mut self, frame_id: FrameId);
21 | 
22 |     fn size(&self) -> usize;
23 | }
24 | 
--------------------------------------------------------------------------------
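Note: the `Replacer` trait above is the eviction-policy seam for the buffer pool — `clock_lru`, `lru_k`, `tiny_lfu`, and `window_lfu` all plug in behind it. A minimal self-contained sketch of the same contract with a deliberately naive FIFO policy (`FrameId` and the result alias are local stand-ins for the crate types, and this does not claim to match any of the real implementations):

```rust
use std::collections::{HashSet, VecDeque};

// Stand-ins for crate types so the sketch compiles on its own.
type FrameId = usize;
type QuillSQLResult<T> = Result<T, String>;

// Naive FIFO replacer: evict the least recently *inserted* evictable frame.
struct FifoReplacer {
    order: VecDeque<FrameId>,
    evictable: HashSet<FrameId>,
}

impl FifoReplacer {
    fn new(_capacity: usize) -> Self {
        Self { order: VecDeque::new(), evictable: HashSet::new() }
    }

    fn record_access(&mut self, frame_id: FrameId) -> QuillSQLResult<()> {
        if !self.order.contains(&frame_id) {
            self.order.push_back(frame_id); // FIFO ignores re-accesses
        }
        Ok(())
    }

    fn evict(&mut self) -> Option<FrameId> {
        // Oldest frame that is currently marked evictable.
        let pos = self.order.iter().position(|f| self.evictable.contains(f))?;
        let frame = self.order.remove(pos)?;
        self.evictable.remove(&frame);
        Some(frame)
    }

    fn set_evictable(&mut self, frame_id: FrameId, evictable: bool) -> QuillSQLResult<()> {
        if evictable {
            self.evictable.insert(frame_id);
        } else {
            self.evictable.remove(&frame_id); // pinned frames cannot be evicted
        }
        Ok(())
    }

    fn size(&self) -> usize {
        self.evictable.len()
    }
}
```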
/src/plan/logical_plan/delete.rs:
--------------------------------------------------------------------------------
1 | use crate::catalog::SchemaRef;
2 | use crate::expression::Expr;
3 | use crate::utils::table_ref::TableReference;
4 | 
5 | #[derive(derive_new::new, Debug, Clone)]
6 | pub struct Delete {
7 |     /// Target table reference
8 |     pub table: TableReference,
9 |     /// Cached schema for the table heap
10 |     pub table_schema: SchemaRef,
11 |     /// Optional predicate bound during planning
12 |     pub selection: Option<Expr>,
13 | }
14 | 
15 | impl std::fmt::Display for Delete {
16 |     fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
17 |         write!(f, "Delete: {}", self.table)
18 |     }
19 | }
20 | 
--------------------------------------------------------------------------------

/src/sql/ast/mod.rs:
--------------------------------------------------------------------------------
1 | // This acts as an abstraction layer, allowing easier parser replacement later.
2 | pub use sqlparser::ast::BinaryOperator;
3 | pub use sqlparser::ast::ColumnDef;
4 | pub use sqlparser::ast::DataType;
5 | pub use sqlparser::ast::Expr;
6 | pub use sqlparser::ast::Function;
7 | pub use sqlparser::ast::FunctionArg;
8 | pub use sqlparser::ast::FunctionArgExpr;
9 | pub use sqlparser::ast::ObjectName;
10 | pub use sqlparser::ast::OrderByExpr;
11 | pub use sqlparser::ast::Statement;
12 | pub use sqlparser::ast::TableConstraint;
13 | pub use sqlparser::ast::TransactionMode;
14 | pub use sqlparser::ast::UnaryOperator;
15 | pub use sqlparser::ast::Value;
16 | 
--------------------------------------------------------------------------------

/src/plan/logical_plan/table_scan.rs:
--------------------------------------------------------------------------------
1 | use crate::catalog::SchemaRef;
2 | use crate::expression::Expr;
3 | use crate::utils::table_ref::TableReference;
4 | 
5 | #[derive(derive_new::new, Debug, Clone)]
6 | pub struct TableScan {
7 |     pub table_ref: TableReference,
8 |     pub table_schema: SchemaRef,
9 |     pub filters: Vec<Expr>,
10 |     pub limit: Option<usize>,
11 |     /// Row-count estimate attached by the planner (from ANALYZE).
12 |     pub estimated_row_count: Option<usize>,
13 | }
14 | 
15 | impl std::fmt::Display for TableScan {
16 |     fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
17 |         write!(f, "TableScan: {}", self.table_ref)
18 |     }
19 | }
20 | 
--------------------------------------------------------------------------------

/src/plan/logical_plan/aggregate.rs:
--------------------------------------------------------------------------------
1 | use crate::catalog::SchemaRef;
2 | use crate::expression::Expr;
3 | use crate::plan::logical_plan::LogicalPlan;
4 | use std::sync::Arc;
5 | 
6 | #[derive(Debug, Clone)]
7 | pub struct Aggregate {
8 |     /// The incoming logical plan
9 |     pub input: Arc<LogicalPlan>,
10 |     /// Grouping expressions
11 |     pub group_exprs: Vec<Expr>,
12 |     /// Aggregate expressions
13 |     pub aggr_exprs: Vec<Expr>,
14 |     /// The schema description of the aggregate output
15 |     pub schema: SchemaRef,
16 | }
17 | 
18 | impl std::fmt::Display for Aggregate {
19 |     fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
20 |         write!(f, "Aggregate")
21 |     }
22 | }
23 | 
--------------------------------------------------------------------------------

/src/function/aggregate/count.rs:
--------------------------------------------------------------------------------
1 | use crate::error::QuillSQLResult;
2 | use crate::function::aggregate::Accumulator;
3 | use crate::utils::scalar::ScalarValue;
4 | 
5 | #[derive(Debug, Clone)]
6 | pub struct CountAccumulator {
7 |     count: i64,
8 | }
9 | 
10 | impl CountAccumulator {
11 |     pub fn new() -> Self {
12 |         Self { count: 0 }
13 |     }
14 | }
15 | 
16 | impl Accumulator for CountAccumulator {
17 |     fn update_value(&mut self, value: &ScalarValue) -> QuillSQLResult<()> {
18 |         if !value.is_null() {
19 |             self.count += 1;
20 |         }
21 |         Ok(())
22 |     }
23 | 
24 |     fn evaluate(&self) -> QuillSQLResult<ScalarValue> {
25 |         Ok(self.count.into())
26 |     }
27 | }
28 | 
--------------------------------------------------------------------------------
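Note: `CountAccumulator` shows the `Accumulator` contract (the trait itself appears in `/src/function/aggregate/mod.rs` later in this dump): stream values in via `update_value`, then read the final aggregate once via `evaluate`, with SQL's rule that NULLs are ignored. A standalone SUM accumulator in the same shape — crate types (`ScalarValue`, `QuillSQLResult`) replaced by plain Rust stand-ins, so this is an illustration of the pattern, not the crate's `AvgAccumulator`:

```rust
// Stand-in for ScalarValue: None plays the role of SQL NULL.
type Value = Option<i64>;

trait Accumulator {
    fn update_value(&mut self, value: &Value) -> Result<(), String>;
    fn evaluate(&self) -> Result<Value, String>;
}

#[derive(Default)]
struct SumAccumulator {
    sum: i64,
    seen_non_null: bool,
}

impl Accumulator for SumAccumulator {
    fn update_value(&mut self, value: &Value) -> Result<(), String> {
        if let Some(v) = value {
            self.sum += v; // NULL inputs are skipped, like CountAccumulator
            self.seen_non_null = true;
        }
        Ok(())
    }

    fn evaluate(&self) -> Result<Value, String> {
        // SQL semantics: SUM over only NULLs (or no rows) is NULL.
        Ok(self.seen_non_null.then_some(self.sum))
    }
}
```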
/src/plan/logical_plan/project.rs:
--------------------------------------------------------------------------------
1 | use crate::catalog::SchemaRef;
2 | use crate::expression::Expr;
3 | use crate::plan::logical_plan::LogicalPlan;
4 | use std::sync::Arc;
5 | 
6 | #[derive(derive_new::new, Debug, Clone)]
7 | pub struct Project {
8 |     pub exprs: Vec<Expr>,
9 |     pub input: Arc<LogicalPlan>,
10 |     pub schema: SchemaRef,
11 | }
12 | 
13 | impl std::fmt::Display for Project {
14 |     fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
15 |         write!(
16 |             f,
17 |             "Project: {}",
18 |             self.exprs
19 |                 .iter()
20 |                 .map(|e| format!("{e}"))
21 |                 .collect::<Vec<_>>()
22 |                 .join(", ")
23 |         )
24 |     }
25 | }
26 | 
--------------------------------------------------------------------------------

/.github/workflows/mdbook.yml:
--------------------------------------------------------------------------------
1 | name: Deploy mdBook
2 | 
3 | on:
4 |   push:
5 |     branches:
6 |       - main
7 | 
8 | jobs:
9 |   deploy-book:
10 |     runs-on: ubuntu-latest
11 |     permissions:
12 |       contents: write
13 |     steps:
14 |       - name: Checkout
15 |         uses: actions/checkout@v4
16 | 
17 |       - name: Install mdbook and mdbook-mermaid
18 |         run: |
19 |           cargo install mdbook --no-default-features
20 |           cargo install mdbook-mermaid
21 | 
22 |       - name: Build mdbook
23 |         run: mdbook build docs
24 | 
25 |       - name: Deploy to GitHub Pages
26 |         uses: peaceiris/actions-gh-pages@v3
27 |         with:
28 |           github_token: ${{ secrets.GITHUB_TOKEN }}
29 |           publish_dir: ./docs/book
30 | 
--------------------------------------------------------------------------------

/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM rust:1.82 as builder
2 | 
3 | # Install and use the nightly toolchain to support Edition 2024 dependencies
4 | RUN rustup toolchain install nightly
5 | RUN rustup default nightly
6 | WORKDIR /app
7 | 
8 | # Pre-cache deps
9 | COPY Cargo.toml Cargo.lock ./
10 | RUN mkdir -p src/bin && echo "fn main(){}" > src/bin/dummy.rs && cargo build --release || true
11 | 
12 | # Build
13 | COPY . .
14 | RUN cargo build --release --bin server
15 | 
16 | FROM gcr.io/distroless/cc-debian12:nonroot
17 | USER nonroot
18 | WORKDIR /app
19 | COPY --from=builder /app/target/release/server /usr/local/bin/server
20 | COPY --from=builder /app/public /app/public
21 | COPY --from=builder /app/docs /app/docs
22 | ENV QUILL_HTTP_ADDR=0.0.0.0:8080
23 | EXPOSE 8080
24 | CMD ["/usr/local/bin/server"]
25 | 
26 | 
27 | 
--------------------------------------------------------------------------------

/src/tests/sql_example/insert.slt:
--------------------------------------------------------------------------------
1 | statement ok
2 | create table t1 (
3 |     a tinyint,
4 |     b smallint,
5 |     c integer,
6 |     d bigint,
7 |     e tinyint unsigned,
8 |     f smallint unsigned,
9 |     g integer unsigned,
10 |     h bigint unsigned,
11 |     i float,
12 |     j varchar
13 | )
14 | 
15 | statement ok
16 | insert into t1 values
17 |     (1, 2, 3, 4, 5, 6, 7, 8, 1.1, 'a'),
18 |     (NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL)
19 | 
20 | query
21 | select * from t1
22 | ----
23 | 1 2 3 4 5 6 7 8 1.1 a
24 | NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL
25 | 
26 | 
27 | statement ok
28 | create table t2 (
29 |     a tinyint,
30 |     b integer default 1,
31 | )
32 | 
33 | statement ok
34 | insert into t2(a) values (1)
35 | 
36 | query
37 | select * from t2
38 | ----
39 | 1 1
--------------------------------------------------------------------------------

/src/plan/logical_plan/drop_index.rs:
--------------------------------------------------------------------------------
1 | #[derive(Debug, Clone)]
2 | pub struct DropIndex {
3 |     pub name: String,
4 |     pub schema: Option<String>,
5 |     pub catalog: Option<String>,
6 |     pub if_exists: bool,
7 | }
8 | 
9 | impl std::fmt::Display for DropIndex {
10 |     fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
11 |         let qualified = match (&self.catalog, &self.schema) {
12 |             (Some(catalog), Some(schema)) => format!("{catalog}.{schema}.{}", self.name),
13 |             (None, Some(schema)) => format!("{schema}.{}", self.name),
14 |             _ => self.name.clone(),
15 |         };
16 |         if self.if_exists {
17 |             write!(f, "DropIndex IF EXISTS: {qualified}")
18 |         } else {
19 |             write!(f, "DropIndex: {qualified}")
20 |         }
21 |     }
22 | }
23 | 
--------------------------------------------------------------------------------

/src/plan/logical_plan/insert.rs:
--------------------------------------------------------------------------------
1 | use crate::catalog::SchemaRef;
2 | use crate::plan::logical_plan::LogicalPlan;
3 | use crate::utils::table_ref::TableReference;
4 | use std::sync::Arc;
5 | 
6 | #[derive(derive_new::new, Debug, Clone)]
7 | pub struct Insert {
8 |     pub table: TableReference,
9 |     pub table_schema: SchemaRef,
10 |     pub projected_schema: SchemaRef,
11 |     pub input: Arc<LogicalPlan>,
12 | }
13 | 
14 | impl std::fmt::Display for Insert {
15 |     fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
16 |         write!(
17 |             f,
18 |             "Insert: {} ({})",
19 |             self.table,
20 |             self.projected_schema
21 |                 .columns
22 |                 .iter()
23 |                 .map(|c| c.name.clone())
24 |                 .collect::<Vec<_>>()
25 |                 .join(", ")
26 |         )
27 |     }
28 | }
29 | 
--------------------------------------------------------------------------------

/src/error.rs:
--------------------------------------------------------------------------------
1 | use thiserror::Error;
2 | 
3 | pub type QuillSQLResult<T> = Result<T, QuillSQLError>;
4 | 
5 | #[derive(Debug, Error)]
6 | pub enum QuillSQLError {
7 |     #[error("Not support: {0}")]
8 |     NotSupport(String),
9 | 
10 |     #[error("Internal error: {0}")]
11 |     Internal(String),
12 | 
13 |     #[error("IO error: {0}")]
14 |     Io(#[from] std::io::Error),
15 | 
16 |     #[error("Parser error: {0}")]
17 |     Parser(#[from] sqlparser::parser::ParserError),
18 | 
19 |     #[error("Bincode error: {0}")]
20 |     Bincode(#[from] bincode::Error),
21 | 
22 |     #[error("Plan error: {0}")]
23 |     Plan(String),
24 | 
25 |     #[error("Execution error: {0}")]
26 |     Execution(String),
27 | 
28 |     #[error("Storage error: {0}")]
29 |     Storage(String),
30 | 
31 |     #[error("Concurrent error: {0}")]
32 |     Concurrent(String),
33 | 
34 |     #[error("Unwind")]
35 |     Unwind,
36 | }
37 | 
--------------------------------------------------------------------------------
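Note: with `thiserror`'s `#[from]` conversions in place, lower layers can bubble IO, parser, and bincode failures into `QuillSQLError` with a bare `?`. A self-contained sketch of the same pattern — the enum, function, and path here are local examples, not crate code:

```rust
use thiserror::Error;

type DemoResult<T> = Result<T, DemoError>;

#[derive(Debug, Error)]
enum DemoError {
    #[error("IO error: {0}")]
    Io(#[from] std::io::Error),

    #[error("Storage error: {0}")]
    Storage(String),
}

// `?` on an io::Result auto-converts through the #[from] impl above — the
// same mechanism that produces QuillSQLError::Io from std::io::Error.
fn read_page(path: &str) -> DemoResult<Vec<u8>> {
    let bytes = std::fs::read(path)?;
    if bytes.is_empty() {
        return Err(DemoError::Storage(format!("{path}: empty page file")));
    }
    Ok(bytes)
}
```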
/.github/workflows/ci.yml:
--------------------------------------------------------------------------------
1 | name: ci
2 | 
3 | on:
4 |   push:
5 |     branches: ["main"]
6 |   pull_request:
7 | 
8 | jobs:
9 |   rust:
10 |     runs-on: ubuntu-latest
11 |     steps:
12 |       - uses: actions/checkout@v4
13 | 
14 |       - name: Install nightly toolchain
15 |         uses: dtolnay/rust-toolchain@nightly
16 |         with:
17 |           components: rustfmt, clippy
18 | 
19 |       - uses: Swatinem/rust-cache@v2
20 | 
21 |       - name: fmt
22 |         run: cargo fmt --all -- --check
23 | 
24 |       - name: clippy
25 |         run: cargo clippy --all-targets
26 | 
27 |       - name: test
28 |         run: cargo test -q
29 | 
30 |   docker-build:
31 |     runs-on: ubuntu-latest
32 |     needs: rust
33 |     steps:
34 |       - uses: actions/checkout@v4
35 | 
36 |       # Build the image only; tests are not run inside the Dockerfile
37 |       - name: Build Docker image (no push)
38 |         run: docker build --pull -t quillsql-ci .
39 | 
--------------------------------------------------------------------------------

/src/catalog/registry.rs:
--------------------------------------------------------------------------------
1 | use std::sync::Arc;
2 | 
3 | use dashmap::DashMap;
4 | 
5 | use crate::storage::table_heap::TableHeap;
6 | use crate::utils::table_ref::TableReference;
7 | 
8 | /// Registry of table heaps that may require background maintenance.
9 | #[derive(Debug, Default)]
10 | pub struct TableRegistry {
11 |     inner: DashMap<TableReference, Arc<TableHeap>>,
12 | }
13 | 
14 | impl TableRegistry {
15 |     pub fn new() -> Self {
16 |         Self {
17 |             inner: DashMap::new(),
18 |         }
19 |     }
20 | 
21 |     pub fn register(&self, table: TableReference, heap: Arc<TableHeap>) {
22 |         self.inner.insert(table, heap);
23 |     }
24 | 
25 |     pub fn unregister(&self, table: &TableReference) {
26 |         self.inner.remove(table);
27 |     }
28 | 
29 |     pub fn iter_tables(&self) -> impl Iterator<Item = (TableReference, Arc<TableHeap>)> + '_ {
30 |         self.inner
31 |             .iter()
32 |             .map(|entry| (entry.key().clone(), entry.value().clone()))
33 |     }
34 | }
35 | 
--------------------------------------------------------------------------------

/src/tests/sql_example/show_explain.slt:
--------------------------------------------------------------------------------
1 | statement ok
2 | CREATE TABLE t(id INT, v INT)
3 | 
4 | statement ok
5 | INSERT INTO t(id, v) VALUES (1, 10), (2, 20), (3, 30)
6 | 
7 | # SHOW DATABASES should at least include 'public'
8 | query T
9 | SELECT schema FROM information_schema.schemas WHERE schema = 'public'
10 | ----
11 | public
12 | 
13 | # SHOW TABLES should include user table 't' (using information_schema for stability)
14 | query T
15 | SELECT table_name FROM information_schema.tables WHERE table_schema = 'public' and table_name = 't'
16 | ----
17 | t
18 | 
19 | # EXPLAIN returns lines of plan text
20 | query T
21 | EXPLAIN SELECT id FROM t WHERE v > 10 ORDER BY id LIMIT 2 OFFSET 1
22 | ----
23 | Limit: 2, offset: 1
24 |   Sort: id ASC NULLS LAST
25 |     Project: id
26 |       Filter: (v Gt 10)
27 |         TableScan: t
28 | 
29 | query T
30 | EXPLAIN SELECT id, COUNT(*) FROM t GROUP BY id ORDER BY id
31 | ----
32 | Sort: id ASC NULLS LAST
33 |   Project: id, Count
34 |     Aggregate
35 |       TableScan: t
36 | 
37 | 
--------------------------------------------------------------------------------

/src/catalog/column.rs:
--------------------------------------------------------------------------------
1 | use derive_with::With;
2 | use std::sync::Arc;
3 | 
4 | use crate::catalog::DataType;
5 | use crate::utils::scalar::ScalarValue;
6 | use crate::utils::table_ref::TableReference;
7 | 
8 | pub type ColumnRef = Arc<Column>;
9 | 
10 | #[derive(Debug, Clone, With)]
11 | pub struct Column {
12 |     pub relation: Option<TableReference>,
13 |     pub name: String,
14 |     pub data_type: DataType,
15 |     pub nullable: bool,
16 |     pub default: ScalarValue,
17 | }
18 | 
19 | impl PartialEq for Column {
20 |     fn eq(&self, other: &Self) -> bool {
21 |         self.name == other.name && self.data_type == other.data_type
22 |     }
23 | }
24 | 
25 | impl Eq for Column {}
26 | 
27 | impl Column {
28 |     pub fn new(name: impl Into<String>, data_type: DataType, nullable: bool) -> Self {
29 |         Self {
30 |             relation: None,
31 |             name: name.into(),
32 |             data_type,
33 |             nullable,
34 |             default: ScalarValue::new_empty(data_type),
35 |         }
36 |     }
37 | }
38 | 
--------------------------------------------------------------------------------

/src/expression/util.rs:
--------------------------------------------------------------------------------
1 | use crate::catalog::SchemaRef;
2 | use crate::error::QuillSQLResult;
3 | use crate::expression::{Alias, Cast, ColumnExpr, Expr};
4 | 
5 | /// Convert an expression into Column expression
6 | pub fn columnize_expr(e: &Expr, input_schema: &SchemaRef) -> QuillSQLResult<Expr> {
7 |     match e {
8 |         Expr::Column(_) => Ok(e.clone()),
9 |         Expr::Alias(Alias { expr, name }) => Ok(Expr::Alias(Alias {
10 |             expr: Box::new(columnize_expr(expr, input_schema)?),
11 |             name: name.clone(),
12 |         })),
13 |         Expr::Cast(Cast { expr, data_type }) => Ok(Expr::Cast(Cast {
14 |             expr: Box::new(columnize_expr(expr, input_schema)?),
15 |             data_type: *data_type,
16 |         })),
17 |         _ => {
18 |             let name = e.to_string();
19 |             let idx = input_schema.index_of(None, name.as_str())?;
20 |             let col = input_schema.column_with_index(idx)?;
21 |             Ok(Expr::Column(ColumnExpr {
22 |                 relation: col.relation.clone(),
23 |                 name,
24 |             }))
25 |         }
26 |     }
27 | }
28 | 
--------------------------------------------------------------------------------

/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 | 
3 | Copyright (c) 2025 eric_song
4 | 
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 
--------------------------------------------------------------------------------

/src/tests/sql_example/transaction.slt:
--------------------------------------------------------------------------------
1 | statement ok
2 | create table accounts(id integer primary key, balance integer)
3 | 
4 | statement ok
5 | insert into accounts values (1, 100), (2, 200)
6 | 
7 | statement ok
8 | update accounts set balance = 150 where id = 1
9 | 
10 | query
11 | select * from accounts order by id
12 | ----
13 | 1 150
14 | 2 200
15 | 
16 | statement ok
17 | begin
18 | 
19 | statement ok
20 | update accounts set balance = 175 where id = 2
21 | 
22 | statement ok
23 | rollback
24 | 
25 | query
26 | select * from accounts order by id
27 | ----
28 | 1 150
29 | 2 200
30 | 
31 | statement ok
32 | set transaction isolation level serializable
33 | 
34 | statement ok
35 | begin
36 | 
37 | statement ok
38 | update accounts set balance = 160 where id = 1
39 | 
40 | statement ok
41 | commit
42 | 
43 | query
44 | select * from accounts order by id
45 | ----
46 | 1 160
47 | 2 200
48 | 
49 | statement ok
50 | set session transaction isolation level read committed
51 | 
52 | statement ok
53 | begin
54 | 
55 | statement ok
56 | update accounts set balance = 195 where id = 2
57 | 
58 | statement ok
59 | commit
60 | 
61 | query
62 | select * from accounts order by id
63 | ----
64 | 1 160
65 | 2 195
66 | 
--------------------------------------------------------------------------------

/src/expression/literal.rs:
--------------------------------------------------------------------------------
1 | use crate::catalog::Schema;
2 | use crate::catalog::{Column, DataType};
3 | use crate::error::QuillSQLResult;
4 | use crate::expression::ExprTrait;
5 | use crate::storage::tuple::Tuple;
6 | use crate::utils::scalar::ScalarValue;
7 | 
8 | #[derive(Debug, Clone, PartialEq, Eq)]
9 | pub struct Literal {
10 |     pub value: ScalarValue,
11 | }
12 | 
13 | impl ExprTrait for Literal {
14 |     fn data_type(&self, _input_schema: &Schema) -> QuillSQLResult<DataType> {
15 |         Ok(self.value.data_type())
16 |     }
17 | 
18 |     fn nullable(&self, _input_schema: &Schema) -> QuillSQLResult<bool> {
19 |         Ok(self.value.is_null())
20 |     }
21 | 
22 |     fn evaluate(&self, _tuple: &Tuple) -> QuillSQLResult<ScalarValue> {
23 |         Ok(self.value.clone())
24 |     }
25 | 
26 |     fn to_column(&self, input_schema: &Schema) -> QuillSQLResult<Column> {
27 |         Ok(Column::new(
28 |             format!("{}", self.value),
29 |             self.data_type(input_schema)?,
30 |             self.nullable(input_schema)?,
31 |         ))
32 |     }
33 | }
34 | 
35 | impl std::fmt::Display for Literal {
36 |     fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
37 |         write!(f, "{}", self.value)
38 |     }
39 | }
40 | 
--------------------------------------------------------------------------------

/src/execution/physical_plan/create_table.rs:
--------------------------------------------------------------------------------
1 | use crate::catalog::SchemaRef;
2 | use crate::utils::table_ref::TableReference;
3 | use crate::{
4 |     catalog::Schema,
5 |     error::QuillSQLResult,
6 |     execution::{ExecutionContext, VolcanoExecutor},
7 |     storage::tuple::Tuple,
8 | };
9 | use std::sync::Arc;
10 | 
11 | #[derive(derive_new::new, Debug)]
12 | pub struct PhysicalCreateTable {
13 |     pub table: TableReference,
14 |     pub schema: Schema,
15 |     pub if_not_exists: bool,
16 | }
17 | 
18 | impl VolcanoExecutor for PhysicalCreateTable {
19 |     fn next(&self, context: &mut ExecutionContext) -> QuillSQLResult<Option<Tuple>> {
20 |         if self.if_not_exists && context.catalog.try_table_heap(&self.table).is_some() {
21 |             return Ok(None);
22 |         }
23 | 
24 |         context
25 |             .catalog
26 |             .create_table(self.table.clone(), Arc::new(self.schema.clone()))?;
27 |         Ok(None)
28 |     }
29 |     fn output_schema(&self) -> SchemaRef {
30 |         Arc::new(self.schema.clone())
31 |     }
32 | }
33 | 
34 | impl std::fmt::Display for PhysicalCreateTable {
35 |     fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
36 |         write!(f, "CreateTable: {}", self.table)
37 |     }
38 | }
39 | 
--------------------------------------------------------------------------------

/src/recovery/redo.rs:
--------------------------------------------------------------------------------
1 | use std::sync::Arc;
2 | 
3 | use crate::buffer::BufferManager;
4 | use crate::error::QuillSQLResult;
5 | use crate::recovery::resource_manager::{
6 |     ensure_default_resource_managers_registered, get_resource_manager, RedoContext,
7 | };
8 | use crate::recovery::wal::codec::WalFrame;
9 | use crate::storage::disk_scheduler::DiskScheduler;
10 | 
11 | pub struct RedoExecutor {
12 |     disk_scheduler: Arc<DiskScheduler>,
13 |     buffer_pool: Option<Arc<BufferManager>>,
14 | }
15 | 
16 | impl RedoExecutor {
17 |     pub fn new(
18 |         disk_scheduler: Arc<DiskScheduler>,
19 |         buffer_pool: Option<Arc<BufferManager>>,
20 |     ) -> Self {
21 |         ensure_default_resource_managers_registered();
22 |         Self {
23 |             disk_scheduler,
24 |             buffer_pool,
25 |         }
26 |     }
27 | 
28 |     pub fn apply(&self, frame: &WalFrame) -> QuillSQLResult<u64> {
29 |         if let Some(manager) = get_resource_manager(frame.rmid) {
30 |             let ctx = RedoContext {
31 |                 disk_scheduler: self.disk_scheduler.clone(),
32 |                 buffer_pool: self.buffer_pool.clone(),
33 |             };
34 |             manager.redo(frame, &ctx)
35 |         } else {
36 |             Ok(0)
37 |         }
38 |     }
39 | }
40 | 
--------------------------------------------------------------------------------
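Note: `RedoExecutor::apply` is a thin dispatcher — look up the resource manager registered for the frame's `rmid` and let it replay the frame, skipping frames with no registered manager. A toy version of that registry pattern (every type here is a stand-in; `WalFrame`, `RedoContext`, and the real resource managers are not shown in this dump):

```rust
use std::collections::HashMap;

// Stand-in for a decoded WAL frame.
struct Frame {
    rmid: u8,
    payload: Vec<u8>,
}

trait ResourceManager {
    // Replay one frame; the returned number is a stand-in for whatever
    // progress metric the real redo path reports.
    fn redo(&self, frame: &Frame) -> Result<u64, String>;
}

struct HeapRm;
impl ResourceManager for HeapRm {
    fn redo(&self, frame: &Frame) -> Result<u64, String> {
        Ok(frame.payload.len() as u64) // pretend we re-applied the page bytes
    }
}

struct RedoExecutorSketch {
    managers: HashMap<u8, Box<dyn ResourceManager>>,
}

impl RedoExecutorSketch {
    fn apply(&self, frame: &Frame) -> Result<u64, String> {
        match self.managers.get(&frame.rmid) {
            Some(rm) => rm.redo(frame), // known rmid: delegate
            None => Ok(0),              // unknown rmid: no-op, like Ok(0) above
        }
    }
}
```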
/src/expression/alias.rs:
--------------------------------------------------------------------------------
1 | use crate::catalog::Schema;
2 | use crate::catalog::{Column, DataType};
3 | use crate::error::QuillSQLResult;
4 | use crate::expression::{Expr, ExprTrait};
5 | use crate::storage::tuple::Tuple;
6 | use crate::utils::scalar::ScalarValue;
7 | 
8 | #[derive(Clone, PartialEq, Eq, Debug)]
9 | pub struct Alias {
10 |     pub expr: Box<Expr>,
11 |     pub name: String,
12 | }
13 | 
14 | impl ExprTrait for Alias {
15 |     fn data_type(&self, input_schema: &Schema) -> QuillSQLResult<DataType> {
16 |         self.expr.data_type(input_schema)
17 |     }
18 | 
19 |     fn nullable(&self, input_schema: &Schema) -> QuillSQLResult<bool> {
20 |         self.expr.nullable(input_schema)
21 |     }
22 | 
23 |     fn evaluate(&self, tuple: &Tuple) -> QuillSQLResult<ScalarValue> {
24 |         self.expr.evaluate(tuple)
25 |     }
26 | 
27 |     fn to_column(&self, input_schema: &Schema) -> QuillSQLResult<Column> {
28 |         Ok(Column::new(
29 |             self.name.clone(),
30 |             self.data_type(input_schema)?,
31 |             self.nullable(input_schema)?,
32 |         ))
33 |     }
34 | }
35 | 
36 | impl std::fmt::Display for Alias {
37 |     fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
38 |         write!(f, "{} AS {}", self.expr, self.name)
39 |     }
40 | }
41 | 
--------------------------------------------------------------------------------

/src/plan/logical_planner/plan_create_index.rs:
--------------------------------------------------------------------------------
1 | use crate::error::{QuillSQLError, QuillSQLResult};
2 | use crate::plan::logical_plan::{CreateIndex, LogicalPlan};
3 | 
4 | use super::LogicalPlanner;
5 | 
6 | impl<'a> LogicalPlanner<'a> {
7 |     pub fn plan_create_index(
8 |         &self,
9 |         index_name: &sqlparser::ast::ObjectName,
10 |         table_name: &sqlparser::ast::ObjectName,
11 |         columns: &[sqlparser::ast::OrderByExpr],
12 |     ) -> QuillSQLResult<LogicalPlan> {
13 |         let index_name = index_name.0.first().map_or(
14 |             Err(QuillSQLError::Plan(format!(
15 |                 "Index name {index_name} is not expected"
16 |             ))),
17 |             |ident| Ok(ident.value.clone()),
18 |         )?;
19 |         let table = self.bind_table_name(table_name)?;
20 |         let mut columns_expr = vec![];
21 |         for col in columns.iter() {
22 |             let col_expr = self.bind_order_by_expr(col)?;
23 |             columns_expr.push(col_expr);
24 |         }
25 |         let table_schema = self.context.catalog.table_heap(&table)?.schema.clone();
26 |         Ok(LogicalPlan::CreateIndex(CreateIndex {
27 |             index_name,
28 |             table,
29 |             table_schema,
30 |             columns: columns_expr,
31 |         }))
32 |     }
33 | }
34 | 
--------------------------------------------------------------------------------

/src/recovery/wal/codec/clr.rs:
--------------------------------------------------------------------------------
1 | use crate::error::{QuillSQLError, QuillSQLResult};
2 | use crate::recovery::Lsn;
3 | use crate::transaction::TransactionId;
4 | 
5 | #[derive(Debug, Clone)]
6 | pub struct ClrPayload {
7 |     pub txn_id: TransactionId,
8 |     pub undone_lsn: Lsn,
9 |     pub undo_next_lsn: Lsn,
10 | }
11 | 
12 | pub fn encode_clr(body: &ClrPayload) -> Vec<u8> {
13 |     // CLR (rmid=Clr, info=0)
14 |     // body: txn_id(8) + undone_lsn(8) + undo_next_lsn(8)
15 |     let mut buf = Vec::with_capacity(24);
16 |     buf.extend_from_slice(&body.txn_id.to_le_bytes());
17 |     buf.extend_from_slice(&body.undone_lsn.to_le_bytes());
18 |     buf.extend_from_slice(&body.undo_next_lsn.to_le_bytes());
19 |     buf
20 | }
21 | 
22 | pub fn decode_clr(bytes: &[u8]) -> QuillSQLResult<ClrPayload> {
23 |     if bytes.len() != 24 {
24 |         return Err(QuillSQLError::Internal(
25 |             "CLR payload must be 24 bytes".to_string(),
26 |         ));
27 |     }
28 |     let txn_id = u64::from_le_bytes(bytes[0..8].try_into().unwrap()) as TransactionId;
29 |     let undone_lsn = u64::from_le_bytes(bytes[8..16].try_into().unwrap());
30 |     let undo_next_lsn = u64::from_le_bytes(bytes[16..24].try_into().unwrap());
31 |     Ok(ClrPayload {
32 |         txn_id,
33 |         undone_lsn,
34 |         undo_next_lsn,
35 |     })
36 | }
37 | 
--------------------------------------------------------------------------------

/src/execution/physical_plan/analyze.rs:
--------------------------------------------------------------------------------
1 | use std::fmt::Display;
2 | 
3 | use crate::catalog::{SchemaRef, EMPTY_SCHEMA_REF};
4 | use crate::error::QuillSQLResult;
5 | use crate::execution::{ExecutionContext, VolcanoExecutor};
6 | use crate::storage::tuple::Tuple;
7 | use crate::transaction::LockMode;
8 | use crate::utils::table_ref::TableReference;
9 | 
10 | #[derive(Debug)]
11 | pub struct PhysicalAnalyze {
12 |     table: TableReference,
13 | }
14 | 
15 | impl PhysicalAnalyze {
16 |     pub fn new(table: TableReference) -> Self {
17 |         Self { table }
18 |     }
19 | }
20 | 
21 | impl VolcanoExecutor for PhysicalAnalyze {
22 |     fn init(&self, context: &mut ExecutionContext) -> QuillSQLResult<()> {
23 |         context
24 |             .txn_ctx_mut()
25 |             .lock_table(self.table.clone(), LockMode::IntentionShared)?;
26 |         context.catalog.analyze_table(&self.table)?;
27 |         Ok(())
28 |     }
29 | 
30 |     fn next(&self, _context: &mut ExecutionContext) -> QuillSQLResult<Option<Tuple>> {
31 |         Ok(None)
32 |     }
33 | 
34 |     fn output_schema(&self) -> SchemaRef {
35 |         EMPTY_SCHEMA_REF.clone()
36 |     }
37 | }
38 | 
39 | impl Display for PhysicalAnalyze {
40 |     fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
41 |         write!(f, "Analyze {}", self.table)
42 |     }
43 | }
44 | 
--------------------------------------------------------------------------------

/src/plan/logical_planner/plan_explain.rs:
--------------------------------------------------------------------------------
1 | use std::sync::Arc;
2 | 
3 | use crate::catalog::{Column, DataType, Schema};
4 | use crate::error::QuillSQLResult;
5 | use crate::plan::logical_plan::{LogicalPlan, Values};
6 | use crate::plan::LogicalPlanner;
7 | use crate::utils::util::pretty_format_logical_plan;
8 | 
9 | impl LogicalPlanner<'_> {
10 |     /// Build a plan that returns the formatted logical plan as rows of text.
11 |     pub fn plan_explain(
12 |         &mut self,
13 |         statement: &sqlparser::ast::Statement,
14 |     ) -> QuillSQLResult<LogicalPlan> {
15 |         let inner_plan = self.plan(statement)?;
16 |         let text = pretty_format_logical_plan(&inner_plan);
17 |         let lines: Vec<Vec<crate::expression::Expr>> = text
18 |             .lines()
19 |             .map(|s| {
20 |                 vec![crate::expression::Expr::Literal(
21 |                     crate::expression::Literal {
22 |                         value: s.to_string().into(),
23 |                     },
24 |                 )]
25 |             })
26 |             .collect();
27 | 
28 |         let schema = Arc::new(Schema::new(vec![Column::new(
29 |             "plan",
30 |             DataType::Varchar(None),
31 |             false,
32 |         )]));
33 |         Ok(LogicalPlan::Values(Values {
34 |             schema,
35 |             values: lines,
36 |         }))
37 |     }
38 | }
39 | 
--------------------------------------------------------------------------------

/src/plan/logical_plan/join.rs:
--------------------------------------------------------------------------------
1 | use crate::catalog::SchemaRef;
2 | use crate::expression::Expr;
3 | use crate::plan::logical_plan::LogicalPlan;
4 | use std::sync::Arc;
5 | 
6 | #[derive(derive_new::new, Debug, Clone)]
7 | pub struct Join {
8 |     pub left: Arc<LogicalPlan>,
9 |     pub right: Arc<LogicalPlan>,
10 |     pub join_type: JoinType,
11 |     pub condition: Option<Expr>,
12 |     pub schema: SchemaRef,
13 | }
14 | 
15 | #[derive(Debug, Clone, Copy, PartialEq, Eq)]
16 | pub enum JoinType {
17 |     // select * from x inner join y on ...
18 |     Inner,
19 |     // select * from x left (outer) join y on ...
20 |     LeftOuter,
21 |     // select * from x right (outer) join y on ...
22 |     RightOuter,
23 |     // select * from x full (outer) join y on ...
24 |     FullOuter,
25 |     // select * from x, y
26 |     // select * from x cross join y
27 |     Cross,
28 | }
29 | 
30 | impl std::fmt::Display for Join {
31 |     fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
32 |         write!(f, "{} Join", self.join_type)?;
33 |         if let Some(condition) = self.condition.as_ref() {
34 |             write!(f, ": On {condition}")?;
35 |         }
36 |         Ok(())
37 |     }
38 | }
39 | 
40 | impl std::fmt::Display for JoinType {
41 |     fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
42 |         write!(f, "{self:?}")
43 |     }
44 | }
45 | 
--------------------------------------------------------------------------------

/src/expression/cast.rs:
--------------------------------------------------------------------------------
1 | use crate::catalog::{Column, DataType, Schema};
2 | use crate::error::{QuillSQLError, QuillSQLResult};
3 | use crate::expression::{Expr, ExprTrait};
4 | use crate::storage::tuple::Tuple;
5 | use crate::utils::scalar::ScalarValue;
6 | 
7 | /// Cast expression
8 | #[derive(Clone, PartialEq, Eq, Debug)]
9 | pub struct Cast {
10 |     /// The expression being cast
11 |     pub expr: Box<Expr>,
12 |     /// The `DataType` the expression will yield
13 |     pub data_type: DataType,
14 | }
15 | 
16 | impl ExprTrait for Cast {
17 |     fn data_type(&self, _input_schema: &Schema) -> QuillSQLResult<DataType> {
18 |         Ok(self.data_type)
19 |     }
20 | 
21 |     fn nullable(&self, input_schema: &Schema) -> QuillSQLResult<bool> {
22 |         self.expr.nullable(input_schema)
23 |     }
24 | 
25 |     fn evaluate(&self, tuple: &Tuple) -> QuillSQLResult<ScalarValue> {
26 |         let value = self.expr.evaluate(tuple)?;
27 |         value.cast_to(&self.data_type)
28 |     }
29 | 
30 |     fn to_column(&self, _input_schema: &Schema) -> QuillSQLResult<Column> {
31 |         Err(QuillSQLError::Plan(format!(
32 |             "expr {:?} as column not supported",
33 |             self
34 |         )))
35 |     }
36 | }
37 | 
38 | impl std::fmt::Display for Cast {
39 |     fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
40 |         write!(f, "CAST {} AS {}", self.expr, self.data_type)
41 |     }
42 | }
43 | 
--------------------------------------------------------------------------------
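Note: `Cast::evaluate` is evaluate-then-convert — compute the inner expression, then coerce the scalar via `cast_to`. A rough standalone illustration of what `value.cast_to(&data_type)` semantics look like, using a two-variant scalar (the real `ScalarValue` has many more cases, so this is an assumption-laden sketch, not crate code):

```rust
#[derive(Debug, Clone, PartialEq)]
enum Scalar {
    Int64(Option<i64>),      // None plays the role of SQL NULL
    Varchar(Option<String>),
}

#[derive(Debug, Clone, Copy)]
enum DataType {
    Int64,
    Varchar,
}

// Minimal cast: NULLs stay NULL, ints render to strings, strings parse to ints.
fn cast_to(value: &Scalar, target: DataType) -> Result<Scalar, String> {
    match (value, target) {
        (Scalar::Int64(v), DataType::Varchar) => {
            Ok(Scalar::Varchar(v.map(|x| x.to_string())))
        }
        (Scalar::Varchar(v), DataType::Int64) => match v {
            None => Ok(Scalar::Int64(None)),
            Some(s) => s
                .parse::<i64>()
                .map(|x| Scalar::Int64(Some(x)))
                .map_err(|e| format!("cast error: {e}")),
        },
        // Identity casts.
        (Scalar::Int64(v), DataType::Int64) => Ok(Scalar::Int64(*v)),
        (Scalar::Varchar(v), DataType::Varchar) => Ok(Scalar::Varchar(v.clone())),
    }
}
```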
'default' : 'dark'; 20 | mermaid.initialize({ startOnLoad: true, theme }); 21 | 22 | // Simplest way to make mermaid re-render the diagrams in the new theme is via refreshing the page 23 | 24 | for (const darkTheme of darkThemes) { 25 | document.getElementById(darkTheme).addEventListener('click', () => { 26 | if (lastThemeWasLight) { 27 | window.location.reload(); 28 | } 29 | }); 30 | } 31 | 32 | for (const lightTheme of lightThemes) { 33 | document.getElementById(lightTheme).addEventListener('click', () => { 34 | if (!lastThemeWasLight) { 35 | window.location.reload(); 36 | } 37 | }); 38 | } 39 | })(); 40 | -------------------------------------------------------------------------------- /src/function/aggregate/mod.rs: -------------------------------------------------------------------------------- 1 | mod avg; 2 | mod count; 3 | 4 | pub use avg::AvgAccumulator; 5 | pub use count::CountAccumulator; 6 | use std::fmt::Debug; 7 | 8 | use crate::error::QuillSQLResult; 9 | use crate::utils::scalar::ScalarValue; 10 | use strum::{EnumIter, IntoEnumIterator}; 11 | 12 | #[derive(Clone, PartialEq, Eq, Debug, EnumIter)] 13 | pub enum AggregateFunctionKind { 14 | Count, 15 | Avg, 16 | } 17 | 18 | impl AggregateFunctionKind { 19 | pub fn create_accumulator(&self) -> Box<dyn Accumulator> { 20 | match self { 21 | AggregateFunctionKind::Count => Box::new(CountAccumulator::new()), 22 | AggregateFunctionKind::Avg => Box::new(AvgAccumulator::new()), 23 | } 24 | } 25 | 26 | pub fn find(name: &str) -> Option<Self> { 27 | AggregateFunctionKind::iter().find(|kind| kind.to_string().eq_ignore_ascii_case(name)) 28 | } 29 | } 30 | 31 | impl std::fmt::Display for AggregateFunctionKind { 32 | fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { 33 | write!(f, "{self:?}") 34 | } 35 | } 36 | 37 | pub trait Accumulator: Send + Sync + Debug { 38 | /// Updates the accumulator's state from its input. 39 | fn update_value(&mut self, value: &ScalarValue) -> QuillSQLResult<()>; 40 | 41 | /// Returns the final aggregate value, consuming the internal state. 42 | fn evaluate(&self) -> QuillSQLResult<ScalarValue>; 43 | } 44 | -------------------------------------------------------------------------------- /src/execution/physical_plan/filter.rs: -------------------------------------------------------------------------------- 1 | use log::debug; 2 | use std::sync::Arc; 3 | 4 | use crate::catalog::SchemaRef; 5 | use crate::expression::Expr; 6 | use crate::{ 7 | error::QuillSQLResult, 8 | execution::{ExecutionContext, VolcanoExecutor}, 9 | storage::tuple::Tuple, 10 | }; 11 | 12 | use super::PhysicalPlan; 13 | 14 | #[derive(derive_new::new, Debug)] 15 | pub struct PhysicalFilter { 16 | pub predicate: Expr, 17 | pub input: Arc<PhysicalPlan>, 18 | } 19 | 20 | impl VolcanoExecutor for PhysicalFilter { 21 | fn init(&self, context: &mut ExecutionContext) -> QuillSQLResult<()> { 22 | debug!("init filter executor"); 23 | self.input.init(context) 24 | } 25 | 26 | fn next(&self, context: &mut ExecutionContext) -> QuillSQLResult<Option<Tuple>> { 27 | loop { 28 | if let Some(tuple) = self.input.next(context)? { 29 | if context.eval_predicate(&self.predicate, &tuple)?
{ 30 | return Ok(Some(tuple)); 31 | } 32 | } else { 33 | return Ok(None); 34 | } 35 | } 36 | } 37 | 38 | fn output_schema(&self) -> SchemaRef { 39 | self.input.output_schema() 40 | } 41 | } 42 | 43 | impl std::fmt::Display for PhysicalFilter { 44 | fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { 45 | write!(f, "Filter: {}", self.predicate) 46 | } 47 | } 48 | -------------------------------------------------------------------------------- /src/execution/physical_plan/project.rs: -------------------------------------------------------------------------------- 1 | use std::sync::Arc; 2 | 3 | use crate::catalog::SchemaRef; 4 | use crate::expression::Expr; 5 | use crate::{ 6 | error::QuillSQLResult, 7 | execution::{ExecutionContext, VolcanoExecutor}, 8 | storage::tuple::Tuple, 9 | }; 10 | 11 | use super::PhysicalPlan; 12 | 13 | #[derive(derive_new::new, Debug)] 14 | pub struct PhysicalProject { 15 | pub exprs: Vec<Expr>, 16 | pub schema: SchemaRef, 17 | pub input: Arc<PhysicalPlan>, 18 | } 19 | 20 | impl VolcanoExecutor for PhysicalProject { 21 | fn init(&self, context: &mut ExecutionContext) -> QuillSQLResult<()> { 22 | self.input.init(context) 23 | } 24 | 25 | fn next(&self, context: &mut ExecutionContext) -> QuillSQLResult<Option<Tuple>> { 26 | if let Some(tuple) = self.input.next(context)? { 27 | let mut new_values = Vec::with_capacity(self.exprs.len()); 28 | for expr in &self.exprs { 29 | new_values.push(context.eval_expr(expr, &tuple)?); 30 | } 31 | Ok(Some(Tuple::new(self.output_schema(), new_values))) 32 | } else { 33 | Ok(None) 34 | } 35 | } 36 | 37 | fn output_schema(&self) -> SchemaRef { 38 | self.schema.clone() 39 | } 40 | } 41 | 42 | impl std::fmt::Display for PhysicalProject { 43 | fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { 44 | write!(f, "Project") 45 | } 46 | } 47 | -------------------------------------------------------------------------------- /src/plan/logical_planner/plan_delete.rs: -------------------------------------------------------------------------------- 1 | use crate::error::{QuillSQLError, QuillSQLResult}; 2 | use crate::plan::logical_plan::{Delete, LogicalPlan}; 3 | use crate::plan::LogicalPlanner; 4 | 5 | impl<'a> LogicalPlanner<'a> { 6 | pub fn plan_delete( 7 | &self, 8 | table: &sqlparser::ast::TableWithJoins, 9 | selection: &Option<sqlparser::ast::Expr>, 10 | ) -> QuillSQLResult<LogicalPlan> { 11 | if !table.joins.is_empty() { 12 | return Err(QuillSQLError::Plan( 13 | "DELETE with joins is not supported".to_string(), 14 | )); 15 | } 16 | 17 | let table_ref = match &table.relation { 18 | sqlparser::ast::TableFactor::Table { name, ..
} => self.bind_table_name(name)?, 19 | _ => { 20 | return Err(QuillSQLError::Plan(format!( 21 | "Table {} is not supported in DELETE", 22 | table 23 | ))) 24 | } 25 | }; 26 | 27 | let table_heap = self.context.catalog.table_heap(&table_ref)?; 28 | let table_schema = table_heap.schema.clone(); 29 | 30 | let predicate = match selection { 31 | Some(expr) => Some(self.bind_expr(expr)?), 32 | None => None, 33 | }; 34 | 35 | Ok(LogicalPlan::Delete(Delete { 36 | table: table_ref, 37 | table_schema, 38 | selection: predicate, 39 | })) 40 | } 41 | } 42 | -------------------------------------------------------------------------------- /src/plan/logical_plan/sort.rs: -------------------------------------------------------------------------------- 1 | use crate::expression::Expr; 2 | use crate::plan::logical_plan::LogicalPlan; 3 | use std::sync::Arc; 4 | 5 | #[derive(derive_new::new, Debug, Clone)] 6 | pub struct Sort { 7 | pub order_by: Vec<OrderByExpr>, 8 | pub input: Arc<LogicalPlan>, 9 | pub limit: Option<usize>, 10 | } 11 | 12 | #[derive(Clone, PartialEq, Eq, Debug)] 13 | pub struct OrderByExpr { 14 | /// The expression to sort on 15 | pub expr: Box<Expr>, 16 | /// The direction of the sort 17 | pub asc: bool, 18 | /// Whether to put Nulls before all other data values 19 | pub nulls_first: bool, 20 | } 21 | 22 | impl std::fmt::Display for OrderByExpr { 23 | fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { 24 | write!( 25 | f, 26 | "{} {} {}", 27 | self.expr, 28 | if self.asc { "ASC" } else { "DESC" }, 29 | if self.nulls_first { 30 | "NULLS FIRST" 31 | } else { 32 | "NULLS LAST" 33 | } 34 | ) 35 | } 36 | } 37 | 38 | impl std::fmt::Display for Sort { 39 | fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { 40 | write!( 41 | f, 42 | "Sort: {}", 43 | self.order_by 44 | .iter() 45 | .map(|e| format!("{e}")) 46 | .collect::<Vec<_>>() 47 | .join(", ") 48 | ) 49 | } 50 | } 51 | -------------------------------------------------------------------------------- /src/function/aggregate/avg.rs: -------------------------------------------------------------------------------- 1 | use crate::catalog::DataType; 2 | use crate::error::{QuillSQLError, QuillSQLResult}; 3 | use crate::function::aggregate::Accumulator; 4 | use crate::utils::scalar::ScalarValue; 5 | 6 | #[derive(Debug, Clone)] 7 | pub struct AvgAccumulator { 8 | sum: Option<f64>, 9 | count: u64, 10 | } 11 | 12 | impl AvgAccumulator { 13 | pub fn new() -> Self { 14 | Self { 15 | sum: None, 16 | count: 0, 17 | } 18 | } 19 | } 20 | 21 | impl Accumulator for AvgAccumulator { 22 | fn update_value(&mut self, value: &ScalarValue) -> QuillSQLResult<()> { 23 | if !value.is_null() { 24 | let value = match value.cast_to(&DataType::Float64)?
{ 25 | ScalarValue::Float64(Some(v)) => v, 26 | _ => { 27 | return Err(QuillSQLError::Internal(format!( 28 | "Failed to cast value {} to float64", 29 | value 30 | ))) 31 | } 32 | }; 33 | 34 | match self.sum { 35 | Some(sum) => self.sum = Some(sum + value), 36 | None => self.sum = Some(value), 37 | } 38 | self.count += 1; 39 | } 40 | Ok(()) 41 | } 42 | 43 | fn evaluate(&self) -> QuillSQLResult<ScalarValue> { 44 | Ok(ScalarValue::Float64( 45 | self.sum.map(|f| f / self.count as f64), 46 | )) 47 | } 48 | } 49 | -------------------------------------------------------------------------------- /src/execution/physical_plan/empty.rs: -------------------------------------------------------------------------------- 1 | use crate::catalog::SchemaRef; 2 | use crate::execution::{ExecutionContext, VolcanoExecutor}; 3 | use crate::{error::QuillSQLResult, storage::tuple::Tuple}; 4 | use std::sync::atomic::{AtomicUsize, Ordering}; 5 | 6 | #[derive(Debug)] 7 | pub struct PhysicalEmpty { 8 | pub produce_row_count: usize, 9 | pub schema: SchemaRef, 10 | outputted_count: AtomicUsize, 11 | } 12 | 13 | impl PhysicalEmpty { 14 | pub fn new(produce_row_count: usize, schema: SchemaRef) -> Self { 15 | Self { 16 | produce_row_count, 17 | schema, 18 | outputted_count: AtomicUsize::new(0), 19 | } 20 | } 21 | } 22 | 23 | impl VolcanoExecutor for PhysicalEmpty { 24 | fn init(&self, _context: &mut ExecutionContext) -> QuillSQLResult<()> { 25 | self.outputted_count.store(0, Ordering::SeqCst); 26 | Ok(()) 27 | } 28 | fn next(&self, _context: &mut ExecutionContext) -> QuillSQLResult<Option<Tuple>> { 29 | if self.outputted_count.fetch_add(1, Ordering::SeqCst) < self.produce_row_count { 30 | Ok(Some(Tuple::new(self.schema.clone(), vec![]))) 31 | } else { 32 | Ok(None) 33 | } 34 | } 35 | 36 | fn output_schema(&self) -> SchemaRef { 37 | self.schema.clone() 38 | } 39 | } 40 | 41 | impl std::fmt::Display for PhysicalEmpty { 42 | fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { 43 | write!(f, "Empty") 44 | } 45 | } 46 | -------------------------------------------------------------------------------- /src/transaction/lock_guard.rs: -------------------------------------------------------------------------------- 1 | use std::sync::Arc; 2 | 3 | use crate::storage::page::RecordId; 4 | use crate::transaction::{LockManager, TransactionId}; 5 | use crate::utils::table_ref::TableReference; 6 | 7 | pub struct RowLockGuard { 8 | manager: Arc<LockManager>, 9 | txn_id: TransactionId, 10 | table: TableReference, 11 | rid: RecordId, 12 | released: bool, 13 | } 14 | 15 | impl RowLockGuard { 16 | pub fn new( 17 | manager: Arc<LockManager>, 18 | txn_id: TransactionId, 19 | table: TableReference, 20 | rid: RecordId, 21 | ) -> Self { 22 | Self { 23 | manager, 24 | txn_id, 25 | table, 26 | rid, 27 | released: false, 28 | } 29 | } 30 | 31 | pub fn release(mut self) { 32 | self.do_release(); 33 | } 34 | 35 | fn do_release(&mut self) { 36 | if !self.released { 37 | let _ = self 38 | .manager 39 | .unlock_row_raw(self.txn_id, self.table.clone(), self.rid); 40 | self.released = true; 41 | } 42 | } 43 | } 44 | 45 | impl Drop for RowLockGuard { 46 | fn drop(&mut self) { 47 | self.do_release(); 48 | } 49 | } 50 | 51 | pub enum TxnReadGuard { 52 | Temporary(RowLockGuard), 53 | } 54 | 55 | impl TxnReadGuard { 56 | pub fn release(self) { 57 | match self { 58 | TxnReadGuard::Temporary(guard) => guard.release(), 59 | } 60 | } 61 | } 62 | -------------------------------------------------------------------------------- /docs/src/SUMMARY.md:
-------------------------------------------------------------------------------- 1 | # Summary 2 | 3 | - [Introduction](./introduction.md) 4 | - [Overall Architecture](./architecture.md) 5 | - [Module Overview](./modules/overview.md) 6 | 7 | --- 8 | 9 | - [Contributor's Guide](./contributing.md) 10 | 11 | --- 12 | 13 | - [SQL Front-End](./modules/sql.md) 14 | - [Catalog](./modules/catalog.md) 15 | - [Expression System](./modules/expression.md) 16 | - [Query Plan](./modules/plan.md) 17 | - [The Lifecycle of a Query](./plan/lifecycle.md) 18 | - [Query Optimizer](./modules/optimizer.md) 19 | - [Rule-Based Optimization](./optimizer/rules.md) 20 | - [Execution Engine](./modules/execution.md) 21 | - [The Volcano Model](./execution/volcano.md) 22 | - [Transaction Manager](./modules/transaction.md) 23 | - [MVCC and 2PL](./transaction/mvcc_and_2pl.md) 24 | - [Storage Engine](./modules/storage.md) 25 | - [Disk I/O](./storage/disk_io.md) 26 | - [Page & Tuple Layout](./storage/page_layouts.md) 27 | - [Table Heap & MVCC](./storage/table_heap.md) 28 | - [Buffer Manager](./modules/buffer.md) 29 | - [Page & Page Guards](./buffer/page.md) 30 | - [The Buffer Pool](./buffer/buffer_pool.md) 31 | - [Indexes](./modules/index.md) 32 | - [B+Tree](./index/btree_index.md) 33 | - [Recovery Manager (WAL)](./modules/recovery.md) 34 | - [The ARIES Protocol](./recovery/aries.md) 35 | - [Write-Ahead Logging](./recovery/wal.md) 36 | - [Background Services](./modules/background.md) 37 | - [Configuration](./modules/config.md) 38 | - [Front-Ends (CLI / HTTP)](./modules/bin.md) 39 | - [Testing & Documentation](./modules/tests.md) 40 | -------------------------------------------------------------------------------- /src/recovery/wal/io.rs: -------------------------------------------------------------------------------- 1 | use std::path::PathBuf; 2 | use std::sync::Arc; 3 | 4 | use bytes::Bytes; 5 | 6 | use crate::error::QuillSQLResult; 7 | use crate::storage::disk_scheduler::{DiskCommandResultReceiver, DiskScheduler}; 8 | 9 | pub type WalIoTicket = DiskCommandResultReceiver<()>; 10 | 11 | pub trait WalSink: Send + Sync { 12 | fn schedule_write( 13 | &self, 14 | path: PathBuf, 15 | offset: u64, 16 | data: Bytes, 17 | sync: bool, 18 | ) -> QuillSQLResult<Option<WalIoTicket>>; 19 | 20 | fn schedule_fsync(&self, path: PathBuf) -> QuillSQLResult<Option<WalIoTicket>>; 21 | } 22 | 23 | #[derive(Clone)] 24 | pub struct DiskSchedulerWalSink { 25 | scheduler: Arc<DiskScheduler>, 26 | } 27 | 28 | impl DiskSchedulerWalSink { 29 | pub fn new(scheduler: Arc<DiskScheduler>) -> Self { 30 | Self { scheduler } 31 | } 32 | } 33 | 34 | impl WalSink for DiskSchedulerWalSink { 35 | fn schedule_write( 36 | &self, 37 | path: PathBuf, 38 | offset: u64, 39 | data: Bytes, 40 | sync: bool, 41 | ) -> QuillSQLResult<Option<WalIoTicket>> { 42 | if data.is_empty() && !sync { 43 | return Ok(None); 44 | } 45 | let receiver = self 46 | .scheduler 47 | .schedule_wal_write(path, offset, data, sync)?; 48 | Ok(Some(receiver)) 49 | } 50 | 51 | fn schedule_fsync(&self, path: PathBuf) -> QuillSQLResult<Option<WalIoTicket>> { 52 | let receiver = self.scheduler.schedule_wal_fsync(path)?; 53 | Ok(Some(receiver)) 54 | } 55 | } 56 | -------------------------------------------------------------------------------- /docs/src/modules/config.md: -------------------------------------------------------------------------------- 1 | # Configuration & Runtime Options 2 | 3 | `src/config/` centralizes tunables used by `DatabaseOptions`, the CLI/HTTP front-ends, and 4 | background workers.
Keeping knobs in one place makes it easy to demonstrate how WAL, 5 | buffering, or vacuum behavior changes under different settings. 6 | 7 | --- 8 | 9 | ## Key Types 10 | 11 | | Type | Description | 12 | | ---- | ----------- | 13 | | `DatabaseOptions` | Top-level options when constructing a database (WAL config, default isolation, etc.). | 14 | | `WalOptions` | WAL directory, segment size, flush strategy, writer interval, sync mode. | 15 | | `IndexVacuumConfig` / `MvccVacuumConfig` | Background worker intervals (buffer writer, MVCC vacuum). | 16 | | `BufferPoolConfig` | Optional overrides for pool size, TinyLFU, and replacement policy details. | 17 | 18 | --- 19 | 20 | ## Usage 21 | 22 | - CLI/HTTP front-ends parse env vars or config files into `DatabaseOptions` and pass them 23 | to `Database::new_*`. 24 | - During `bootstrap_storage`, the database wires these options into `WalManager`, 25 | `DiskScheduler`, and `BackgroundWorkers`. 26 | - Workers and execution components receive `Arc` references to the relevant configs so 27 | they can adapt at runtime without global state. 28 | 29 | --- 30 | 31 | ## Teaching Ideas 32 | 33 | - Toggle `WalOptions::synchronous_commit` to discuss commit latency vs durability. 34 | - Shrink the buffer pool to highlight eviction behavior under different replacement 35 | policies. 36 | - Adjust `MvccVacuumConfig` intervals and measure how vacuum frequency affects foreground 37 | write throughput. 38 | -------------------------------------------------------------------------------- /src/recovery/wal/codec/txn.rs: -------------------------------------------------------------------------------- 1 | use crate::error::{QuillSQLError, QuillSQLResult}; 2 | use crate::transaction::TransactionId; 3 | 4 | #[derive(Debug, Clone, Copy, PartialEq, Eq)] 5 | #[repr(u8)] 6 | pub enum TransactionRecordKind { 7 | Begin = 1, 8 | Commit = 2, 9 | Abort = 3, 10 | } 11 | 12 | impl TransactionRecordKind { 13 | pub fn from_u8(value: u8) -> QuillSQLResult<Self> { 14 | match value { 15 | 1 => Ok(TransactionRecordKind::Begin), 16 | 2 => Ok(TransactionRecordKind::Commit), 17 | 3 => Ok(TransactionRecordKind::Abort), 18 | other => Err(QuillSQLError::Internal(format!( 19 | "Unknown transaction record kind: {}", 20 | other 21 | ))), 22 | } 23 | } 24 | } 25 | 26 | #[derive(Debug, Clone)] 27 | pub struct TransactionPayload { 28 | pub marker: TransactionRecordKind, 29 | pub txn_id: TransactionId, 30 | } 31 | 32 | pub fn encode_transaction(body: &TransactionPayload) -> (u8, Vec<u8>) { 33 | let mut buf = Vec::with_capacity(8); 34 | buf.extend_from_slice(&body.txn_id.to_le_bytes()); 35 | (body.marker as u8, buf) 36 | } 37 | 38 | pub fn decode_transaction(bytes: &[u8], info: u8) -> QuillSQLResult<TransactionPayload> { 39 | if bytes.len() != 8 { 40 | return Err(QuillSQLError::Internal( 41 | "Transaction payload must be 8 bytes".to_string(), 42 | )); 43 | } 44 | let txn_id = u64::from_le_bytes(bytes[0..8].try_into().unwrap()) as TransactionId; 45 | let marker = TransactionRecordKind::from_u8(info)?; 46 | Ok(TransactionPayload { marker, txn_id }) 47 | } 48 | -------------------------------------------------------------------------------- /docs/src/modules/bin.md: -------------------------------------------------------------------------------- 1 | # Front-Ends (CLI / HTTP) 2 | 3 | The `bin/` directory contains the user-facing entry points. Both binaries embed the same 4 | `Database` type, so they demonstrate how the core engine can power different UIs.
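To make "same engine, different UIs" concrete, here is a minimal embedding sketch. The constructor name and the row type below are assumptions (the docs only promise a `Database::new_*` family and a `run(sql)` entry point), so treat this as an illustration rather than the crate's confirmed API:

```rust
// Hedged sketch: how a front-end might embed the engine.
use quill_sql::database::Database; // module path assumed

fn main() -> Result<(), Box<dyn std::error::Error>> {
    // `new_on_disk` is a hypothetical stand-in for the `Database::new_*` family.
    let mut db = Database::new_on_disk("quill.db")?;
    // Both the CLI and the HTTP server funnel user input through this one call.
    for row in db.run("SELECT 1")? {
        println!("{row:?}");
    }
    Ok(())
}
```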
5 | 6 | | Binary | Purpose | 7 | | ------ | ------- | 8 | | `client.rs` | Interactive CLI (REPL) that reads SQL, executes it, and prints tabular output. | 9 | | `server.rs` | HTTP + JSON API for integration tests or web UIs. | 10 | 11 | --- 12 | 13 | ## CLI (`bin/client.rs`) 14 | 15 | - Uses `rustyline` to provide history, multi-line editing, and familiar shell shortcuts. 16 | - Each command calls `database.run(sql)` and formats the resulting `Vec<Tuple>`. 17 | - Supports meta commands (e.g., `.tables`) that expose catalog metadata—great for 18 | teaching how logical objects map to physical handles. 19 | 20 | ## HTTP (`bin/server.rs`) 21 | 22 | - Built with `axum`/`hyper` (depending on the current `Cargo.toml`), exposing endpoints such as: 23 | - `POST /query` – run arbitrary SQL and return rows or an error payload. 24 | - Health/metrics endpoints—which you can extend in labs to surface background worker 25 | status or buffer metrics. 26 | - Configuration comes from `QUILL_DB_FILE`, `QUILL_HTTP_ADDR`, `PORT`, etc., mirroring 27 | how production services inject settings. 28 | 29 | --- 30 | 31 | ## Teaching Ideas 32 | 33 | - Extend the CLI with `\describe table` to practice catalog lookups. 34 | - Add transaction endpoints (BEGIN/COMMIT) to the HTTP server to demonstrate how 35 | `SessionContext` scopes transactions per connection. 36 | - Combine CLI interaction with `RUST_LOG` tracing to walk through the entire query 37 | lifecycle. 38 | -------------------------------------------------------------------------------- /src/recovery/wal/codec/page.rs: -------------------------------------------------------------------------------- 1 | use crate::buffer::PageId; 2 | use crate::error::{QuillSQLError, QuillSQLResult}; 3 | use crate::recovery::Lsn; 4 | 5 | #[derive(Debug, Clone)] 6 | pub struct PageWritePayload { 7 | pub page_id: PageId, 8 | pub prev_page_lsn: Lsn, 9 | pub page_image: Vec<u8>, 10 | } 11 | 12 | pub fn encode_page_write(body: &PageWritePayload) -> Vec<u8> { 13 | // Page/PageWrite (rmid=Page, info=0) 14 | // body: page_id(4) + prev_page_lsn(8) + image_len(4) + page_image[] 15 | let mut buf = Vec::with_capacity(4 + 8 + 4 + body.page_image.len()); 16 | buf.extend_from_slice(&body.page_id.to_le_bytes()); 17 | buf.extend_from_slice(&body.prev_page_lsn.to_le_bytes()); 18 | buf.extend_from_slice(&(body.page_image.len() as u32).to_le_bytes()); 19 | buf.extend_from_slice(&body.page_image); 20 | buf 21 | } 22 | 23 | pub fn decode_page_write(bytes: &[u8]) -> QuillSQLResult<PageWritePayload> { 24 | if bytes.len() < 4 + 8 + 4 { 25 | return Err(QuillSQLError::Internal( 26 | "PageWrite payload too short".to_string(), 27 | )); 28 | } 29 | let page_id = u32::from_le_bytes(bytes[0..4].try_into().unwrap()) as PageId; 30 | let prev_page_lsn = u64::from_le_bytes(bytes[4..12].try_into().unwrap()) as Lsn; 31 | let image_len = u32::from_le_bytes(bytes[12..16].try_into().unwrap()) as usize; 32 | if bytes.len() != 16 + image_len { 33 | return Err(QuillSQLError::Internal( 34 | "PageWrite payload length mismatch".to_string(), 35 | )); 36 | } 37 | let page_image = bytes[16..].to_vec(); 38 | Ok(PageWritePayload { 39 | page_id, 40 | prev_page_lsn, 41 | page_image, 42 | }) 43 | } 44 | -------------------------------------------------------------------------------- /docs/src/modules/tests.md: -------------------------------------------------------------------------------- 1 | # Testing & Documentation 2 | 3 | QuillSQL is intended for teaching, so the repo invests heavily in examples and automated 4 | verification.
The `tests/` tree and this mdBook work together to illustrate every module. 5 | 6 | --- 7 | 8 | ## Test Suite 9 | 10 | | Location | Purpose | 11 | | -------- | ------- | 12 | | `tests/sql_example/*.slt` | [sqllogictest](https://www.sqlite.org/sqllogictest.html) suites covering DDL, DML, transactions, and indexes. | 13 | | `tests/transaction_tests.rs` | Rust unit tests that stress MVCC visibility, lock conflicts, and isolation semantics. | 14 | | `tests/storage_*` | Component tests for heap/index/buffer internals—perfect references for lab exercises. | 15 | 16 | Common commands: 17 | 18 | ```bash 19 | cargo test -q 20 | # focused run 21 | cargo test tests::transaction_tests::repeatable_read_sees_consistent_snapshot_after_update -- --nocapture 22 | ``` 23 | 24 | For long-running suites, wrap with `timeout` to guard against hangs. 25 | 26 | --- 27 | 28 | ## Documentation (mdBook) 29 | 30 | - The `docs/` directory is an mdBook; run `mdbook serve docs` to browse locally. 31 | - Each module, including this page, has a dedicated chapter so instructors can teach 32 | subsystem by subsystem. 33 | - Anchor chapters such as `architecture.md`, `transactions.md`, and `wal.md` walk through 34 | end-to-end flows and subsystem deep dives. 35 | 36 | --- 37 | 38 | ## Teaching Ideas 39 | 40 | - Require sqllogictest additions alongside code changes to reinforce “tests as docs”. 41 | - Use the mdBook site during lectures to connect diagrams with source files. 42 | - Assign “doc walk-through” tasks where students extend diagrams or add experiment 43 | instructions to existing chapters. 44 | -------------------------------------------------------------------------------- /src/plan/logical_planner/plan_insert.rs: -------------------------------------------------------------------------------- 1 | use crate::error::QuillSQLResult; 2 | use std::sync::Arc; 3 | 4 | use crate::plan::logical_plan::{Insert, LogicalPlan, Values}; 5 | 6 | use super::LogicalPlanner; 7 | 8 | impl<'a> LogicalPlanner<'a> { 9 | pub fn plan_insert( 10 | &self, 11 | table_name: &sqlparser::ast::ObjectName, 12 | columns_ident: &Vec<sqlparser::ast::Ident>, 13 | source: &sqlparser::ast::Query, 14 | ) -> QuillSQLResult<LogicalPlan> { 15 | let mut input = self.plan_set_expr(source.body.as_ref())?; 16 | let table = self.bind_table_name(table_name)?; 17 | let table_schema = self.context.catalog.table_heap(&table)?.schema.clone(); 18 | 19 | let projected_schema = if columns_ident.is_empty() { 20 | table_schema.clone() 21 | } else { 22 | let columns: Vec<String> = columns_ident 23 | .iter() 24 | .map(|ident| ident.value.clone()) 25 | .collect(); 26 | let indices = columns 27 | .iter() 28 | .map(|name| table_schema.index_of(Some(&table), name.as_str())) 29 | .collect::<QuillSQLResult<Vec<usize>>>()?; 30 | 31 | Arc::new(table_schema.project(&indices)?) 32 | }; 33 | 34 | if let LogicalPlan::Values(Values { values, ..
}) = input { 35 | input = LogicalPlan::Values(Values { 36 | values, 37 | schema: projected_schema.clone(), 38 | }) 39 | } 40 | 41 | Ok(LogicalPlan::Insert(Insert { 42 | table, 43 | table_schema, 44 | projected_schema, 45 | input: Arc::new(input), 46 | })) 47 | } 48 | } 49 | -------------------------------------------------------------------------------- /src/recovery/wal/writer.rs: -------------------------------------------------------------------------------- 1 | use std::sync::atomic::{AtomicBool, Ordering}; 2 | use std::sync::{Arc, Weak}; 3 | use std::thread; 4 | use std::time::Duration; 5 | 6 | use crate::error::{QuillSQLError, QuillSQLResult}; 7 | 8 | use super::WalManager; 9 | 10 | #[derive(Debug)] 11 | pub(super) struct WalWriterRuntime { 12 | stop_flag: Arc<AtomicBool>, 13 | thread: thread::JoinHandle<()>, 14 | } 15 | 16 | impl WalWriterRuntime { 17 | pub(super) fn spawn(target: Weak<WalManager>, interval: Duration) -> QuillSQLResult<Self> { 18 | let stop_flag = Arc::new(AtomicBool::new(false)); 19 | let thread_stop = stop_flag.clone(); 20 | let handle = thread::Builder::new() 21 | .name("walwriter".into()) 22 | .spawn(move || { 23 | while !thread_stop.load(Ordering::Relaxed) { 24 | if let Some(manager) = target.upgrade() { 25 | let _ = manager.flush(None); 26 | } else { 27 | break; 28 | } 29 | thread::sleep(interval); 30 | } 31 | if let Some(manager) = target.upgrade() { 32 | let _ = manager.flush(None); 33 | } 34 | }) 35 | .map_err(|e| QuillSQLError::Internal(format!("Failed to spawn walwriter: {}", e)))?; 36 | Ok(Self { 37 | stop_flag, 38 | thread: handle, 39 | }) 40 | } 41 | 42 | pub(super) fn stop(self) -> QuillSQLResult<()> { 43 | self.stop_flag.store(true, Ordering::Release); 44 | self.thread 45 | .join() 46 | .map_err(|_| QuillSQLError::Internal("walwriter thread panicked".to_string())) 47 | } 48 | } 49 | -------------------------------------------------------------------------------- /src/storage/page/freelist_page.rs: -------------------------------------------------------------------------------- 1 | use crate::buffer::{PageId, INVALID_PAGE_ID, PAGE_SIZE}; 2 | use crate::storage::codec::{CommonCodec, FreelistPageHeaderCodec}; 3 | use std::sync::LazyLock; 4 | 5 | static EMPTY_FREELIST_PAGE_HEADER: FreelistPageHeader = FreelistPageHeader { 6 | next_page_id: 0, 7 | current_size: 0, 8 | max_size: 0, 9 | }; 10 | 11 | pub static FREELIST_PAGE_MAX_SIZE: LazyLock<usize> = LazyLock::new(|| { 12 | (PAGE_SIZE - FreelistPageHeaderCodec::encode(&EMPTY_FREELIST_PAGE_HEADER).len()) 13 | / CommonCodec::encode_u32(INVALID_PAGE_ID).len() 14 | }); 15 | 16 | #[derive(Debug, Eq, PartialEq)] 17 | pub struct FreelistPage { 18 | pub header: FreelistPageHeader, 19 | pub array: Vec<PageId>, 20 | } 21 | 22 | #[derive(Debug, Eq, PartialEq)] 23 | pub struct FreelistPageHeader { 24 | pub next_page_id: PageId, 25 | pub current_size: u32, 26 | pub max_size: u32, 27 | } 28 | 29 | impl FreelistPage { 30 | pub fn new() -> Self { 31 | Self { 32 | header: FreelistPageHeader { 33 | next_page_id: INVALID_PAGE_ID, 34 | current_size: 0, 35 | max_size: *FREELIST_PAGE_MAX_SIZE as u32, 36 | }, 37 | array: vec![], 38 | } 39 | } 40 | 41 | pub fn is_full(&self) -> bool { 42 | self.header.current_size >= self.header.max_size 43 | } 44 | 45 | pub fn push(&mut self, page_id: PageId) { 46 | self.array.push(page_id); 47 | self.header.current_size += 1; 48 | } 49 | 50 | pub fn pop(&mut self) -> Option<PageId> { 51 | let page_id = self.array.pop(); 52 | if page_id.is_some() { 53 | self.header.current_size -= 1; 54 | } 55 | page_id 56 | } 57 | } 58 |
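The contract above is small: `push` records a reusable page and bumps `current_size`, `pop` hands back the most recently freed page (LIFO), and `is_full` tells the allocator when to chain another freelist page through `next_page_id`. A standalone sketch exercising `FreelistPage` exactly as defined above (the import path is an assumption based on this repository's module tree):

```rust
use quill_sql::storage::page::FreelistPage; // path assumed from the source tree

fn main() {
    let mut freelist = FreelistPage::new();
    freelist.push(7); // page 7 was freed and is now recorded for reuse
    freelist.push(9); // page 9 freed as well
    assert_eq!(freelist.pop(), Some(9)); // LIFO: the most recently freed page comes back first
    assert_eq!(freelist.header.current_size, 1);
    assert!(!freelist.is_full()); // room left before chaining via next_page_id
}
```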
-------------------------------------------------------------------------------- /docs/src/introduction.md: -------------------------------------------------------------------------------- 1 |
2 | ![QuillSQL Logo](./assets/rust-db.png) 3 |
4 | 5 | # QuillSQL Internals 6 | 7 | Welcome to the technical documentation for QuillSQL. 8 | 9 | This book provides a deep dive into the internal architecture and implementation details of the database. It is intended for developers, contributors, and anyone interested in understanding how a relational database is built from the ground up, referencing concepts from classic database courses like CMU 15-445. 10 | 11 | --- 12 | 13 | ## Table of Contents 14 | 15 | * [**Overall Architecture**](./architecture.md): A high-level overview of the entire system. 16 | 17 | * **Core Modules** 18 | * [**Buffer Manager**](./modules/buffer.md): The in-memory page cache. 19 | * [Page & Page Guards](./buffer/page.md) 20 | * [The Buffer Pool](./buffer/buffer_pool.md) 21 | * [**Storage Engine**](./modules/storage.md): How data is physically stored. 22 | * [Disk I/O](./storage/disk_io.md) 23 | * [Page & Tuple Layout](./storage/page_layouts.md) 24 | * [Table Heap & MVCC](./storage/table_heap.md) 25 | * [**Indexes**](./modules/index.md): The B+Tree implementation. 26 | * [B+Tree Details](./index/btree_index.md) 27 | * [**Recovery Manager (WAL)**](./modules/recovery.md): Crash recovery and the ARIES protocol. 28 | * [**Transaction Manager**](./modules/transaction.md): Concurrency control with MVCC and 2PL. 29 | * [**Query Plan**](./modules/plan.md): The journey from SQL to an executable plan. 30 | * [**Query Optimizer**](./modules/optimizer.md): Rule-based plan transformations. 31 | * [**Execution Engine**](./modules/execution.md): The Volcano (iterator) execution model. 32 | -------------------------------------------------------------------------------- /src/plan/logical_plan/util.rs: -------------------------------------------------------------------------------- 1 | use crate::catalog::{ColumnRef, Schema}; 2 | use crate::error::QuillSQLResult; 3 | use crate::expression::{Expr, ExprTrait}; 4 | use crate::plan::logical_plan::JoinType; 5 | use crate::plan::logical_plan::LogicalPlan; 6 | use std::sync::Arc; 7 | 8 | pub fn build_join_schema( 9 | left: &Schema, 10 | right: &Schema, 11 | join_type: JoinType, 12 | ) -> QuillSQLResult<Schema> { 13 | fn nullify_columns(columns: &[ColumnRef]) -> Vec<ColumnRef> { 14 | columns 15 | .iter() 16 | .map(|f| Arc::new(f.as_ref().clone().with_nullable(true))) 17 | .collect() 18 | } 19 | 20 | let left_cols = &left.columns; 21 | let right_cols = &right.columns; 22 | 23 | let columns: Vec<ColumnRef> = match join_type { 24 | JoinType::Inner | JoinType::Cross => { 25 | left_cols.iter().chain(right_cols.iter()).cloned().collect() 26 | } 27 | JoinType::LeftOuter => left_cols 28 | .iter() 29 | .chain(&nullify_columns(right_cols)) 30 | .cloned() 31 | .collect(), 32 | JoinType::RightOuter => nullify_columns(left_cols) 33 | .iter() 34 | .chain(right_cols.iter()) 35 | .cloned() 36 | .collect(), 37 | JoinType::FullOuter => nullify_columns(left_cols) 38 | .iter() 39 | .chain(&nullify_columns(right_cols)) 40 | .cloned() 41 | .collect(), 42 | }; 43 | Ok(Schema { columns }) 44 | } 45 | 46 | pub fn project_schema(input: &LogicalPlan, exprs: &[Expr]) -> QuillSQLResult<Schema> { 47 | let input_schema = &input.schema(); 48 | let mut columns = vec![]; 49 | for expr in exprs { 50 | columns.push(expr.to_column(input_schema)?)
51 | } 52 | Ok(Schema::new(columns)) 53 | } 54 | -------------------------------------------------------------------------------- /src/plan/logical_planner/plan_update.rs: -------------------------------------------------------------------------------- 1 | use crate::error::{QuillSQLError, QuillSQLResult}; 2 | use crate::plan::logical_plan::{LogicalPlan, Update}; 3 | use crate::plan::LogicalPlanner; 4 | use std::collections::HashMap; 5 | 6 | impl<'a> LogicalPlanner<'a> { 7 | pub fn plan_update( 8 | &self, 9 | table: &sqlparser::ast::TableWithJoins, 10 | assignments: &[sqlparser::ast::Assignment], 11 | selection: &Option<sqlparser::ast::Expr>, 12 | ) -> QuillSQLResult<LogicalPlan> { 13 | let table_ref = match &table.relation { 14 | sqlparser::ast::TableFactor::Table { name, .. } => self.bind_table_name(name)?, 15 | _ => { 16 | return Err(QuillSQLError::Plan(format!( 17 | "table {} is not supported", 18 | table 19 | ))) 20 | } 21 | }; 22 | 23 | let table_schema = self.context.catalog.table_heap(&table_ref)?.schema.clone(); 24 | 25 | let mut assignment_map = HashMap::new(); 26 | for assign in assignments { 27 | let column_ident = assign.id.get(0).ok_or(QuillSQLError::Plan(format!( 28 | "Assignment {} is not supported", 29 | assign 30 | )))?; 31 | let column_name = column_ident.value.to_ascii_lowercase(); 32 | let value = self.bind_expr(&assign.value)?; 33 | assignment_map.insert(column_name, value); 34 | } 35 | 36 | let selection = match selection { 37 | Some(e) => Some(self.bind_expr(e)?), 38 | None => None, 39 | }; 40 | 41 | Ok(LogicalPlan::Update(Update { 42 | table: table_ref, 43 | table_schema, 44 | assignments: assignment_map, 45 | selection, 46 | })) 47 | } 48 | } 49 | -------------------------------------------------------------------------------- /src/execution/physical_plan/create_index.rs: -------------------------------------------------------------------------------- 1 | use crate::catalog::{SchemaRef, EMPTY_SCHEMA_REF}; 2 | use crate::error::QuillSQLError; 3 | use crate::expression::{ColumnExpr, Expr}; 4 | use crate::plan::logical_plan::OrderByExpr; 5 | use crate::storage::tuple::Tuple; 6 | use crate::utils::table_ref::TableReference; 7 | use crate::{ 8 | error::QuillSQLResult, 9 | execution::{ExecutionContext, VolcanoExecutor}, 10 | }; 11 | use std::sync::Arc; 12 | 13 | #[derive(Debug, derive_new::new)] 14 | pub struct PhysicalCreateIndex { 15 | pub name: String, 16 | pub table: TableReference, 17 | pub table_schema: SchemaRef, 18 | pub columns: Vec<OrderByExpr>, 19 | } 20 | 21 | impl VolcanoExecutor for PhysicalCreateIndex { 22 | fn next(&self, context: &mut ExecutionContext) -> QuillSQLResult<Option<Tuple>> { 23 | let mut key_indices = vec![]; 24 | for col in self.columns.iter() { 25 | match col.expr.as_ref() { 26 | Expr::Column(ColumnExpr { name, ..
}) => { 27 | key_indices.push(self.table_schema.index_of(None, name)?); 28 | } 29 | _ => { 30 | return Err(QuillSQLError::Execution(format!( 31 | "The expr should be column instead of {}", 32 | col.expr 33 | ))) 34 | } 35 | } 36 | } 37 | let key_schema = Arc::new(self.table_schema.project(&key_indices)?); 38 | context 39 | .catalog 40 | .create_index(self.name.clone(), &self.table, key_schema)?; 41 | Ok(None) 42 | } 43 | fn output_schema(&self) -> SchemaRef { 44 | EMPTY_SCHEMA_REF.clone() 45 | } 46 | } 47 | 48 | impl std::fmt::Display for PhysicalCreateIndex { 49 | fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { 50 | write!(f, "CreateIndex: {}", self.name) 51 | } 52 | } 53 | -------------------------------------------------------------------------------- /src/recovery/analysis.rs: -------------------------------------------------------------------------------- 1 | use crate::error::QuillSQLResult; 2 | use crate::recovery::control_file::ControlFileSnapshot; 3 | use crate::recovery::wal::codec::{ 4 | decode_checkpoint, CheckpointPayload, ResourceManagerId, WalFrame, 5 | }; 6 | use crate::recovery::Lsn; 7 | 8 | #[derive(Debug, Default, Clone)] 9 | pub struct AnalysisResult { 10 | pub start_lsn: Lsn, 11 | pub has_frames: bool, 12 | } 13 | 14 | pub struct AnalysisPass { 15 | latest: Option<(Lsn, CheckpointPayload)>, 16 | snapshot: Option<ControlFileSnapshot>, 17 | has_frames: bool, 18 | } 19 | 20 | impl AnalysisPass { 21 | pub fn new(snapshot: Option<ControlFileSnapshot>) -> Self { 22 | Self { 23 | latest: None, 24 | snapshot, 25 | has_frames: false, 26 | } 27 | } 28 | 29 | pub fn observe(&mut self, frame: &WalFrame) { 30 | self.has_frames = true; 31 | if frame.rmid == ResourceManagerId::Checkpoint { 32 | if let Ok(payload) = decode_checkpoint(&frame.body) { 33 | self.latest = Some((frame.lsn, payload)); 34 | } 35 | } 36 | } 37 | 38 | pub fn finalize(self) -> QuillSQLResult<AnalysisResult> { 39 | let start_lsn = if let Some((checkpoint_lsn, payload)) = &self.latest { 40 | self.snapshot 41 | .map(|snap| snap.checkpoint_redo_start) 42 | .filter(|redo| *redo >= payload.last_lsn && *redo <= *checkpoint_lsn) 43 | .unwrap_or_else(|| { 44 | payload 45 | .dpt 46 | .iter() 47 | .map(|(_, lsn)| *lsn) 48 | .min() 49 | .unwrap_or(payload.last_lsn) 50 | }) 51 | } else { 52 | 0 53 | }; 54 | 55 | Ok(AnalysisResult { 56 | start_lsn, 57 | has_frames: self.has_frames, 58 | }) 59 | } 60 | } 61 | -------------------------------------------------------------------------------- /src/recovery/wal_record.rs: -------------------------------------------------------------------------------- 1 | use crate::recovery::wal::codec; 2 | use crate::recovery::Lsn; 3 | 4 | pub use crate::recovery::wal::codec::{ 5 | decode_checkpoint, decode_clr, decode_frame, decode_page_write, 6 | decode_payload as decode_wal_payload, decode_transaction, encode_frame, 7 | heap_record_kind_to_info, CheckpointPayload, ClrPayload, PageWritePayload, ResourceManagerId, 8 | TransactionPayload, TransactionRecordKind, WalFrame, WAL_CRC_LEN, WAL_HEADER_LEN, WAL_MAGIC, 9 | WAL_VERSION, WAL_VERSION_V1, 10 | }; 11 | 12 | pub use crate::storage::heap::wal_codec::{ 13 | decode_heap_record as decode_heap, encode_heap_record as encode_heap, HeapDeletePayload, 14 | HeapInsertPayload, HeapRecordKind, HeapRecordPayload, RelationIdent, TupleMetaRepr, 15 | }; 16 | pub use crate::storage::index::wal_codec::{ 17 | decode_index_record as decode_index, encode_index_record as encode_index, 18 | IndexInternalEntryPayload, IndexInternalMergePayload, IndexInternalRedistributePayload, 19 |
IndexInternalSplitPayload, IndexLeafDeletePayload, IndexLeafInsertPayload, 20 | IndexLeafMergePayload, IndexLeafRedistributePayload, IndexLeafSplitEntryPayload, 21 | IndexLeafSplitPayload, IndexParentDeletePayload, IndexParentInsertPayload, 22 | IndexParentUpdatePayload, IndexRecordPayload, IndexRelationIdent, IndexRootAdoptPayload, 23 | IndexRootInstallInternalPayload, IndexRootInstallLeafPayload, IndexRootResetPayload, 24 | }; 25 | 26 | #[derive(Debug, Clone)] 27 | pub enum WalRecordPayload { 28 | PageWrite(PageWritePayload), 29 | Transaction(TransactionPayload), 30 | Heap(HeapRecordPayload), 31 | Index(IndexRecordPayload), 32 | Checkpoint(CheckpointPayload), 33 | /// Compensation log record: documents an UNDO action; redo is a no-op. 34 | Clr(ClrPayload), 35 | } 36 | 37 | impl WalRecordPayload { 38 | pub fn encode(&self, lsn: Lsn, prev_lsn: Lsn) -> Vec<u8> { 39 | codec::encode_frame(lsn, prev_lsn, self) 40 | } 41 | } 42 | -------------------------------------------------------------------------------- /src/expression/column.rs: -------------------------------------------------------------------------------- 1 | use crate::catalog::Schema; 2 | use crate::catalog::{Column, DataType}; 3 | use crate::error::QuillSQLResult; 4 | use crate::expression::ExprTrait; 5 | use crate::storage::tuple::Tuple; 6 | use crate::utils::scalar::ScalarValue; 7 | use crate::utils::table_ref::TableReference; 8 | 9 | /// A named reference to a qualified field in a schema. 10 | #[derive(Debug, Clone, PartialEq, Eq, Hash, PartialOrd, Ord)] 11 | pub struct ColumnExpr { 12 | /// relation/table reference. 13 | pub relation: Option<TableReference>, 14 | /// field/column name. 15 | pub name: String, 16 | } 17 | 18 | impl ExprTrait for ColumnExpr { 19 | fn data_type(&self, input_schema: &Schema) -> QuillSQLResult<DataType> { 20 | let column = input_schema.column_with_name(self.relation.as_ref(), &self.name)?; 21 | Ok(column.data_type) 22 | } 23 | 24 | fn nullable(&self, input_schema: &Schema) -> QuillSQLResult<bool> { 25 | let column = input_schema.column_with_name(self.relation.as_ref(), &self.name)?; 26 | Ok(column.nullable) 27 | } 28 | 29 | fn evaluate(&self, tuple: &Tuple) -> QuillSQLResult<ScalarValue> { 30 | tuple 31 | .value_by_name(self.relation.as_ref(), &self.name) 32 | .cloned() 33 | } 34 | 35 | fn to_column(&self, input_schema: &Schema) -> QuillSQLResult<Column> { 36 | let column = input_schema.column_with_name(self.relation.as_ref(), &self.name)?; 37 | Ok(Column::new( 38 | self.name.clone(), 39 | self.data_type(input_schema)?, 40 | self.nullable(input_schema)?, 41 | ) 42 | .with_relation(self.relation.clone().or(column.relation.clone()))) 43 | } 44 | } 45 | 46 | impl std::fmt::Display for ColumnExpr { 47 | fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { 48 | if let Some(relation) = self.relation.as_ref() { 49 | write!(f, "{}.", relation)?; 50 | } 51 | write!(f, "{}", self.name) 52 | } 53 | } 54 | -------------------------------------------------------------------------------- /src/execution/physical_plan/values.rs: -------------------------------------------------------------------------------- 1 | use std::sync::atomic::{AtomicU32, Ordering}; 2 | 3 | use crate::catalog::SchemaRef; 4 | use crate::expression::Expr; 5 | use crate::storage::tuple::{Tuple, EMPTY_TUPLE}; 6 | use crate::utils::scalar::ScalarValue; 7 | use crate::{ 8 | error::QuillSQLResult, 9 | execution::{ExecutionContext, VolcanoExecutor}, 10 | }; 11 | 12 | #[derive(Debug)] 13 | pub struct PhysicalValues { 14 | pub schema: SchemaRef, 15 | pub rows: Vec<Vec<Expr>>, 16 | 17 | cursor:
AtomicU32, 18 | } 19 | impl PhysicalValues { 20 | pub fn new(schema: SchemaRef, rows: Vec<Vec<Expr>>) -> Self { 21 | PhysicalValues { 22 | schema, 23 | rows, 24 | cursor: AtomicU32::new(0), 25 | } 26 | } 27 | } 28 | impl VolcanoExecutor for PhysicalValues { 29 | fn next(&self, context: &mut ExecutionContext) -> QuillSQLResult<Option<Tuple>> { 30 | let cursor = self.cursor.fetch_add(1, Ordering::SeqCst) as usize; 31 | if cursor < self.rows.len() { 32 | let values = self.rows[cursor] 33 | .iter() 34 | .map(|e| context.eval_expr(e, &EMPTY_TUPLE)) 35 | .collect::<QuillSQLResult<Vec<ScalarValue>>>()?; 36 | debug_assert_eq!(self.schema.column_count(), values.len()); 37 | 38 | let casted_values = values 39 | .iter() 40 | .zip(self.schema.columns.iter()) 41 | .map(|(val, col)| val.cast_to(&col.data_type)) 42 | .collect::<QuillSQLResult<Vec<ScalarValue>>>()?; 43 | 44 | Ok(Some(Tuple::new(self.output_schema(), casted_values))) 45 | } else { 46 | Ok(None) 47 | } 48 | } 49 | 50 | fn output_schema(&self) -> SchemaRef { 51 | self.schema.clone() 52 | } 53 | } 54 | 55 | impl std::fmt::Display for PhysicalValues { 56 | fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { 57 | write!(f, "Values: rows={}", self.rows.len()) 58 | } 59 | } 60 | -------------------------------------------------------------------------------- /src/utils/bitmap.rs: -------------------------------------------------------------------------------- 1 | #[derive(Debug, Clone, Eq, PartialEq)] 2 | pub struct DynamicBitmap { 3 | map: Vec<u8>, 4 | } 5 | 6 | impl DynamicBitmap { 7 | pub fn new() -> Self { 8 | Self { map: Vec::new() } 9 | } 10 | 11 | pub fn set(&mut self, index: usize, value: bool) { 12 | let byte_idx = index >> 3; // idx / 8 13 | if byte_idx >= self.map.len() { 14 | self.map.extend(vec![0; byte_idx - self.map.len() + 1]) 15 | } 16 | let offset = index & 0b111; // idx % 8 17 | let mut byte = self.map[byte_idx]; 18 | 19 | let curval = (byte >> (7 - offset)) & 1; 20 | let mask = if value { 1 ^ curval } else { curval }; 21 | byte ^= mask << (7 - offset); // Bit flipping 22 | self.map[byte_idx] = byte; 23 | } 24 | 25 | pub fn get(&self, index: usize) -> Option<bool> { 26 | // Bounds check against the number of stored bits (len * 8, i.e. len << 3). 27 | if index >= self.map.len() << 3 { 28 | return None; 29 | } 30 | let byte_idx = index >> 3; // idx / 8 31 | let offset = index & 0b111; // idx % 8 32 | let byte = self.map[byte_idx]; 33 | Some((byte >> (7 - offset)) & 1 == 1) 34 | } 35 | 36 | pub fn to_bytes(&self) -> Vec<u8> { 37 | self.map.clone() 38 | } 39 | 40 | pub fn from_bytes(bytes: &[u8]) -> Self { 41 | Self { 42 | map: bytes.to_vec(), 43 | } 44 | } 45 | } 46 | 47 | #[cfg(test)] 48 | mod tests { 49 | use crate::utils::bitmap::DynamicBitmap; 50 | 51 | #[test] 52 | fn dynamic_bitmap() { 53 | let mut bitmap = DynamicBitmap::new(); 54 | assert_eq!(bitmap.get(0), None); 55 | 56 | bitmap.set(3, true); 57 | assert_eq!(bitmap.map.len(), 1); 58 | 59 | bitmap.set(10, true); 60 | assert_eq!(bitmap.map.len(), 2); 61 | 62 | assert_eq!(bitmap.get(0), Some(false)); 63 | assert_eq!(bitmap.get(3), Some(true)); 64 | assert_eq!(bitmap.get(10), Some(true)); 65 | 66 | let new_bitmap = DynamicBitmap::from_bytes(&bitmap.to_bytes()); 67 | assert_eq!(new_bitmap, bitmap); 68 | } 69 | } 70 | -------------------------------------------------------------------------------- /src/expression/aggregate.rs: -------------------------------------------------------------------------------- 1 | use crate::catalog::{Column, DataType, Schema}; 2 | use crate::error::{QuillSQLError, QuillSQLResult}; 3 | use crate::expression::{Expr, ExprTrait}; 4 | use crate::function::AggregateFunctionKind; 5 | use
crate::storage::tuple::Tuple; 6 | use crate::utils::scalar::ScalarValue; 7 | use std::fmt::Debug; 8 | 9 | #[derive(Clone, PartialEq, Eq, Debug)] 10 | pub struct AggregateFunction { 11 | /// the function kind 12 | pub func_kind: AggregateFunctionKind, 13 | /// List of expressions to feed to the functions as arguments 14 | pub args: Vec<Expr>, 15 | /// Whether this is a DISTINCT aggregation or not 16 | pub distinct: bool, 17 | } 18 | 19 | impl ExprTrait for AggregateFunction { 20 | fn data_type(&self, _input_schema: &Schema) -> QuillSQLResult<DataType> { 21 | match self.func_kind { 22 | AggregateFunctionKind::Count => Ok(DataType::Int64), 23 | AggregateFunctionKind::Avg => Ok(DataType::Float64), 24 | } 25 | } 26 | 27 | fn nullable(&self, _input_schema: &Schema) -> QuillSQLResult<bool> { 28 | Ok(true) 29 | } 30 | 31 | fn evaluate(&self, tuple: &Tuple) -> QuillSQLResult<ScalarValue> { 32 | match self.func_kind { 33 | AggregateFunctionKind::Count | AggregateFunctionKind::Avg => { 34 | let expr = self.args.first().ok_or(QuillSQLError::Internal(format!( 35 | "aggregate function {} should have one arg instead of {:?}", 36 | self.func_kind, self.args 37 | )))?; 38 | expr.evaluate(tuple) 39 | } 40 | } 41 | } 42 | 43 | fn to_column(&self, input_schema: &Schema) -> QuillSQLResult<Column> { 44 | Ok(Column::new( 45 | format!("{}", self), 46 | self.data_type(input_schema)?, 47 | self.nullable(input_schema)?, 48 | )) 49 | } 50 | } 51 | 52 | impl std::fmt::Display for AggregateFunction { 53 | fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { 54 | write!(f, "{}", self.func_kind) 55 | } 56 | } 57 | -------------------------------------------------------------------------------- /src/execution/physical_plan/scan.rs: -------------------------------------------------------------------------------- 1 | //! Shared prefetch buffer utilities for scan operators.
2 | 3 | use std::cell::RefCell; 4 | use std::collections::VecDeque; 5 | 6 | use crate::error::QuillSQLResult; 7 | use crate::storage::page::{RecordId, TupleMeta}; 8 | use crate::storage::tuple::Tuple; 9 | 10 | pub type ScanEntry = (RecordId, TupleMeta, Tuple); 11 | 12 | #[derive(Debug)] 13 | pub struct ScanPrefetch { 14 | buffer: RefCell<VecDeque<ScanEntry>>, 15 | batch_size: usize, 16 | } 17 | 18 | impl ScanPrefetch { 19 | pub fn new(batch_size: usize) -> Self { 20 | Self { 21 | buffer: RefCell::new(VecDeque::new()), 22 | batch_size, 23 | } 24 | } 25 | 26 | pub fn pop_front(&self) -> Option<ScanEntry> { 27 | self.buffer.borrow_mut().pop_front() 28 | } 29 | 30 | pub fn clear(&self) { 31 | self.buffer.borrow_mut().clear(); 32 | } 33 | 34 | pub fn refill<F>(&self, mut producer: F) -> QuillSQLResult<bool> 35 | where 36 | F: FnMut(usize, &mut VecDeque<ScanEntry>) -> QuillSQLResult<()>, 37 | { 38 | let mut fetched = VecDeque::with_capacity(self.batch_size); 39 | producer(self.batch_size, &mut fetched)?; 40 | if fetched.is_empty() { 41 | return Ok(false); 42 | } 43 | self.buffer.borrow_mut().extend(fetched); 44 | Ok(true) 45 | } 46 | } 47 | 48 | #[cfg(test)] 49 | mod tests { 50 | use super::*; 51 | 52 | #[test] 53 | fn prefetch_refill_and_pop() { 54 | let prefetch = ScanPrefetch::new(2); 55 | let rid = RecordId::new(1, 0); 56 | let meta = TupleMeta::new(1, 0); 57 | let tuple = Tuple::empty(crate::catalog::EMPTY_SCHEMA_REF.clone()); 58 | let produced = prefetch 59 | .refill(|_, out| { 60 | out.push_back((rid, meta, tuple.clone())); 61 | Ok(()) 62 | }) 63 | .expect("refill should succeed"); 64 | assert!(produced); 65 | assert!(prefetch.pop_front().is_some()); 66 | assert!(prefetch.pop_front().is_none()); 67 | } 68 | } 69 | -------------------------------------------------------------------------------- /src/sql/parser/mod.rs: -------------------------------------------------------------------------------- 1 | use crate::error::QuillSQLResult; 2 | use sqlparser::{ast::Statement, dialect::PostgreSqlDialect, parser::Parser}; 3 | 4 | pub fn parse_sql(sql: &str) -> QuillSQLResult<Vec<Statement>> { 5 | // Lightweight rewrite for unsupported SHOW syntax under Postgres dialect 6 | // Maps to information_schema queries to keep planner/executor simple. 7 | let normalized = sql.trim().trim_end_matches(';').trim(); 8 | let lower = normalized.to_ascii_lowercase(); 9 | 10 | let rewritten = if lower == "show databases" || lower == "show database" { 11 | // List schemas (databases) from information_schema.schemas 12 | Some("select schema from information_schema.schemas".to_string()) 13 | } else if lower == "show tables" { 14 | // List all tables 15 | Some("select table_name from information_schema.tables".to_string()) 16 | } else if lower.starts_with("set transaction") { 17 | let rest = normalized["set transaction".len()..].trim_start(); 18 | Some(format!("SET TRANSACTION {}", rest)) 19 | } else if lower.starts_with("set session transaction") { 20 | let rest = normalized["set session transaction".len()..].trim_start(); 21 | Some(format!( 22 | "SET SESSION CHARACTERISTICS AS TRANSACTION {}", 23 | rest 24 | )) 25 | } else { 26 | None 27 | }; 28 | 29 | let sql_to_parse = rewritten.as_deref().unwrap_or(normalized); 30 | let stmts = Parser::parse_sql(&PostgreSqlDialect {}, sql_to_parse)?; 31 | for stmt in &stmts { 32 | match stmt { 33 | Statement::StartTransaction { .. } 34 | | Statement::Commit { .. } 35 | | Statement::Rollback { .. } 36 | | Statement::SetTransaction { ..
} => {} 37 | _ => {} 38 | } 39 | } 40 | Ok(stmts) 41 | } 42 | 43 | #[cfg(test)] 44 | mod tests { 45 | 46 | #[test] 47 | pub fn test_parser() { 48 | let sql = "select * from (select * from t1)"; 49 | let stmts = super::parse_sql(sql).unwrap(); 50 | println!("{:#?}", stmts[0]); 51 | } 52 | } 53 | -------------------------------------------------------------------------------- /docs/src/modules/background.md: -------------------------------------------------------------------------------- 1 | # Background Services 2 | 3 | `src/background/` hosts the asynchronous workers that keep a database healthy: WAL 4 | writers, checkpoints, buffer flushers, and MVCC vacuum. A central registry makes it easy 5 | to start/stop workers together—ideal for teaching how background maintenance supports 6 | foreground queries. 7 | 8 | --- 9 | 10 | ## Responsibilities 11 | 12 | - Start workers according to configuration (`WalOptions`, `MvccVacuumConfig`, etc.). 13 | - Define lightweight traits (`CheckpointWal`, `BufferMaintenance`, `TxnSnapshotOps`) so 14 | workers can run without pulling in an async runtime. 15 | - Provide `BackgroundWorkers`, a registry that tracks `WorkerHandle`s and shuts them down 16 | when `Database` drops. 17 | 18 | --- 19 | 20 | ## Built-in Workers 21 | 22 | | Worker | Trigger | Behavior | 23 | | ------ | ------- | -------- | 24 | | WAL writer | `wal_writer_interval_ms` | Calls `WalManager::background_flush` to durably write log buffers. | 25 | | Checkpoint | `checkpoint_interval_ms` | Captures dirty page / active txn tables and emits `Checkpoint` records to bound recovery. | 26 | | Buffer writer | `bg_writer_interval` | Flushes dirty frames to reduce checkpoint pressure. | 27 | | MVCC vacuum | `MvccVacuumConfig` | Removes obsolete tuple versions once `safe_xmin` advances. | 28 | 29 | Every worker registers itself with `BackgroundWorkers`; `shutdown_all()` ensures threads 30 | exit cleanly during tests or process teardown. 31 | 32 | --- 33 | 34 | ## Interactions 35 | 36 | - **WalManager** – WAL writer and checkpoint workers operate on `Arc`. 37 | - **BufferManager** – background flushers inspect dirty frames and help checkpoints 38 | capture consistent snapshots. 39 | - **TransactionManager** – MVCC vacuum queries `TxnSnapshotOps` for `safe_xmin`. 40 | 41 | --- 42 | 43 | ## Teaching Ideas 44 | 45 | - Tune `MvccVacuumConfig::batch_limit` and chart how quickly old tuple versions disappear. 46 | - Disable a worker in tests to show why unflushed WAL or missing checkpoints lengthen 47 | recovery. 48 | - Enable `RUST_LOG=background=info` to trace how these tasks complement foreground load. 
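All of the built-in workers listed above follow the same stop-flag loop that `WalWriterRuntime` implements in `src/recovery/wal/writer.rs`: spawn a named thread, tick on an interval, and join on shutdown. A minimal, self-contained sketch of that pattern; the `WorkerHandle` here is illustrative and not the crate's actual registry type:

```rust
use std::sync::atomic::{AtomicBool, Ordering};
use std::sync::Arc;
use std::thread;
use std::time::Duration;

// Illustrative handle; the real crate tracks these inside BackgroundWorkers.
struct WorkerHandle {
    stop: Arc<AtomicBool>,
    thread: thread::JoinHandle<()>,
}

impl WorkerHandle {
    fn spawn(interval: Duration, mut tick: impl FnMut() + Send + 'static) -> Self {
        let stop = Arc::new(AtomicBool::new(false));
        let flag = stop.clone();
        let thread = thread::spawn(move || {
            while !flag.load(Ordering::Relaxed) {
                tick(); // e.g. flush WAL, write back dirty pages, vacuum old versions
                thread::sleep(interval);
            }
        });
        Self { stop, thread }
    }

    fn shutdown(self) {
        // Mirrors shutdown_all(): raise the flag, then join the thread.
        self.stop.store(true, Ordering::Release);
        let _ = self.thread.join();
    }
}

fn main() {
    let worker = WorkerHandle::spawn(Duration::from_millis(50), || println!("tick"));
    thread::sleep(Duration::from_millis(200));
    worker.shutdown();
}
```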
49 | -------------------------------------------------------------------------------- /src/execution/physical_plan/limit.rs: -------------------------------------------------------------------------------- 1 | use std::sync::atomic::AtomicUsize; 2 | use std::sync::Arc; 3 | 4 | use crate::catalog::SchemaRef; 5 | use crate::{ 6 | error::QuillSQLResult, 7 | execution::{ExecutionContext, VolcanoExecutor}, 8 | storage::tuple::Tuple, 9 | }; 10 | 11 | use super::PhysicalPlan; 12 | 13 | #[derive(Debug)] 14 | pub struct PhysicalLimit { 15 | pub limit: Option<usize>, 16 | pub offset: usize, 17 | pub input: Arc<PhysicalPlan>, 18 | 19 | cursor: AtomicUsize, 20 | } 21 | impl PhysicalLimit { 22 | pub fn new(limit: Option<usize>, offset: usize, input: Arc<PhysicalPlan>) -> Self { 23 | PhysicalLimit { 24 | limit, 25 | offset, 26 | input, 27 | cursor: AtomicUsize::new(0), 28 | } 29 | } 30 | } 31 | impl VolcanoExecutor for PhysicalLimit { 32 | fn init(&self, context: &mut ExecutionContext) -> QuillSQLResult<()> { 33 | self.input.init(context)?; 34 | self.cursor.store(0, std::sync::atomic::Ordering::SeqCst); 35 | Ok(()) 36 | } 37 | fn next(&self, context: &mut ExecutionContext) -> QuillSQLResult<Option<Tuple>> { 38 | loop { 39 | let next_tuple = self.input.next(context)?; 40 | if next_tuple.is_none() { 41 | return Ok(None); 42 | } 43 | let cursor = self 44 | .cursor 45 | .fetch_add(1, std::sync::atomic::Ordering::SeqCst); 46 | if cursor < self.offset { 47 | continue; 48 | } 49 | return if let Some(limit) = self.limit { 50 | if cursor < self.offset + limit { 51 | Ok(next_tuple) 52 | } else { 53 | Ok(None) 54 | } 55 | } else { 56 | Ok(next_tuple) 57 | }; 58 | } 59 | } 60 | 61 | fn output_schema(&self) -> SchemaRef { 62 | self.input.output_schema() 63 | } 64 | } 65 | 66 | impl std::fmt::Display for PhysicalLimit { 67 | fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { 68 | write!(f, "Limit") 69 | } 70 | } 71 | -------------------------------------------------------------------------------- /src/execution/physical_plan/drop_table.rs: -------------------------------------------------------------------------------- 1 | use crate::catalog::{SchemaRef, EMPTY_SCHEMA_REF}; 2 | use crate::error::{QuillSQLError, QuillSQLResult}; 3 | use crate::execution::{ExecutionContext, VolcanoExecutor}; 4 | use crate::storage::tuple::Tuple; 5 | use crate::transaction::LockMode; 6 | use crate::utils::table_ref::TableReference; 7 | 8 | #[derive(Debug)] 9 | pub struct PhysicalDropTable { 10 | table: TableReference, 11 | if_exists: bool, 12 | } 13 | 14 | impl PhysicalDropTable { 15 | pub fn new(table: TableReference, if_exists: bool) -> Self { 16 | Self { table, if_exists } 17 | } 18 | 19 | fn qualified_name(&self) -> String { 20 | self.table.to_string() 21 | } 22 | } 23 | 24 | impl VolcanoExecutor for PhysicalDropTable { 25 | fn init(&self, _context: &mut ExecutionContext) -> QuillSQLResult<()> { 26 | Ok(()) 27 | } 28 | 29 | fn next(&self, context: &mut ExecutionContext) -> QuillSQLResult<Option<Tuple>> { 30 | if context.catalog.try_table_heap(&self.table).is_none() { 31 | if self.if_exists { 32 | return Ok(None); 33 | } 34 | return Err(QuillSQLError::Execution(format!( 35 | "table {} does not exist", 36 | self.qualified_name() 37 | ))); 38 | } 39 | 40 | context 41 | .txn_ctx() 42 | .ensure_writable(&self.table, "DROP TABLE")?; 43 | context 44 | .txn_ctx_mut() 45 | .lock_table(self.table.clone(), LockMode::Exclusive)?; 46 | 47 | let dropped = context.catalog.drop_table(&self.table)?; 48 | if !dropped && !self.if_exists { 49 | return Err(QuillSQLError::Execution(format!( 50 | "table {}
does not exist", 51 | self.qualified_name() 52 | ))); 53 | } 54 | 55 | Ok(None) 56 | } 57 | 58 | fn output_schema(&self) -> SchemaRef { 59 | EMPTY_SCHEMA_REF.clone() 60 | } 61 | } 62 | 63 | impl std::fmt::Display for PhysicalDropTable { 64 | fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { 65 | write!(f, "DropTable: {}", self.table) 66 | } 67 | } 68 | -------------------------------------------------------------------------------- /docs/src/buffer/page.md: -------------------------------------------------------------------------------- 1 | # Page & Page Guards 2 | 3 | Before the Buffer Manager can hand out a reference to a page in memory, it must ensure that the page won't be evicted while it's being used by another thread. This is accomplished by **pinning**. 4 | 5 | ## Pinning 6 | 7 | Pinning simply means incrementing a "pin count" associated with the page's frame in the buffer pool. A frame with a pin count greater than zero is forbidden from being chosen as a victim by the page replacer. 8 | 9 | - When a thread wants to use a page, it must first pin it. 10 | - When the thread is finished with the page, it must **unpin** it (decrementing the count). 11 | 12 | Manually managing pin counts is tedious and error-prone. Forgetting to unpin a page leads to a memory leak, as the frame can never be evicted. To solve this, QuillSQL uses a common and powerful C++ and Rust pattern: **Resource Acquisition Is Initialization (RAII)**. 13 | 14 | ## `ReadPageGuard` and `WritePageGuard` 15 | 16 | Instead of returning a raw pointer to the page memory, the `BufferManager`'s `fetch_page_*` methods return a **guard** object: `ReadPageGuard` or `WritePageGuard`. 17 | 18 | These guards are responsible for the lifetime of the pin and the lock on the page: 19 | 20 | 1. **Acquisition**: When a `PageGuard` is created, its constructor acquires the appropriate lock (`RwLock`) on the page's frame and increments the frame's pin count. 21 | - `ReadPageGuard` takes a read lock, allowing multiple concurrent readers. 22 | - `WritePageGuard` takes an exclusive write lock. 23 | 24 | 2. **Usage**: The calling code uses the guard object to access the page's data. The guard provides safe, locked access to the underlying byte array. 25 | 26 | 3. **Release**: When the guard variable goes out of scope (e.g., at the end of a function), its `drop()` method is automatically called by the Rust compiler. This `drop()` implementation handles all the cleanup: 27 | - It decrements the pin count. 28 | - It releases the lock on the frame. 29 | - If it's a `WritePageGuard` and the data was modified, it informs the `BufferManager` that the page is now **dirty**. 30 | 31 | This RAII pattern makes using the buffer pool much safer and more ergonomic, as it makes it impossible to forget to unpin a page or release a lock. 32 | -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | 2 | [package] 3 | name = "quill-sql" 4 | version = "0.2.0" 5 | edition = "2021" 6 | description = "A tiny yet serious SQL database in Rust with ARIES-style WAL, 2PL, and B+Tree indexes." 
7 | license = "MIT" 8 | repository = "https://github.com/feichai0017/quillsql" 9 | readme = "README.md" 10 | 11 | [dependencies] 12 | log = "0.4.21" 13 | stack-map = "1.0.5" 14 | ebr = { version = "0.2.13" } 15 | bztree = "0.2.0" 16 | crossbeam-epoch = "0.9.18" 17 | parking_lot = { version = "0.12.1", features = ["deadlock_detection"] } 18 | parking_lot_core = { version = "0.9.0", features = ["deadlock_detection"] } 19 | serial_test = "3.2.0" 20 | dashmap = "6.1.0" 21 | derive-with = "0.6.0" 22 | derive-new = "0.7.0" 23 | comfy-table = "7.1.0" 24 | bincode = "1.3.3" 25 | rand = "0.9.1" 26 | fastrand = "2.0" 27 | thiserror = "2.0.12" 28 | nom = "7.0.0" 29 | logos = "0.15.0" 30 | strum = { version = "0.26", features = ["derive"] } 31 | sqlparser = "0.34.0" 32 | clap = { version = "4.3.19", features = ["derive", "cargo"] } 33 | sqllogictest = "0.13.0" 34 | regex = "1.9.1" 35 | glob = "0.3.1" 36 | rayon = "1.8.0" 37 | string-error = "0.1.0" 38 | memmap2 = "0.9.1" 39 | natord = "1.0.0" 40 | env_logger = "0.11.5" 41 | serde = { version = "1.0", features = ["derive"] } 42 | serde_bytes = "0.11.15" 43 | fs4 = "0.8.4" 44 | tempfile = "3.12.0" 45 | tokio = { version = "1.41.1", features = ["full"] } 46 | tokio-util = { version = "0.7.12", features = ["full"] } 47 | tokio-stream = "0.1.16" 48 | futures = "0.3.31" 49 | bytes = "1.0.0" 50 | rustyline = "15.0.0" 51 | axum = { version = "0.7", features = ["macros", "json"] } 52 | tower = "0.4" 53 | tower-http = { version = "0.5", features = ["fs", "trace", "cors"] } 54 | serde_json = "1.0" 55 | once_cell = "1.19" 56 | io-uring = { version = "0.6", optional = true } 57 | crc32fast = "1.4" 58 | libc = "0.2.176" 59 | crossbeam-channel = "0.5.15" 60 | 61 | [target.'cfg(target_os = "linux")'.dependencies] 62 | io-uring = { version = "0.6" } 63 | 64 | [dev-dependencies] 65 | criterion = { version = "0.5", features = ["html_reports"] } 66 | rand_chacha = "0.9" 67 | rusqlite = { version = "0.31", features = ["bundled"] } 68 | postgres = "0.19" 69 | pprof = { version = "0.15.0", features = ["criterion", "flamegraph"] } 70 | 71 | [[bench]] 72 | name = "storage_bench" 73 | harness = false 74 | -------------------------------------------------------------------------------- /docs/src/modules/expression.md: -------------------------------------------------------------------------------- 1 | # Expression & Scalar Evaluation 2 | 3 | The expression subsystem (`src/expression/`) powers column computations, predicates, and 4 | UPDATE assignments. It keeps expression trees approachable while demonstrating how they 5 | are evaluated during execution. 6 | 7 | --- 8 | 9 | ## Responsibilities 10 | 11 | - Store planner-produced expression trees (`Expr`) in a serializable, traversable enum. 12 | - Bind column references, constants, and built-in functions. 13 | - Evaluate expressions against `Tuple`s at runtime, yielding `ScalarValue`. 14 | - Provide type inference and casting so arithmetic/comparison operators remain well-typed. 15 | 16 | --- 17 | 18 | ## Directory Layout 19 | 20 | | Path | Description | Key Types | 21 | | ---- | ----------- | --------- | 22 | | `mod.rs` | Public API and core enum. | `Expr`, `ExprTrait` | 23 | | `scalar.rs` | Runtime scalar representation + conversions. | `ScalarValue`, `DataType` | 24 | | `binder.rs` | Helpers for the planner/SQL binder. | `BoundExpr` | 25 | 26 | --- 27 | 28 | ## Concepts 29 | 30 | ### Expr Enum 31 | Expresses column refs, literals, comparisons, logical ops, arithmetic, and function 32 | invocations. 
Each variant implements `ExprTrait::evaluate(&self, tuple)` and returns a 33 | `ScalarValue`. 34 | 35 | ### ScalarValue 36 | Unified runtime value across types (int, bigint, bool, decimal, varchar, …). Includes 37 | `cast_to(DataType)` so results can be coerced to the target column type before writes. 38 | 39 | ### Type Inference 40 | Planner code invokes `Expr::data_type(schema)` to predict result types. Execution then 41 | casts when needed—e.g., `UPDATE t SET a = b + 1` uses the column’s declared type for `a`. 42 | 43 | --- 44 | 45 | ## Interactions 46 | 47 | - **Planner** – builds `Expr` trees with bound columns; execution reuses them verbatim. 48 | - **ExecutionContext** – exposes `eval_expr` and `eval_predicate`, wrapping expression 49 | evaluation plus boolean coercion (`NULL` becomes false for predicates). 50 | - **Optimizer** – rules like constant folding traverse `Expr` trees and reuse 51 | `ScalarValue` arithmetic helpers. 52 | 53 | --- 54 | 55 | ## Teaching Ideas 56 | 57 | - Add a simple built-in function (`length(expr)`) to follow the pipeline from parsing to 58 | evaluation. 59 | - Implement short-circuiting or full three-valued boolean logic and validate with 60 | sqllogictest. 61 | - Instrument `Expr::evaluate` with tracing to visualise expression evaluation inside 62 | physical operators. 63 | -------------------------------------------------------------------------------- /src/optimizer/rule/push_down_filter.rs: -------------------------------------------------------------------------------- 1 | use crate::error::QuillSQLResult; 2 | use crate::optimizer::logical_optimizer::ApplyOrder; 3 | use crate::optimizer::LogicalOptimizerRule; 4 | use crate::plan::logical_plan::LogicalPlan; 5 | 6 | /// Attach `Filter` predicates directly to the underlying `TableScan`. 7 | /// This lets the physical planner decide whether the scan itself can honor 8 | /// the predicate (e.g. via index) while keeping the logical tree shallower. 
9 | pub struct PushDownFilterToScan; 10 | 11 | impl LogicalOptimizerRule for PushDownFilterToScan { 12 | fn try_optimize(&self, plan: &LogicalPlan) -> QuillSQLResult<Option<LogicalPlan>> { 13 | let LogicalPlan::Filter(filter) = plan else { 14 | return Ok(None); 15 | }; 16 | 17 | match filter.input.as_ref() { 18 | LogicalPlan::TableScan(scan) => { 19 | let mut new_scan = scan.clone(); 20 | new_scan.filters.push(filter.predicate.clone()); 21 | Ok(Some(LogicalPlan::TableScan(new_scan))) 22 | } 23 | _ => Ok(None), 24 | } 25 | } 26 | 27 | fn name(&self) -> &str { 28 | "PushDownFilterToScan" 29 | } 30 | 31 | fn apply_order(&self) -> Option<ApplyOrder> { 32 | Some(ApplyOrder::TopDown) 33 | } 34 | } 35 | 36 | #[cfg(test)] 37 | mod tests { 38 | use crate::database::Database; 39 | use crate::optimizer::rule::PushDownFilterToScan; 40 | use crate::optimizer::LogicalOptimizer; 41 | use crate::plan::logical_plan::LogicalPlan; 42 | use std::sync::Arc; 43 | 44 | fn build_optimizer() -> LogicalOptimizer { 45 | LogicalOptimizer::with_rules(vec![Arc::new(PushDownFilterToScan)]) 46 | } 47 | 48 | #[test] 49 | fn pushes_filter_into_scan() { 50 | let mut db = Database::new_temp().unwrap(); 51 | db.run("create table t1 (a int)").unwrap(); 52 | 53 | let plan = db 54 | .create_logical_plan("select * from t1 where a > 10") 55 | .unwrap(); 56 | let optimized_plan = build_optimizer().optimize(&plan).unwrap(); 57 | 58 | match optimized_plan { 59 | LogicalPlan::Project(project) => match project.input.as_ref() { 60 | LogicalPlan::TableScan(scan) => assert_eq!(scan.filters.len(), 1), 61 | other => panic!("expected TableScan under project, got {other:?}"), 62 | }, 63 | other => panic!("expected Project after pushdown, got {other:?}"), 64 | } 65 | } 66 | } 67 | -------------------------------------------------------------------------------- /docs/src/modules/execution.md: -------------------------------------------------------------------------------- 1 | # Execution Engine 2 | 3 | `src/execution/` drives `PhysicalPlan` trees using the Volcano (iterator) model. Every 4 | operator pulls tuples from its children, coordinating closely with transactions, 5 | storage, and expression evaluation. 6 | 7 | --- 8 | 9 | ## Core Components 10 | 11 | | Component | Role | 12 | | --------- | ---- | 13 | | `PhysicalPlan` | Enum covering all physical operators; each implements `VolcanoExecutor`. | 14 | | `ExecutionContext` | Shared context carrying the catalog, `TxnContext`, storage engine, and expression helpers. | 15 | | `TupleStream` | Unified scan interface returned by table/index handles. | 16 | 17 | --- 18 | 19 | ## Execution Flow 20 | 21 | 1. `ExecutionEngine::execute` calls `init` on the root plan (and recursively on children). 22 | 2. The engine loops calling `next`, with parents pulling tuples from children. 23 | 3. `ExecutionContext` supplies transaction snapshots, lock helpers, and expression 24 | evaluation per call. 25 | 4. Once `next` returns `None`, the accumulated results are returned to the caller (CLI, 26 | HTTP API, or tests). 27 | 
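To make the pull contract concrete, here is a minimal driver sketch. The `VolcanoExecutor` method signatures match the operators in `src/execution/physical_plan/`; the driver function itself is illustrative (the real `ExecutionEngine` wraps the same loop with result collection and error handling):

```rust
// Illustrative Volcano driver loop; not the real ExecutionEngine.
fn pull_all(
    root: &dyn VolcanoExecutor,
    ctx: &mut ExecutionContext,
) -> QuillSQLResult<Vec<Tuple>> {
    root.init(ctx)?; // operators recursively init their children
    let mut rows = Vec::new();
    while let Some(tuple) = root.next(ctx)? {
        rows.push(tuple); // each next() pulls one tuple up the tree
    }
    Ok(rows) // None from the root signals end of stream
}
```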
28 | --- 29 | 30 | ## Operator Examples 31 | 32 | - **PhysicalSeqScan** – acquires a `table_stream` from the storage engine, uses 33 | `ScanPrefetch` for batching, and relies on `TxnContext::read_visible_tuple` for MVCC. 34 | - **PhysicalIndexScan** – uses `index_stream`, tracks `invisible_hits`, and notifies the 35 | catalog when garbage accumulates. 36 | - **PhysicalUpdate/PhysicalDelete** – call `prepare_row_for_write` to re-validate locks 37 | and the latest tuple before invoking `apply_update/delete`. 38 | - **PhysicalNestedLoopJoin** – showcases the parent/child pull loop and acts as a baseline 39 | for more advanced joins. 40 | 41 | --- 42 | 43 | ## Interactions 44 | 45 | - **StorageEngine** – all data access goes through handles/streams, keeping execution 46 | storage-agnostic. 47 | - **Transaction** – `TxnContext` enforces locking, snapshots, and undo logging; operators 48 | never talk to `LockManager` directly. 49 | - **Expression** – `ExecutionContext::eval_expr` / `eval_predicate` evaluate expressions 50 | built by the planner. 51 | - **Optimizer/Planner** – execution honours the plan as-is; all structural choices happen 52 | upstream. 53 | 54 | --- 55 | 56 | ## Teaching Ideas 57 | 58 | - Implement a new operator (e.g., `PhysicalMergeJoin`) to see how `ExecutionContext` 59 | support generalises. 60 | - Add adaptive prefetching inside `PhysicalSeqScan` to explore iterator hints. 61 | - Enable `RUST_LOG=execution=trace` to watch the `init`/`next` call sequence during a 62 | query. 63 | 64 | --- 65 | 66 | Further reading: [The Volcano Execution Model](../execution/volcano.md) 67 | -------------------------------------------------------------------------------- /src/optimizer/rule/eliminate_limit.rs: -------------------------------------------------------------------------------- 1 | use crate::error::QuillSQLResult; 2 | use crate::optimizer::logical_optimizer::ApplyOrder; 3 | use crate::optimizer::LogicalOptimizerRule; 4 | use crate::plan::logical_plan::{EmptyRelation, LogicalPlan}; 5 | 6 | pub struct EliminateLimit; 7 | 8 | impl LogicalOptimizerRule for EliminateLimit { 9 | fn try_optimize(&self, plan: &LogicalPlan) -> QuillSQLResult<Option<LogicalPlan>> { 10 | if let LogicalPlan::Limit(limit) = plan { 11 | match limit.limit { 12 | Some(fetch) => { 13 | if fetch == 0 { 14 | return Ok(Some(LogicalPlan::EmptyRelation(EmptyRelation { 15 | produce_one_row: false, 16 | schema: limit.input.schema().clone(), 17 | }))); 18 | } 19 | } 20 | None => { 21 | if limit.offset == 0 { 22 | let input = limit.input.as_ref(); 23 | // the input may itself be another Limit, so apply the rule again. 
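// e.g. a redundant `OFFSET 0` sitting above another no-op limit, or above
// `LIMIT 0` (which folds to an EmptyRelation): recursing strips the whole
// chain in a single application of the rule.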
24 | return Ok(Some( 25 | self.try_optimize(input)?.unwrap_or_else(|| input.clone()), 26 | )); 27 | } 28 | } 29 | } 30 | } 31 | Ok(None) 32 | } 33 | 34 | fn name(&self) -> &str { 35 | "EliminateLimit" 36 | } 37 | 38 | fn apply_order(&self) -> Option<ApplyOrder> { 39 | Some(ApplyOrder::BottomUp) 40 | } 41 | } 42 | 43 | #[cfg(test)] 44 | mod tests { 45 | use crate::database::Database; 46 | use crate::optimizer::rule::EliminateLimit; 47 | use crate::optimizer::LogicalOptimizer; 48 | use crate::plan::logical_plan::LogicalPlan; 49 | use std::sync::Arc; 50 | 51 | fn build_optimizer() -> LogicalOptimizer { 52 | LogicalOptimizer::with_rules(vec![Arc::new(EliminateLimit)]) 53 | } 54 | 55 | #[test] 56 | fn eliminate_limit() { 57 | let mut db = Database::new_temp().unwrap(); 58 | db.run("create table t1 (a int)").unwrap(); 59 | 60 | let plan = db.create_logical_plan("select a from t1 limit 0").unwrap(); 61 | let optimized_plan = build_optimizer().optimize(&plan).unwrap(); 62 | assert!(matches!(optimized_plan, LogicalPlan::EmptyRelation(_))); 63 | 64 | let plan = db.create_logical_plan("select a from t1 offset 0").unwrap(); 65 | let optimized_plan = build_optimizer().optimize(&plan).unwrap(); 66 | if let LogicalPlan::Project(p) = optimized_plan { 67 | assert!(matches!(p.input.as_ref(), LogicalPlan::TableScan(_))); 68 | } else { 69 | panic!("the first node should be project"); 70 | } 71 | } 72 | } 73 | -------------------------------------------------------------------------------- /src/execution/physical_plan/drop_index.rs: -------------------------------------------------------------------------------- 1 | use crate::catalog::{SchemaRef, EMPTY_SCHEMA_REF}; 2 | use crate::error::{QuillSQLError, QuillSQLResult}; 3 | use crate::execution::{ExecutionContext, VolcanoExecutor}; 4 | use crate::storage::tuple::Tuple; 5 | use crate::transaction::LockMode; 6 | 7 | #[derive(Debug)] 8 | pub struct PhysicalDropIndex { 9 | pub name: String, 10 | pub schema: Option<String>, 11 | pub catalog: Option<String>, 12 | pub if_exists: bool, 13 | } 14 | 15 | impl PhysicalDropIndex { 16 | pub fn new( 17 | name: String, 18 | schema: Option<String>, 19 | catalog: Option<String>, 20 | if_exists: bool, 21 | ) -> Self { 22 | Self { 23 | name, 24 | schema, 25 | catalog, 26 | if_exists, 27 | } 28 | } 29 | 30 | fn qualified_name(&self) -> String { 31 | match (&self.catalog, &self.schema) { 32 | (Some(catalog), Some(schema)) => format!("{catalog}.{schema}.{}", self.name), 33 | (None, Some(schema)) => format!("{schema}.{}", self.name), 34 | _ => self.name.clone(), 35 | } 36 | } 37 | } 38 | 39 | impl VolcanoExecutor for PhysicalDropIndex { 40 | fn init(&self, _context: &mut ExecutionContext) -> QuillSQLResult<()> { 41 | Ok(()) 42 | } 43 | 44 | fn next(&self, context: &mut ExecutionContext) -> QuillSQLResult<Option<Tuple>> { 45 | let owner = context.catalog.find_index_owner( 46 | self.catalog.as_deref(), 47 | self.schema.as_deref(), 48 | &self.name, 49 | ); 50 | 51 | let Some(table_ref) = owner else { 52 | if self.if_exists { 53 | return Ok(None); 54 | } 55 | return Err(QuillSQLError::Execution(format!( 56 | "index {} does not exist", 57 | self.qualified_name() 58 | ))); 59 | }; 60 | 61 | context 62 | .txn_ctx() 63 | .ensure_writable(&table_ref, "DROP INDEX")?; 64 | context 65 | .txn_ctx_mut() 66 | .lock_table(table_ref.clone(), LockMode::Exclusive)?; 67 | 68 | let dropped = context.catalog.drop_index(&table_ref, &self.name)?; 69 | if !dropped && !self.if_exists { 70 | return Err(QuillSQLError::Execution(format!( 71 | "index {} does not exist", 72 | self.qualified_name() 73 | ))); 74 | } 75 | 
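// DDL work happens entirely in this single next() call; yielding no tuple
// (None) tells the Volcano driver the statement has completed.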
76 | Ok(None) 77 | } 78 | 79 | fn output_schema(&self) -> SchemaRef { 80 | EMPTY_SCHEMA_REF.clone() 81 | } 82 | } 83 | 84 | impl std::fmt::Display for PhysicalDropIndex { 85 | fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { 86 | write!(f, "DropIndex: {}", self.qualified_name()) 87 | } 88 | } 89 | -------------------------------------------------------------------------------- /src/optimizer/rule/push_down_limit.rs: -------------------------------------------------------------------------------- 1 | use crate::error::QuillSQLResult; 2 | use crate::optimizer::logical_optimizer::ApplyOrder; 3 | use crate::optimizer::LogicalOptimizerRule; 4 | use crate::plan::logical_plan::{LogicalPlan, Sort}; 5 | 6 | pub struct PushDownLimit; 7 | 8 | impl LogicalOptimizerRule for PushDownLimit { 9 | fn try_optimize(&self, plan: &LogicalPlan) -> QuillSQLResult<Option<LogicalPlan>> { 10 | let LogicalPlan::Limit(limit) = plan else { 11 | return Ok(None); 12 | }; 13 | 14 | let Some(limit_value) = limit.limit else { 15 | return Ok(None); 16 | }; 17 | 18 | match limit.input.as_ref() { 19 | LogicalPlan::Sort(sort) => { 20 | let new_limit = { 21 | let sort_limit = limit.offset + limit_value; 22 | Some(sort.limit.map(|f| f.min(sort_limit)).unwrap_or(sort_limit)) 23 | }; 24 | if new_limit == sort.limit { 25 | Ok(None) 26 | } else { 27 | let new_sort = LogicalPlan::Sort(Sort { 28 | order_by: sort.order_by.clone(), 29 | input: sort.input.clone(), 30 | limit: new_limit, 31 | }); 32 | plan.with_new_inputs(&[new_sort]).map(Some) 33 | } 34 | } 35 | _ => Ok(None), 36 | } 37 | } 38 | 39 | fn name(&self) -> &str { 40 | "PushDownLimit" 41 | } 42 | 43 | fn apply_order(&self) -> Option<ApplyOrder> { 44 | Some(ApplyOrder::TopDown) 45 | } 46 | } 47 | 48 | #[cfg(test)] 49 | mod tests { 50 | use crate::database::Database; 51 | use crate::optimizer::rule::PushDownLimit; 52 | use crate::optimizer::LogicalOptimizer; 53 | use crate::plan::logical_plan::{LogicalPlan, Sort}; 54 | use std::sync::Arc; 55 | 56 | fn build_optimizer() -> LogicalOptimizer { 57 | LogicalOptimizer::with_rules(vec![Arc::new(PushDownLimit)]) 58 | } 59 | 60 | #[test] 61 | fn push_down_limit() { 62 | let mut db = Database::new_temp().unwrap(); 63 | db.run("create table t1 (a int)").unwrap(); 64 | 65 | let plan = db 66 | .create_logical_plan("select a from t1 order by a limit 10") 67 | .unwrap(); 68 | let optimized_plan = build_optimizer().optimize(&plan).unwrap(); 69 | 70 | if let LogicalPlan::Limit(limit) = optimized_plan { 71 | if let LogicalPlan::Sort(Sort { limit, .. }) = limit.input.as_ref() { 72 | assert_eq!(limit, &Some(10)); 73 | } else { 74 | panic!("the second node should be limit"); 75 | } 76 | } else { 77 | panic!("the first node should be limit"); 78 | } 79 | } 80 | } 81 | -------------------------------------------------------------------------------- /docs/src/modules/sql.md: -------------------------------------------------------------------------------- 1 | # SQL Front-End 2 | 3 | The SQL front-end lives in `src/sql/`. It turns raw UTF-8 query text into the abstract 4 | syntax trees (ASTs) consumed by planning, while layering Quill-specific name handling 5 | and diagnostics on top of [`sqlparser`](https://docs.rs/sqlparser). 6 | 7 | --- 8 | 9 | ## Responsibilities 10 | 11 | - Parse SQL text into `sqlparser::ast::Statement` values. 12 | - Record precise spans so error messages can highlight the exact byte range. 13 | - Normalise identifiers (case folding, quoted names, multi-part paths). 14 | - Provide helper traits so the logical planner can lower AST nodes without duplicating 15 | syntax checks. 16 | 17 | --- 18 | 19 | ## Directory Layout 20 | 21 | | Path | Purpose | Key Types | 22 | | ---- | ------- | --------- | 23 | | `lexer.rs` | Token helpers that preserve offsets. | `Token`, `TokenExt` | 24 | | `parser.rs` | Single entry point used across the codebase. | `parse_sql`, `SqlInput` | 25 | | `ast/mod.rs` | Planner-facing helpers. | `NormalizedIdent`, `ObjectNameExt` | 26 | | `error.rs` | Span-aware parser errors. | `SqlError`, `SqlSpan` | 27 | 28 | --- 29 | 30 | ## Parsing Pipeline 31 | 32 | 1. **Lexing** – wrap sqlparser’s lexer so every token keeps start/end offsets. 33 | 2. **AST generation** – invoke sqlparser to produce standard `Statement` structs. 34 | 3. **Normalisation** – convert identifiers into `NormalizedIdent`, deal with schema 35 | qualifiers, and build pieces of `TableReference`. 36 | 4. **Planner bridge** – traits like `ColumnRefExt` expose methods such as `relation()` or 37 | `column()` so `LogicalPlanner` can treat different SQL syntaxes uniformly. 38 | 
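Byte-offset spans are what make caret diagnostics cheap to render. A minimal sketch of the idea (the helper below is illustrative; only the notion of slicing the original SQL by `SqlSpan` byte offsets comes from this module):

```rust
// Illustrative caret rendering from byte offsets (assumes single-line SQL).
fn render_error(sql: &str, start: usize, end: usize, msg: &str) -> String {
    let width = end.saturating_sub(start).max(1);
    let caret = " ".repeat(start) + &"^".repeat(width);
    format!("{msg}\n{sql}\n{caret}")
}

// render_error("SELECT * FORM t", 9, 13, "expected FROM") prints:
//   expected FROM
//   SELECT * FORM t
//            ^^^^
```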
39 | --- 40 | 41 | ## Interactions 42 | 43 | - **Logical planner** consumes the AST directly and relies on helper traits from this 44 | module to convert identifiers into catalog references. 45 | - **Database / Session** catch `SqlError` values, so both CLI and HTTP front-ends show 46 | consistent caret diagnostics. 47 | - **Tests** (`tests/sql_example/*.slt`, `tests/sql_parser.rs`) assert on parser output and 48 | error strings to keep teaching feedback stable. 49 | 50 | --- 51 | 52 | ## Implementation Notes 53 | 54 | - `SqlSpan` stores byte offsets, which makes it trivial to slice the original SQL and 55 | render highlighted errors. 56 | - Extended statements (e.g., `EXPLAIN`, `BEGIN TRANSACTION`) show how to layer 57 | Quill-specific syntax without forking sqlparser entirely. 58 | - We avoid desugaring at this stage so students can trace SQL → AST → logical plan step 59 | by step. 60 | 61 | --- 62 | 63 | ## Teaching Ideas 64 | 65 | - Add a new statement (`CREATE VIEW`, `ALTER TABLE ...`) and follow the AST through the 66 | pipeline. 67 | - Improve error hints (“Did you forget FROM?”) to see how better diagnostics aid users. 68 | - Write fuzz tests that round-trip SQL → AST → SQL to discuss parser determinism. 69 | -------------------------------------------------------------------------------- /docs/src/modules/buffer.md: -------------------------------------------------------------------------------- 1 | # Buffer Manager 2 | 3 | The buffer manager (`src/buffer/`) implements QuillSQL’s shared buffer pool, bridging the 4 | speed gap between RAM and disk. It lets storage/execution read and write pages safely 5 | while coordinating with WAL and asynchronous I/O. 6 | 7 | --- 8 | 9 | ## Responsibilities 10 | 11 | - Maintain a fixed-size set of page frames caching `TableHeap` and B+Tree pages. 12 | - Expose RAII-style guards (pin/unpin) that enforce safe concurrent access. 13 | - Keep the page table, replacement policy, dirty-page tracking, and WAL coordination in 14 | sync. 15 | - Submit async I/O through `DiskScheduler`. 16 | 17 | --- 18 | 19 | ## Directory Layout 20 | 21 | | Path | Description | Key Types | 22 | | ---- | ----------- | --------- | 23 | | `buffer_manager.rs` | Core buffer pool. | `BufferManager`, `BufferFrame` | 24 | | `page.rs` | Guard types and pin/unpin logic. | `ReadPageGuard`, `WritePageGuard` | 25 | | `replacer.rs` | LRU-K + TinyLFU replacement. 
| `Replacer` | 26 | | `metrics.rs` | Optional instrumentation hooks. | `BufferMetrics` | 27 | 28 | --- 29 | 30 | ## Key Mechanisms 31 | 32 | ### Guard Model 33 | - `ReadPageGuard`, `WritePageGuard`, and `UpgradeableGuard` ensure only compatible access 34 | modes coexist on a page. 35 | - Guards drop automatically to release pins; paired with Rust’s borrow checker, they make 36 | latch semantics tangible. 37 | 38 | ### Replacement Policy 39 | - **LRU-K** tracks the last K touches to protect hot pages from scan pollution. 40 | - **TinyLFU** decides whether a new page should enter the cache, offering probabilistic 41 | admission against noisy workloads. 42 | 43 | ### WAL Coordination 44 | - Before flushing a dirty page, the buffer checks `page_lsn` and asks `WalManager` to 45 | flush up to that LSN (write-ahead rule). 46 | - `set_wal_manager` wires the buffer to WAL so checkpoints can inspect the oldest dirty 47 | LSN. 48 | 49 | ### Disk Scheduler 50 | - All physical reads/writes go through `DiskScheduler::submit_*`, sharing worker threads 51 | with WAL and demonstrating the benefits of a unified I/O layer. 52 | 53 | --- 54 | 55 | ## Interactions 56 | 57 | - **Storage engine** – `TableHeap` and `BPlusTreeIndex` access pages exclusively through 58 | the buffer manager. 59 | - **Recovery** – checkpoints consult the buffer’s dirty page table to build the ARIES DPT. 60 | - **Background writer** – periodically walks `dirty_frames` to flush pages in the 61 | background. 62 | 63 | --- 64 | 65 | ## Teaching Ideas 66 | 67 | - Disable TinyLFU via feature flag, rerun sqllogictest, and compare hit rates. 68 | - Swap the replacement policy with CLOCK to experiment with cache algorithms. 69 | - Enable `RUST_LOG=buffer=debug` and trace the pin/unpin lifecycle of hot pages. 70 | 71 | --- 72 | 73 | Further reading: [Page & Page Guards](../buffer/page.md), 74 | [The Buffer Pool](../buffer/buffer_pool.md) 75 | -------------------------------------------------------------------------------- /docs/src/modules/plan.md: -------------------------------------------------------------------------------- 1 | # Query Planner Module 2 | 3 | `src/plan/` bridges parsed SQL and executable operators. It converts the AST into a 4 | logical plan, applies rewrites (via the optimizer), and finally emits a physical plan 5 | (`PhysicalPlan`) that the Volcano engine can run. 6 | 7 | --- 8 | 9 | ## Responsibilities 10 | 11 | 1. **LogicalPlanner** – walks the AST, binds table/column names using `PlannerContext`, 12 | performs type checking, and builds a `LogicalPlan` tree. 13 | 2. **PlannerContext** – exposes catalog lookups plus scope information for CTEs, subqueries, 14 | and aliases. 15 | 3. **PhysicalPlanner** – lowers an optimized `LogicalPlan` into a tree of Volcano operators. 16 | 17 | --- 18 | 19 | ## Directory Layout 20 | 21 | | Path | Description | Key Types | 22 | | ---- | ----------- | --------- | 23 | | `logical_plan.rs` | Logical algebra nodes. | `LogicalPlan`, `LogicalExpr`, `JoinType` | 24 | | `logical_planner.rs` | AST → logical transformation. | `LogicalPlanner` | 25 | | `physical_plan.rs` | `PhysicalPlan` enum definition. | `PhysicalPlan`, `Physical*` structs | 26 | | `physical_planner.rs` | Logical → physical lowering. | `PhysicalPlanner` | 27 | | `planner_context.rs` | Catalog/scope abstraction. | `PlannerContext` | 28 | 29 | --- 30 | 31 | ## Workflow 32 | 33 | 1. **Name binding** – `LogicalPlanner` resolves table + column references, creates 34 | `TableReference`s, and validates schemas via the catalog. 35 | 2. 
**Logical tree** – each SQL clause becomes a logical node (FROM → `SeqScan`, WHERE → 36 | `Filter`, GROUP BY → `Aggregate`, etc.). 37 | 3. **Physical selection** – `PhysicalPlanner` picks concrete algorithms (sequential scan, 38 | index scan, nested-loop join, sort, limit …). Because every physical node implements 39 | `VolcanoExecutor`, the execution engine can pull tuples immediately. 40 | 41 | --- 42 | 43 | ## Interactions 44 | 45 | - **SQL front-end** – provides the AST; helper traits (`NormalizedIdent`, etc.) keep name 46 | resolution consistent. 47 | - **Catalog** – `PlannerContext` relies on it to confirm table/index existence and fetch 48 | schemas. 49 | - **Optimizer** – operates purely on `LogicalPlan`; the planner must emit clean, 50 | traversable trees so rules can fire. 51 | - **Execution** – physical nodes carry `TableReference`, `SchemaRef`, and hints that the 52 | execution engine passes to the storage layer. 53 | 54 | --- 55 | 56 | ## Teaching Ideas 57 | 58 | - Implement a new logical operator (e.g., `LogicalDistinct`) and add the corresponding 59 | physical operator to trace the full lifecycle. 60 | - Experiment with early projection inside the logical plan and observe its impact on 61 | downstream operators. 62 | - Use `pretty_format_logical_plan`/`physical_plan` dumps to visualise rewrites before and 63 | after optimizer passes. 64 | 65 | --- 66 | 67 | Further reading: [The Lifecycle of a Query](../plan/lifecycle.md) 68 | -------------------------------------------------------------------------------- /docs/src/modules/optimizer.md: -------------------------------------------------------------------------------- 1 | # Optimizer Module 2 | 3 | `src/optimizer/` contains a lightweight, teaching-friendly rule engine. It rewrites 4 | `LogicalPlan` trees into cheaper equivalents without requiring a full cost-based 5 | framework. 6 | 7 | --- 8 | 9 | ## Responsibilities 10 | 11 | - Define the `OptimizerRule` trait (“match → rewrite”). 12 | - Ship built-in rules such as predicate pushdown, projection pruning, and limit pushdown. 13 | - Provide a pipeline (`LogicalOptimizer`) that repeatedly applies rules until reaching a 14 | fixpoint, while remaining extensible for future cost models. 15 | 16 | --- 17 | 18 | ## Directory Layout 19 | 20 | | Path | Description | Key Types | 21 | | ---- | ----------- | --------- | 22 | | `mod.rs` | Optimizer entry point. | `LogicalOptimizer` | 23 | | `rule.rs` | Trait + shared helpers. | `OptimizerRule` | 24 | | `rules/*` | Concrete rewrites. | `PushDownFilter`, `PushDownLimit`, … | 25 | 26 | --- 27 | 28 | ## How It Works 29 | 30 | 1. `LogicalOptimizer::optimize(plan)` iterates through the registered rule list. 31 | 2. Each rule implements `fn apply(&LogicalPlan) -> Option<LogicalPlan>`. Returning `Some` 32 | means the rule fired; the pipeline restarts to reach a fixpoint. 33 | 3. Rules are pure functions, which keeps them easy to unit test and reason about. 34 | 35 | Examples: 36 | - **PushDownFilter** moves filters below scans/joins to reduce input size sooner. 37 | - **PushDownLimit** applies LIMIT before expensive joins/sorts when safe. 38 | - **PruneProjection** removes unused columns so execution/storage decode less data. 39 | 
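In the tree itself the trait is spelled `LogicalOptimizerRule` with a fallible `try_optimize` (see `src/optimizer/rule/`); a do-nothing rule shows the shape every rule follows. `MyRule` here is purely illustrative:

```rust
use crate::error::QuillSQLResult;
use crate::optimizer::logical_optimizer::ApplyOrder;
use crate::optimizer::LogicalOptimizerRule;
use crate::plan::logical_plan::LogicalPlan;

pub struct MyRule;

impl LogicalOptimizerRule for MyRule {
    fn try_optimize(&self, plan: &LogicalPlan) -> QuillSQLResult<Option<LogicalPlan>> {
        // Inspect `plan` and return Ok(Some(rewritten)) when the rule fires;
        // Ok(None) tells the pipeline to keep the node unchanged.
        let _ = plan;
        Ok(None)
    }

    fn name(&self) -> &str {
        "MyRule"
    }

    fn apply_order(&self) -> Option<ApplyOrder> {
        Some(ApplyOrder::TopDown)
    }
}
```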
40 | ### Extending With Statistics 41 | 42 | The optimizer intentionally remains heuristics-only, and the physical planner sticks to 43 | simple sequential scans. For coursework, students can still read `TableStatistics` from 44 | the catalog to prototype their own cardinality estimates or cost heuristics (e.g., to 45 | experiment with when to prefer an index scan), but no estimator ships in-tree. 46 | 47 | --- 48 | 49 | ## Interactions 50 | 51 | - **LogicalPlan** – the optimizer only sees logical nodes; physical/storage layers remain 52 | untouched. 53 | - **Catalog / Statistics** – current rules are heuristic, but `TableStatistics` remains 54 | available for students who want to prototype their own cost-based decisions. 55 | - **Execution** – leaner logical plans translate into simpler physical plans (e.g., 56 | predicate pushdown allows `PhysicalSeqScan` to discard rows earlier). 57 | 58 | --- 59 | 60 | ## Teaching Ideas 61 | 62 | - Implement a new rule (join reordering, constant folding) and use `RUST_LOG=trace` to 63 | compare plan dumps before/after. 64 | - Discuss pipeline ordering—swap rule order and observe different outcomes. 65 | - Prototype a tiny cost estimator using row counts from `TableStatistics` to decide on 66 | index scans vs sequential scans. 67 | 68 | --- 69 | 70 | Further reading: [Rule-Based Optimization](../optimizer/rules.md) 71 | -------------------------------------------------------------------------------- /src/plan/logical_planner/plan_drop.rs: -------------------------------------------------------------------------------- 1 | use crate::error::{QuillSQLError, QuillSQLResult}; 2 | use crate::plan::logical_plan::{DropIndex, DropTable, LogicalPlan}; 3 | 4 | use super::LogicalPlanner; 5 | 6 | impl<'a> LogicalPlanner<'a> { 7 | pub fn plan_drop_table( 8 | &self, 9 | names: &[sqlparser::ast::ObjectName], 10 | if_exists: bool, 11 | cascade: bool, 12 | purge: bool, 13 | ) -> QuillSQLResult<LogicalPlan> { 14 | if purge { 15 | return Err(QuillSQLError::NotSupport( 16 | "DROP TABLE ... PURGE is not supported".to_string(), 17 | )); 18 | } 19 | if names.len() != 1 { 20 | return Err(QuillSQLError::NotSupport( 21 | "DROP TABLE only supports a single target".to_string(), 22 | )); 23 | } 24 | 25 | let table_ref = self.bind_table_name(&names[0])?; 26 | if cascade { 27 | // Implicitly drop dependent indexes, so CASCADE behaves the same as default. 28 | // No-op, but accepted for compatibility. 29 | } 30 | 31 | Ok(LogicalPlan::DropTable(DropTable { 32 | name: table_ref, 33 | if_exists, 34 | })) 35 | } 36 | 37 | pub fn plan_drop_index( 38 | &self, 39 | names: &[sqlparser::ast::ObjectName], 40 | if_exists: bool, 41 | cascade: bool, 42 | purge: bool, 43 | ) -> QuillSQLResult<LogicalPlan> { 44 | if cascade { 45 | return Err(QuillSQLError::NotSupport( 46 | "DROP INDEX ... CASCADE is not supported".to_string(), 47 | )); 48 | } 49 | if purge { 50 | return Err(QuillSQLError::NotSupport( 51 | "DROP INDEX ... 
PURGE is not supported".to_string(), 52 | )); 53 | } 54 | if names.len() != 1 { 55 | return Err(QuillSQLError::NotSupport( 56 | "DROP INDEX only supports a single target".to_string(), 57 | )); 58 | } 59 | 60 | let parts = &names[0].0; 61 | let (catalog, schema, name) = match parts.as_slice() { 62 | [ident] => (None, None, ident.value.clone()), 63 | [schema, ident] => (None, Some(schema.value.clone()), ident.value.clone()), 64 | [catalog, schema, ident] => ( 65 | Some(catalog.value.clone()), 66 | Some(schema.value.clone()), 67 | ident.value.clone(), 68 | ), 69 | _ => { 70 | return Err(QuillSQLError::Plan(format!( 71 | "DROP INDEX name '{}' has too many qualifiers", 72 | names[0] 73 | ))) 74 | } 75 | }; 76 | 77 | Ok(LogicalPlan::DropIndex(DropIndex { 78 | name, 79 | schema, 80 | catalog, 81 | if_exists, 82 | })) 83 | } 84 | } 85 | -------------------------------------------------------------------------------- /src/plan/logical_planner/plan_create_table.rs: -------------------------------------------------------------------------------- 1 | use crate::error::{QuillSQLError, QuillSQLResult}; 2 | use std::collections::HashSet; 3 | 4 | use crate::catalog::{Column, DataType}; 5 | use crate::expression::Expr; 6 | use crate::plan::logical_plan::{CreateTable, LogicalPlan}; 7 | use crate::utils::scalar::ScalarValue; 8 | 9 | use super::LogicalPlanner; 10 | 11 | impl<'a> LogicalPlanner<'a> { 12 | pub fn plan_create_table( 13 | &self, 14 | name: &sqlparser::ast::ObjectName, 15 | column_defs: &Vec<sqlparser::ast::ColumnDef>, 16 | if_not_exists: bool, 17 | ) -> QuillSQLResult<LogicalPlan> { 18 | let name = self.bind_table_name(name)?; 19 | let mut columns = vec![]; 20 | for col_def in column_defs { 21 | let data_type: DataType = (&col_def.data_type).try_into()?; 22 | let not_null: bool = col_def 23 | .options 24 | .iter() 25 | .any(|opt| matches!(opt.option, sqlparser::ast::ColumnOption::NotNull)); 26 | let default_expr: Option<&sqlparser::ast::Expr> = col_def 27 | .options 28 | .iter() 29 | .find(|opt| matches!(opt.option, sqlparser::ast::ColumnOption::Default(_))) 30 | .map(|opt| { 31 | if let sqlparser::ast::ColumnOption::Default(expr) = &opt.option { 32 | expr 33 | } else { 34 | unreachable!() 35 | } 36 | }); 37 | let default = if let Some(expr) = default_expr { 38 | let expr = self.bind_expr(expr)?; 39 | match expr { 40 | Expr::Literal(lit) => lit.value.cast_to(&data_type)?, 41 | _ => { 42 | return Err(QuillSQLError::Internal( 43 | "The expr is not literal".to_string(), 44 | )) 45 | } 46 | } 47 | } else { 48 | ScalarValue::new_empty(data_type) 49 | }; 50 | 51 | columns.push( 52 | Column::new(col_def.name.value.clone(), data_type, !not_null) 53 | .with_relation(Some(name.clone())) 54 | .with_default(default), 55 | ) 56 | } 57 | 58 | check_column_name_conflict(&columns)?; 59 | Ok(LogicalPlan::CreateTable(CreateTable { 60 | name, 61 | columns, 62 | if_not_exists, 63 | })) 64 | } 65 | } 66 | 67 | fn check_column_name_conflict(columns: &[Column]) -> QuillSQLResult<()> { 68 | let mut names = HashSet::new(); 69 | for col in columns { 70 | if names.contains(col.name.as_str()) { 71 | return Err(QuillSQLError::Plan(format!( 72 | "Column names have conflict on '{}'", 73 | col.name 74 | ))); 75 | } else { 76 | names.insert(col.name.as_str()); 77 | } 78 | } 79 | Ok(()) 80 | } 81 | -------------------------------------------------------------------------------- /docs/src/modules/index.md: -------------------------------------------------------------------------------- 1 | # Index Module 2 | 3 | Indexes live in `src/storage/index/`. 
QuillSQL currently ships a B+Tree (B-link variant) 4 | that is exposed to execution via `IndexHandle`. Indexes allow point lookups and range 5 | scans in O(log n), dramatically reducing the need for full table scans. 6 | 7 | --- 8 | 9 | ## Responsibilities 10 | 11 | - Maintain an ordered key → `RecordId` mapping per indexed table. 12 | - Support point probes, range scans, insert/update/delete maintenance. 13 | - Cooperate with MVCC: entries reference heap tuples while visibility checks remain in 14 | execution/transaction code. 15 | - Provide `IndexHandle::range_scan`, returning a `TupleStream` so physical operators don’t 16 | need to know tree internals. 17 | 18 | --- 19 | 20 | ## Directory Layout 21 | 22 | | Path | Purpose | Key Types | 23 | | ---- | ------- | --------- | 24 | | `btree_index.rs` | Core B+Tree, page formats, insert/delete logic. | `BPlusTreeIndex` | 25 | | `btree_iterator.rs` | Range-scan iterator with sibling traversal. | `TreeIndexIterator` | 26 | | `btree_codec.rs` | Page encode/decode utilities. | `BPlusTreeLeafPageCodec` | 27 | 28 | --- 29 | 30 | ## Key Concepts 31 | 32 | ### B-link Structure 33 | Each leaf stores a pointer to its right sibling. Iterators use this to keep scanning even 34 | if a concurrent split occurs, avoiding restarts from the root and enabling latch-free 35 | range scans. 36 | 37 | ### Latch Crabbing 38 | Insert/delete operations climb the tree with shared latches and upgrade only when 39 | necessary (e.g., right before splitting), reducing contention. 40 | 41 | ### Range Scan → TupleStream 42 | `IndexHandle::range_scan` wraps `TreeIndexIterator` and automatically fetches heap tuples, 43 | returning `(rid, meta, tuple)` triples. Execution remains storage-agnostic. 44 | 
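A hedged sketch of a consumer: the `(rid, meta, tuple)` item shape follows the description above, but the exact signatures of `range_scan` and the stream's `next` are assumptions for illustration.

```rust
// Illustrative: counting rows returned by an index range scan.
fn count_in_range(
    handle: &IndexHandle,
    low: &Tuple,
    high: &Tuple,
) -> QuillSQLResult<usize> {
    // bounds API assumed; yields (rid, meta, tuple) triples
    let mut stream = handle.range_scan(Some(low), Some(high))?;
    let mut count = 0;
    while let Some((_rid, _meta, _tuple)) = stream.next()? {
        count += 1; // tuples arrive already fetched from the heap
    }
    Ok(count)
}
```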
45 | ### Inline Maintenance 46 | Index inserts/updates/deletes modify the tree immediately and emit logical WAL for redo. 47 | There is no deferred “index vacuum”; once a heap tuple is deleted its index entry is 48 | removed in the same transaction. 49 | 50 | --- 51 | 52 | ## Interactions 53 | 54 | - **Catalog** – stores `Arc<BPlusTreeIndex>` instances alongside table metadata so 55 | execution can fetch handles directly. 56 | - **Execution** – `PhysicalIndexScan` uses `ExecutionContext::index_stream`; DML operators 57 | call `insert_tuple_with_indexes` so heap writes and index maintenance stay in sync. 58 | - **Transaction/MVCC** – heaps store transaction metadata; indexes just reference RIDs, so 59 | MVCC visibility is enforced when tuples are materialised. 60 | - **Recovery** – WAL contains `IndexInsert/IndexDelete` records to replay structural 61 | changes after crashes. 62 | 63 | --- 64 | 65 | ## Teaching Ideas 66 | 67 | - Build a covering index example to show how avoiding heap lookups improves latency. 68 | - Instrument `TreeIndexIterator` to visualise sibling traversal during range scans. 69 | - Compare SeqScan vs IndexScan on selective predicates to highlight indexing benefits. 70 | 71 | --- 72 | 73 | Further reading: [B+Tree internals](../index/btree_index.md) 74 | -------------------------------------------------------------------------------- /src/storage/codec/tuple.rs: -------------------------------------------------------------------------------- 1 | use crate::catalog::SchemaRef; 2 | use crate::error::{QuillSQLError, QuillSQLResult}; 3 | use crate::storage::codec::{DecodedData, ScalarValueCodec}; 4 | use crate::storage::tuple::Tuple; 5 | use crate::utils::bitmap::DynamicBitmap; 6 | use crate::utils::scalar::ScalarValue; 7 | 8 | pub struct TupleCodec; 9 | 10 | impl TupleCodec { 11 | pub fn encode(tuple: &Tuple) -> Vec<u8> { 12 | // null map 13 | let mut null_map = DynamicBitmap::new(); 14 | let mut attributes = Vec::new(); 15 | for (idx, value) in tuple.data.iter().enumerate() { 16 | null_map.set(idx, value.is_null()); 17 | if !value.is_null() { 18 | attributes.extend(ScalarValueCodec::encode(value)); 19 | } 20 | } 21 | 22 | let mut bytes = null_map.to_bytes(); 23 | bytes.extend(attributes); 24 | bytes 25 | }
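// Encoded layout: [null bitmap: ceil(column_count / 8) bytes] followed by the
// non-null values back-to-back; NULL columns occupy no value bytes. A 4-column
// tuple with one NULL therefore encodes as 1 bitmap byte plus 3 encoded values.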
26 | 27 | pub fn decode(bytes: &[u8], schema: SchemaRef) -> QuillSQLResult<DecodedData<Tuple>> { 28 | let mut total_offset = 0; 29 | 30 | let null_map_bytes = schema.column_count().div_ceil(8); 31 | let null_map = DynamicBitmap::from_bytes(&bytes[0..null_map_bytes]); 32 | total_offset += null_map_bytes; 33 | let mut bytes = &bytes[null_map_bytes..]; 34 | 35 | let mut data = vec![]; 36 | for (idx, col) in schema.columns.iter().enumerate() { 37 | let null = null_map.get(idx).ok_or(QuillSQLError::Internal( 38 | "null map size should be greater than or equal to col count".to_string(), 39 | ))?; 40 | if null { 41 | data.push(ScalarValue::new_empty(col.data_type)); 42 | } else { 43 | let (value, offset) = ScalarValueCodec::decode(bytes, col.data_type)?; 44 | data.push(value); 45 | total_offset += offset; 46 | bytes = &bytes[offset..]; 47 | } 48 | } 49 | 50 | Ok((Tuple::new(schema, data), total_offset)) 51 | } 52 | } 53 | 54 | #[cfg(test)] 55 | mod tests { 56 | use crate::catalog::{Column, DataType, Schema}; 57 | use crate::storage::codec::TupleCodec; 58 | use crate::storage::tuple::Tuple; 59 | use crate::utils::scalar::ScalarValue; 60 | use std::sync::Arc; 61 | 62 | #[test] 63 | fn tuple_codec() { 64 | let schema = Arc::new(Schema::new(vec![ 65 | Column::new("a", DataType::Boolean, true), 66 | Column::new("b", DataType::Int32, true), 67 | Column::new("c", DataType::UInt64, true), 68 | Column::new("d", DataType::Varchar(None), true), 69 | ])); 70 | let tuple = Tuple::new( 71 | schema.clone(), 72 | vec![ 73 | true.into(), 74 | ScalarValue::Int32(None), 75 | 1234u64.into(), 76 | "aabb".to_string().into(), 77 | ], 78 | ); 79 | let new_tuple = TupleCodec::decode(&TupleCodec::encode(&tuple), schema) 80 | .unwrap() 81 | .0; 82 | assert_eq!(new_tuple, tuple); 83 | } 84 | } 85 | -------------------------------------------------------------------------------- /public/terminal-preview.svg: -------------------------------------------------------------------------------- [SVG markup lost in extraction; only its text content survives: a mock terminal graphic titled "QUILLSQL TERMINAL" with GITHUB and PROFILE links, the banner "QUILLSQL / INTERACTIVE TTY · HELP FOR COMMANDS", the lines "Welcome to QUILLSQL Terminal. Type `help` to get started." and "Current SQL endpoint: /api/sql", a "quill@tty:~$" prompt, and the hint "type help for usage · Shift+Enter for newline".] -------------------------------------------------------------------------------- /src/storage/codec/meta_page.rs: -------------------------------------------------------------------------------- 1 | use crate::error::QuillSQLResult; 2 | use crate::storage::codec::{CommonCodec, DecodedData}; 3 | use crate::storage::page::MetaPage; 4 | 5 | pub struct MetaPageCodec; 6 | 7 | impl MetaPageCodec { 8 | pub fn encode(page: &MetaPage) -> Vec<u8> { 9 | let mut bytes = Vec::new(); 10 | bytes.extend(CommonCodec::encode_u32(page.major_version)); 11 | bytes.extend(CommonCodec::encode_u32(page.minor_version)); 12 | bytes.extend(CommonCodec::encode_u32(page.freelist_page_id)); 13 | bytes.extend(CommonCodec::encode_u32( 14 | page.information_schema_schemas_first_page_id, 15 | )); 16 | bytes.extend(CommonCodec::encode_u32( 17 | page.information_schema_tables_first_page_id, 18 | )); 19 | bytes.extend(CommonCodec::encode_u32( 20 | page.information_schema_columns_first_page_id, 21 | )); 22 | bytes.extend(CommonCodec::encode_u32( 23 | page.information_schema_indexes_first_page_id, 24 | )); 25 | bytes 26 | } 27 | 28 | pub fn decode(bytes: &[u8]) -> QuillSQLResult<DecodedData<MetaPage>> { 29 | let mut left_bytes = bytes; 30 | 31 | let (major_version, offset) = CommonCodec::decode_u32(left_bytes)?; 32 | left_bytes = &left_bytes[offset..]; 33 | let (minor_version, offset) = CommonCodec::decode_u32(left_bytes)?; 34 | left_bytes = &left_bytes[offset..]; 35 | let (freelist_page_id, offset) = CommonCodec::decode_u32(left_bytes)?; 36 | left_bytes = &left_bytes[offset..]; 37 | let (information_schema_schemas_first_page_id, offset) = 38 | CommonCodec::decode_u32(left_bytes)?; 39 | left_bytes = &left_bytes[offset..]; 40 | let (information_schema_tables_first_page_id, offset) = 41 | CommonCodec::decode_u32(left_bytes)?; 42 | left_bytes = &left_bytes[offset..]; 43 | let (information_schema_columns_first_page_id, offset) = 44 | CommonCodec::decode_u32(left_bytes)?; 45 | left_bytes = &left_bytes[offset..]; 46 | let (information_schema_indexes_first_page_id, offset) = 47 | CommonCodec::decode_u32(left_bytes)?; 48 | left_bytes = &left_bytes[offset..]; 49 | 50 | Ok(( 51 | MetaPage { 52 | major_version, 53 | minor_version, 54 | freelist_page_id, 55 | information_schema_schemas_first_page_id, 56 | information_schema_tables_first_page_id, 57 | information_schema_columns_first_page_id, 58 | information_schema_indexes_first_page_id, 59 | }, 60 | bytes.len() - left_bytes.len(), 61 | )) 62 | } 63 | } 64 | 65 | #[cfg(test)] 66 | mod tests { 67 | use crate::storage::codec::MetaPageCodec; 68 | use crate::storage::page::MetaPage; 69 | 70 | #[test] 71 | fn meta_page_codec() { 72 | let page = MetaPage::try_new().unwrap(); 73 | let (new_page, _) = MetaPageCodec::decode(&MetaPageCodec::encode(&page)).unwrap(); 74 | assert_eq!(page, new_page); 75 | } 76 | } 77 | -------------------------------------------------------------------------------- /src/utils/table_ref.rs: -------------------------------------------------------------------------------- 1 | #[derive(Debug, Clone, PartialEq, Eq, Hash, PartialOrd, Ord)] 2 | pub enum TableReference { 3 | /// An unqualified table reference, e.g. "table" 4 | Bare { 5 | /// The table name 6 | table: String, 7 | }, 8 | /// A partially resolved table reference, e.g. 
"schema.table" 9 | Partial { 10 | /// The schema containing the table 11 | schema: String, 12 | /// The table name 13 | table: String, 14 | }, 15 | /// A fully resolved table reference, e.g. "catalog.schema.table" 16 | Full { 17 | /// The catalog (aka database) containing the table 18 | catalog: String, 19 | /// The schema containing the table 20 | schema: String, 21 | /// The table name 22 | table: String, 23 | }, 24 | } 25 | 26 | impl TableReference { 27 | pub fn table(&self) -> &str { 28 | match self { 29 | Self::Full { table, .. } | Self::Partial { table, .. } | Self::Bare { table } => table, 30 | } 31 | } 32 | 33 | pub fn schema(&self) -> Option<&str> { 34 | match self { 35 | Self::Full { schema, .. } | Self::Partial { schema, .. } => Some(schema), 36 | _ => None, 37 | } 38 | } 39 | 40 | pub fn catalog(&self) -> Option<&str> { 41 | match self { 42 | Self::Full { catalog, .. } => Some(catalog), 43 | _ => None, 44 | } 45 | } 46 | 47 | pub fn resolved_eq(&self, other: &Self) -> bool { 48 | match self { 49 | TableReference::Bare { table } => table == other.table(), 50 | TableReference::Partial { schema, table } => { 51 | table == other.table() && other.schema().map_or(true, |s| s == schema) 52 | } 53 | TableReference::Full { 54 | catalog, 55 | schema, 56 | table, 57 | } => { 58 | table == other.table() 59 | && other.schema().map_or(true, |s| s == schema) 60 | && other.catalog().map_or(true, |c| c == catalog) 61 | } 62 | } 63 | } 64 | 65 | pub fn to_log_string(&self) -> String { 66 | match self { 67 | TableReference::Bare { table } => table.clone(), 68 | TableReference::Partial { schema, table } => format!("{schema}.{table}"), 69 | TableReference::Full { 70 | catalog, 71 | schema, 72 | table, 73 | } => format!("{catalog}.{schema}.{table}"), 74 | } 75 | } 76 | } 77 | 78 | impl std::fmt::Display for TableReference { 79 | fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { 80 | match self { 81 | TableReference::Bare { table } => write!(f, "{table}"), 82 | TableReference::Partial { schema, table } => { 83 | write!(f, "{schema}.{table}") 84 | } 85 | TableReference::Full { 86 | catalog, 87 | schema, 88 | table, 89 | } => write!(f, "{catalog}.{schema}.{table}"), 90 | } 91 | } 92 | } 93 | -------------------------------------------------------------------------------- /src/storage/codec/freelist_page.rs: -------------------------------------------------------------------------------- 1 | use crate::buffer::PAGE_SIZE; 2 | use crate::error::QuillSQLResult; 3 | use crate::storage::codec::{CommonCodec, DecodedData}; 4 | use crate::storage::page::{FreelistPage, FreelistPageHeader}; 5 | 6 | pub struct FreelistPageHeaderCodec; 7 | 8 | impl FreelistPageHeaderCodec { 9 | pub fn encode(header: &FreelistPageHeader) -> Vec { 10 | let mut bytes = Vec::new(); 11 | bytes.extend(CommonCodec::encode_u32(header.next_page_id)); 12 | bytes.extend(CommonCodec::encode_u32(header.current_size)); 13 | bytes.extend(CommonCodec::encode_u32(header.max_size)); 14 | bytes 15 | } 16 | 17 | pub fn decode(bytes: &[u8]) -> QuillSQLResult> { 18 | let mut left_bytes = bytes; 19 | 20 | let (next_page_id, offset) = CommonCodec::decode_u32(left_bytes)?; 21 | left_bytes = &left_bytes[offset..]; 22 | 23 | let (current_size, offset) = CommonCodec::decode_u32(left_bytes)?; 24 | left_bytes = &left_bytes[offset..]; 25 | 26 | let (max_size, offset) = CommonCodec::decode_u32(left_bytes)?; 27 | left_bytes = &left_bytes[offset..]; 28 | 29 | Ok(( 30 | FreelistPageHeader { 31 | next_page_id, 32 | current_size, 33 | max_size, 34 | 
}, 35 | bytes.len() - left_bytes.len(), 36 | )) 37 | } 38 | } 39 | 40 | pub struct FreelistPageCodec; 41 | 42 | impl FreelistPageCodec { 43 | pub fn encode(page: &FreelistPage) -> Vec<u8> { 44 | let mut bytes = Vec::new(); 45 | bytes.extend(FreelistPageHeaderCodec::encode(&page.header)); 46 | for i in 0..page.header.current_size { 47 | bytes.extend(CommonCodec::encode_u32(page.array[i as usize])) 48 | } 49 | // pad so the encoded length is exactly PAGE_SIZE 50 | assert!(bytes.len() <= PAGE_SIZE); 51 | bytes.extend(vec![0; PAGE_SIZE - bytes.len()]); 52 | bytes 53 | } 54 | 55 | pub fn decode(bytes: &[u8]) -> QuillSQLResult<DecodedData<FreelistPage>> { 56 | let mut left_bytes = bytes; 57 | 58 | let (header, offset) = FreelistPageHeaderCodec::decode(left_bytes)?; 59 | left_bytes = &left_bytes[offset..]; 60 | 61 | let mut array = Vec::new(); 62 | for _ in 0..header.current_size { 63 | let (page_id, offset) = CommonCodec::decode_u32(left_bytes)?; 64 | left_bytes = &left_bytes[offset..]; 65 | array.push(page_id); 66 | } 67 | 68 | Ok((FreelistPage { header, array }, PAGE_SIZE)) 69 | } 70 | } 71 | 72 | #[cfg(test)] 73 | mod tests { 74 | use crate::storage::codec::FreelistPageCodec; 75 | use crate::storage::page::{FreelistPage, FreelistPageHeader, FREELIST_PAGE_MAX_SIZE}; 76 | 77 | #[test] 78 | fn freelist_page_codec() { 79 | let page = FreelistPage { 80 | header: FreelistPageHeader { 81 | next_page_id: 1, 82 | current_size: 3, 83 | max_size: *FREELIST_PAGE_MAX_SIZE as u32, 84 | }, 85 | array: vec![5, 6, 8], 86 | }; 87 | let (new_page, _) = FreelistPageCodec::decode(&FreelistPageCodec::encode(&page)).unwrap(); 88 | assert_eq!(page, new_page); 89 | } 90 | } 91 | -------------------------------------------------------------------------------- /docs/src/modules/catalog.md: -------------------------------------------------------------------------------- 1 | # Catalog Module 2 | 3 | `src/catalog/` acts as QuillSQL’s data dictionary. It tracks schema/table/index metadata, 4 | statistics, and the mapping between logical names and physical storage objects such as 5 | `TableHeap` and `BPlusTreeIndex`. Every layer—planner, execution, background workers—uses 6 | the catalog to discover structure. 7 | 8 | --- 9 | 10 | ## Responsibilities 11 | 12 | - Persist definitions for schemas, tables, columns, indexes, and constraints. 13 | - Map logical `TableReference`s to physical handles (heap files, index roots, file ids). 14 | - Store table statistics (row counts, histograms) that drive ANALYZE and optimization. 15 | - Manage the DDL lifecycle: creation and deletion update the in-memory registry and the 16 | on-disk metadata pages. 17 | 18 | --- 19 | 20 | ## Directory Layout 21 | 22 | | Path | Description | Key Types | 23 | | ---- | ----------- | --------- | 24 | | `mod.rs` | Public API surface. | `Catalog`, `TableHandleRef` | 25 | | `schema.rs` | Schema objects and table references. | `Schema`, `Column`, `TableReference` | 26 | | `registry/` | Thread-safe registry for heaps (MVCC vacuum). | `TableRegistry` | 27 | | `statistics.rs` | ANALYZE output and helpers. | `TableStatistics` | 28 | | `loader.rs` | Boot-time metadata loader. | `load_catalog_data` | 29 | 30 | --- 31 | 32 | ## Core Concepts 33 | 34 | ### TableReference 35 | Unified identifier (database, schema, table). Logical planner, execution, and transaction 36 | code all use it when requesting handles from the catalog. 37 | 
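A quick illustration, grounded in the `TableReference` enum from `src/utils/table_ref.rs` (only the import path is an assumption here):

```rust
use quill_sql::utils::table_ref::TableReference; // path assumed for this doc

let bare = TableReference::Bare { table: "t1".into() };
let full = TableReference::Full {
    catalog: "quill".into(),
    schema: "public".into(),
    table: "t1".into(),
};
assert_eq!(full.to_string(), "quill.public.t1");
// A bare name matches any qualification of the same table:
assert!(bare.resolved_eq(&full));
```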
38 | ### Registries 39 | `TableRegistry` maps internal IDs to `Arc<TableHeap>` plus logical names. It is used by 40 | the MVCC vacuum worker to iterate user tables without poking directly into catalog data. 41 | 42 | ### Schema & Column 43 | `Schema` stores column definitions (type, default, nullability). Execution uses it when 44 | materialising tuples; the planner uses it to check expression types. `Schema::project` 45 | helps physical operators build projected outputs. 46 | 47 | ### TableStatistics 48 | `ANALYZE` writes row counts and histograms into the catalog. Optimizer rules and planner 49 | heuristics can consult these stats when deciding whether to push filters or pick indexes. 50 | Each column tracks null/non-null counts, min/max values, and a sample-based distinct 51 | estimate, enabling DuckDB-style selectivity heuristics (`1/distinct`, uniform ranges). 52 | 53 | --- 54 | 55 | ## Interactions 56 | 57 | - **SQL / Planner** – DDL planning calls `Catalog::create_table` / `create_index`; name 58 | binding relies on `Schema`. 59 | - **Execution** – `ExecutionContext::table_handle` and `index_handle` fetch physical 60 | handles through the catalog, so scans never hard-code heap locations. 61 | - **Background workers** – MVCC and index vacuum iterate the registries via `Arc` clones. 62 | - **Recovery** – `load_catalog_data` rebuilds the in-memory catalog from control files and 63 | metadata pages during startup. 64 | 65 | --- 66 | 67 | ## Teaching Ideas 68 | 69 | - Extend the schema system with hidden or computed columns and teach the catalog to store 70 | the extra metadata. 71 | - Add histogram bins to `TableStatistics` and demonstrate how a simple cost heuristic can 72 | choose better plans. 73 | - Turn on `RUST_LOG=catalog=debug` to observe how DDL mutates the registries. 74 | -------------------------------------------------------------------------------- /src/optimizer/rule/merge_limit.rs: -------------------------------------------------------------------------------- 1 | use crate::error::QuillSQLResult; 2 | use crate::optimizer::logical_optimizer::ApplyOrder; 3 | use crate::optimizer::LogicalOptimizerRule; 4 | use crate::plan::logical_plan::{Limit, LogicalPlan}; 5 | use std::cmp::min; 6 | use std::sync::Arc; 7 | 8 | pub struct MergeLimit; 9 | 10 | impl LogicalOptimizerRule for MergeLimit { 11 | fn try_optimize(&self, plan: &LogicalPlan) -> QuillSQLResult<Option<LogicalPlan>> { 12 | let LogicalPlan::Limit(parent) = plan else { 13 | return Ok(None); 14 | }; 15 | 16 | if let LogicalPlan::Limit(child) = &*parent.input { 17 | let new_limit = match (parent.limit, child.limit) { 18 | (Some(parent_limit), Some(child_limit)) => { 19 | Some(min(parent_limit, child_limit.saturating_sub(parent.offset))) 20 | } 21 | (Some(parent_limit), None) => Some(parent_limit), 22 | (None, Some(child_limit)) => Some(child_limit.saturating_sub(parent.offset)), 23 | (None, None) => None, 24 | };
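// Worked example (mirrors the test below): `LIMIT 10` over `LIMIT 1000`
// over `OFFSET 10` merges pairwise into `LIMIT 10 OFFSET 10`.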
53 | --- 54 | 55 | ## Interactions 56 | 57 | - **SQL / Planner** – DDL planning calls `Catalog::create_table` / `create_index`; name 58 | binding relies on `Schema`. 59 | - **Execution** – `ExecutionContext::table_handle` and `index_handle` fetch physical 60 | handles through the catalog, so scans never hard-code heap locations. 61 | - **Background workers** – MVCC and index vacuum iterate the registries via `Arc` clones. 62 | - **Recovery** – `load_catalog_data` rebuilds the in-memory catalog from control files and 63 | metadata pages during startup. 64 | 65 | --- 66 | 67 | ## Teaching Ideas 68 | 69 | - Extend the schema system with hidden or computed columns and teach the catalog to store 70 | the extra metadata. 71 | - Add histogram bins to `TableStatistics` and demonstrate how a simple cost heuristic can 72 | choose better plans. 73 | - Turn on `RUST_LOG=catalog=debug` to observe how DDL mutates the registries. 74 | -------------------------------------------------------------------------------- /src/optimizer/rule/merge_limit.rs: -------------------------------------------------------------------------------- 1 | use crate::error::QuillSQLResult; 2 | use crate::optimizer::logical_optimizer::ApplyOrder; 3 | use crate::optimizer::LogicalOptimizerRule; 4 | use crate::plan::logical_plan::{Limit, LogicalPlan}; 5 | use std::cmp::min; 6 | use std::sync::Arc; 7 | 8 | pub struct MergeLimit; 9 | 10 | impl LogicalOptimizerRule for MergeLimit { 11 | fn try_optimize(&self, plan: &LogicalPlan) -> QuillSQLResult<Option<LogicalPlan>> { 12 | let LogicalPlan::Limit(parent) = plan else { 13 | return Ok(None); 14 | }; 15 | 16 | if let LogicalPlan::Limit(child) = &*parent.input { 17 | let new_limit = match (parent.limit, child.limit) { 18 | (Some(parent_limit), Some(child_limit)) => { 19 | Some(min(parent_limit, child_limit.saturating_sub(parent.offset))) 20 | } 21 | (Some(parent_limit), None) => Some(parent_limit), 22 | (None, Some(child_limit)) => Some(child_limit.saturating_sub(parent.offset)), 23 | (None, None) => None, 24 | }; 25 | let plan = LogicalPlan::Limit(Limit { 26 | limit: new_limit, 27 | offset: child.offset + parent.offset, 28 | input: Arc::new((*child.input).clone()), 29 | }); 30 | self.try_optimize(&plan) 31 | .map(|opt_plan| opt_plan.or_else(|| Some(plan))) 32 | } else { 33 | Ok(None) 34 | } 35 | } 36 | 37 | fn name(&self) -> &str { 38 | "MergeLimit" 39 | } 40 | 41 | fn apply_order(&self) -> Option<ApplyOrder> { 42 | Some(ApplyOrder::TopDown) 43 | } 44 | } 45 | 46 | #[cfg(test)] 47 | mod tests { 48 | use crate::catalog::EMPTY_SCHEMA_REF; 49 | use crate::optimizer::rule::MergeLimit; 50 | use crate::optimizer::LogicalOptimizer; 51 | use crate::plan::logical_plan::{EmptyRelation, Limit, LogicalPlan}; 52 | use std::sync::Arc; 53 | 54 | fn build_optimizer() -> LogicalOptimizer { 55 | LogicalOptimizer::with_rules(vec![Arc::new(MergeLimit)]) 56 | } 57 | 58 | #[test] 59 | fn merge_limit() { 60 | let plan = LogicalPlan::Limit(Limit { 61 | limit: Some(10), 62 | offset: 0, 63 | input: Arc::new(LogicalPlan::Limit(Limit { 64 | limit: Some(1000), 65 | offset: 0, 66 | input: Arc::new(LogicalPlan::Limit(Limit { 67 | limit: None, 68 | offset: 10, 69 | input: Arc::new(LogicalPlan::EmptyRelation(EmptyRelation { 70 | produce_one_row: false, 71 | schema: EMPTY_SCHEMA_REF.clone(), 72 | })), 73 | })), 74 | })), 75 | }); 76 | let optimized_plan = build_optimizer().optimize(&plan).unwrap(); 77 | 78 | if let LogicalPlan::Limit(Limit { 79 | limit, 80 | offset, 81 | input, 82 | }) = optimized_plan 83 | { 84 | assert_eq!(limit, Some(10)); 85 | assert_eq!(offset, 10); 86 | assert!(matches!(input.as_ref(), LogicalPlan::EmptyRelation(_))); 87 | } else { 88 | panic!("the first node should be limit"); 89 | } 90 | } 91 | } 92 | -------------------------------------------------------------------------------- /src/plan/logical_planner/plan_query.rs: -------------------------------------------------------------------------------- 1 | use crate::error::{QuillSQLError, QuillSQLResult}; 2 | use crate::expression::Expr; 3 | use crate::utils::scalar::ScalarValue; 4 | use std::sync::Arc; 5 | 6 | use crate::plan::logical_plan::{Limit, LogicalPlan, Sort}; 7 | 8 | use super::LogicalPlanner; 9 | 10 | impl<'a> LogicalPlanner<'a> { 11 | pub fn plan_query(&self, query: &sqlparser::ast::Query) -> QuillSQLResult<LogicalPlan> { 12 | let plan = self.plan_set_expr(&query.body)?; 13 | let plan = self.plan_order_by(plan, &query.order_by)?; 14 | self.plan_limit(plan, &query.limit, &query.offset) 15 | } 16 | 17 | pub fn plan_order_by( 18 | &self, 19 | input: LogicalPlan, 20 | order_by: &Vec<sqlparser::ast::OrderByExpr>, 21 | ) -> QuillSQLResult<LogicalPlan> { 22 | if order_by.is_empty() { 23 | return Ok(input); 24 | } 25 | 26 | let mut order_by_exprs = vec![]; 27 | for order in order_by { 28 | order_by_exprs.push(self.bind_order_by_expr(order)?); 29 | } 30 | 31 | Ok(LogicalPlan::Sort(Sort { 32 | order_by: order_by_exprs, 33 | input: Arc::new(input), 34 | limit: None, 35 | })) 36 | } 37 | 38 | pub fn plan_limit( 39 | &self, 40 | input: LogicalPlan, 41 | limit: &Option<sqlparser::ast::Expr>, 42 | offset: &Option<sqlparser::ast::Offset>, 43 | ) -> QuillSQLResult<LogicalPlan> { 44 | if limit.is_none() && offset.is_none() { 45 | return Ok(input); 46 | } 47 | 48 | let limit = match limit { 49 | None => None, 50 | Some(limit_expr) => { 51 | let n = match self.bind_expr(limit_expr)? { 52 | Expr::Literal(lit) => match lit.value { 53 | ScalarValue::Int64(Some(v)) if v >= 0 => Ok(v as usize), 54 | _ => Err(QuillSQLError::Plan(format!( 55 | "LIMIT must be a non-negative integer, {}", 56 | lit.value 57 | ))), 58 | }, 59 | _ => Err(QuillSQLError::Plan(format!( 60 | "LIMIT must be literal, {}", 61 | limit_expr 62 | ))), 63 | }?; 64 | Some(n) 65 | } 66 | }; 67 | 68 | let offset = match offset { 69 | None => 0, 70 | Some(offset_expr) => match self.bind_expr(&offset_expr.value)?
{ 71 | Expr::Literal(lit) => match lit.value { 72 | ScalarValue::Int64(Some(v)) => { 73 | if v < 0 { 74 | return Err(QuillSQLError::Plan(format!("Offset must be >= 0, {}", v))); 75 | } 76 | Ok(v as usize) 77 | } 78 | _ => Err(QuillSQLError::Plan(format!( 79 | "Offset value not int64, {}", 80 | lit.value 81 | ))), 82 | }, 83 | _ => Err(QuillSQLError::Plan(format!( 84 | "OFFSET must be a literal expression, {}", 85 | offset_expr 86 | ))), 87 | }?, 88 | }; 89 | 90 | Ok(LogicalPlan::Limit(Limit { 91 | limit, 92 | offset, 93 | input: Arc::new(input), 94 | })) 95 | } 96 | } 97 | -------------------------------------------------------------------------------- /docs/src/contributing.md: -------------------------------------------------------------------------------- 1 | # Contributor's Guide 2 | 3 | Welcome, and thank you for your interest in contributing to QuillSQL! Whether you're fixing a bug, adding a new feature, or improving the documentation, this guide will help you get started. 4 | 5 | ## 1. Getting Started: Your Development Environment 6 | 7 | ### Prerequisites 8 | 9 | - **Rust**: QuillSQL is written in Rust. If you don't have it yet, install it via [rustup](https://rustup.rs/). This will provide you with `rustc` (the compiler) and `cargo` (the package manager and build tool). 10 | ```bash 11 | curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh 12 | ``` 13 | - **Build Essentials**: Ensure you have a C/C++ toolchain such as `gcc` or `clang` installed; some Rust crates compile native code and need one to build. 14 | 15 | ### Setup 16 | 17 | 1. **Fork the Repository**: Start by forking the main QuillSQL repository to your own GitHub account. 18 | 19 | 2. **Clone Your Fork**: Clone your forked repository to your local machine, replacing `<your-username>` with your GitHub handle. 20 | ```bash 21 | git clone https://github.com/<your-username>/QuillSQL.git 22 | cd QuillSQL 23 | ``` 24 | 25 | 3. **Build the Project**: Compile the entire project to ensure everything is set up correctly. 26 | ```bash 27 | cargo build 28 | ``` 29 | 30 | ## 2. Development Workflow 31 | 32 | ### Running Tests 33 | 34 | Before and after making any changes, it's crucial to run the test suite to ensure you haven't broken anything. 35 | 36 | - **Run all unit and integration tests**: 37 | ```bash 38 | cargo test 39 | ``` 40 | 41 | - **Run the benchmark suite**: 42 | ```bash 43 | cargo bench 44 | ``` 45 | 46 | ### Code Style and Quality 47 | 48 | We adhere to the standard Rust coding style and use tools to enforce it. 49 | 50 | - **Formatting**: Before committing, please format your code with `rustfmt`. 51 | ```bash 52 | cargo fmt --all 53 | ``` 54 | 55 | - **Linting**: We use `clippy` to catch common mistakes and improve code quality. Please ensure `clippy` passes without warnings. 56 | ```bash 57 | cargo clippy --all-targets -- -D warnings 58 | ``` 59 | 60 | ### Submitting Your Contribution 61 | 62 | 1. **Create a New Branch**: Create a branch with a descriptive name for your feature or bugfix. 63 | ```bash 64 | git checkout -b my-awesome-feature 65 | ``` 66 | 67 | 2. **Make Your Changes**: Write your code. Add new tests to cover your changes. Ensure all existing tests still pass. 68 | 69 | 3. **Format and Lint**: Run `cargo fmt` and `cargo clippy` as described above. 70 | 71 | 4. **Commit Your Work**: Write a clear and concise commit message. 72 | ```bash 73 | git add . 74 | git commit -m "feat: Add support for window functions" 75 | ``` 76 | 77 | 5. **Push to Your Fork**: Push your branch to your fork on GitHub. 78 | ```bash 79 | git push -u origin my-awesome-feature 80 | ``` 81 | 82 | 6.
**Open a Pull Request**: Go to the original QuillSQL repository on GitHub. You should see a prompt to open a Pull Request from your new branch. Fill out the PR template with a description of your changes. 83 | 84 | ## 3. Working on the Documentation 85 | 86 | The documentation is built using `mdbook`. To preview your changes locally, you'll need to install it and the `mermaid` plugin. 87 | 88 | 1. **Install `mdbook` and `mdbook-mermaid`**: 89 | ```bash 90 | cargo install mdbook 91 | cargo install mdbook-mermaid 92 | ``` 93 | 94 | 2. **Serve the Book Locally**: Run the following command from the root of the project. 95 | ```bash 96 | mdbook serve docs 97 | ``` 98 | This will build the book and start a local web server. You can open your browser to `http://localhost:3000` to see the live-previewed documentation. 99 | -------------------------------------------------------------------------------- /src/execution/mod.rs: -------------------------------------------------------------------------------- 1 | pub mod physical_plan; 2 | use crate::catalog::SchemaRef; 3 | use crate::error::{QuillSQLError, QuillSQLResult}; 4 | use crate::execution::physical_plan::PhysicalPlan; 5 | use crate::expression::{Expr, ExprTrait}; 6 | use crate::storage::{ 7 | engine::{StorageEngine, TableBinding}, 8 | table_heap::TableHeap, 9 | tuple::Tuple, 10 | }; 11 | use crate::transaction::{Transaction, TransactionManager, TxnContext}; 12 | use crate::utils::scalar::ScalarValue; 13 | use crate::{catalog::Catalog, utils::table_ref::TableReference}; 14 | use std::sync::Arc; 15 | 16 | pub trait VolcanoExecutor { 17 | fn init(&self, _context: &mut ExecutionContext) -> QuillSQLResult<()> { 18 | Ok(()) 19 | } 20 | 21 | fn next(&self, context: &mut ExecutionContext) -> QuillSQLResult<Option<Tuple>>; 22 | 23 | fn output_schema(&self) -> SchemaRef; 24 | } 25 | 26 | /// Shared state threaded through every physical operator during execution. 27 | /// Exposes MVCC helpers, storage access, expression evaluation and DDL utilities. 28 | pub struct ExecutionContext<'a> { 29 | /// Mutable reference to the global catalog (schema + metadata). 30 | pub catalog: &'a mut Catalog, 31 | /// Pluggable storage engine used for heap/index access. 32 | storage: Arc<dyn StorageEngine>, 33 | /// Transaction runtime wrapper (snapshot, locks, undo tracking). 34 | txn: TxnContext<'a>, 35 | } 36 | 37 | impl<'a> ExecutionContext<'a> { 38 | pub fn new( 39 | catalog: &'a mut Catalog, 40 | txn: &'a mut Transaction, 41 | txn_mgr: Arc<TransactionManager>, 42 | storage: Arc<dyn StorageEngine>, 43 | ) -> Self { 44 | Self { 45 | catalog, 46 | storage, 47 | txn: TxnContext::new(txn_mgr, txn), 48 | } 49 | } 50 | 51 | /// Evaluate an expression expected to produce a boolean result. 52 | pub fn eval_predicate(&self, expr: &Expr, tuple: &Tuple) -> QuillSQLResult<bool> { 53 | match expr.evaluate(tuple)? { 54 | ScalarValue::Boolean(Some(v)) => Ok(v), 55 | ScalarValue::Boolean(None) => Ok(false), 56 | other => Err(QuillSQLError::Execution(format!( 57 | "predicate value must be boolean, got {}", 58 | other 59 | ))), 60 | } 61 | } 62 | 63 | /// Evaluate an arbitrary scalar expression. 64 | pub fn eval_expr(&self, expr: &Expr, tuple: &Tuple) -> QuillSQLResult<ScalarValue> { 65 | expr.evaluate(tuple) 66 | } 67 | 68 | /// Look up the table heap through the storage engine.
69 | pub fn table(&self, table: &TableReference) -> QuillSQLResult<TableBinding> { 70 | self.storage.table(self.catalog, table) 71 | } 72 | 73 | pub fn table_heap(&self, table: &TableReference) -> QuillSQLResult<Arc<TableHeap>> { 74 | Ok(self.table(table)?.table_heap()) 75 | } 76 | 77 | pub fn txn_ctx(&self) -> &TxnContext<'a> { 78 | &self.txn 79 | } 80 | 81 | pub fn txn_ctx_mut(&mut self) -> &mut TxnContext<'a> { 82 | &mut self.txn 83 | } 84 | } 85 | 86 | pub struct ExecutionEngine<'a> { 87 | pub context: ExecutionContext<'a>, 88 | } 89 | impl<'a> ExecutionEngine<'a> { 90 | pub fn execute(&mut self, plan: Arc<PhysicalPlan>) -> QuillSQLResult<Vec<Tuple>> { 91 | plan.init(&mut self.context)?; 92 | let mut result = Vec::new(); 93 | loop { 94 | let next_tuple = plan.next(&mut self.context)?; 95 | if let Some(tuple) = next_tuple { 96 | result.push(tuple); 97 | } else { 98 | break; 99 | } 100 | } 101 | Ok(result) 102 | } 103 | } 104 | -------------------------------------------------------------------------------- /src/execution/physical_plan/nested_loop_join.rs: -------------------------------------------------------------------------------- 1 | //! Teaching-first nested loop join with optional predicate evaluation. 2 | 3 | use log::debug; 4 | use std::cell::RefCell; 5 | use std::sync::Arc; 6 | 7 | use crate::catalog::SchemaRef; 8 | use crate::expression::Expr; 9 | use crate::{ 10 | error::QuillSQLResult, 11 | execution::{ExecutionContext, VolcanoExecutor}, 12 | plan::logical_plan::JoinType, 13 | storage::tuple::Tuple, 14 | }; 15 | 16 | use super::PhysicalPlan; 17 | 18 | #[derive(Debug)] 19 | pub struct PhysicalNestedLoopJoin { 20 | pub join_type: JoinType, 21 | pub condition: Option<Expr>, 22 | pub left_input: Arc<PhysicalPlan>, 23 | pub right_input: Arc<PhysicalPlan>, 24 | pub schema: SchemaRef, 25 | 26 | left_tuple: RefCell<Option<Tuple>>, 27 | } 28 | impl PhysicalNestedLoopJoin { 29 | pub fn new( 30 | join_type: JoinType, 31 | condition: Option<Expr>, 32 | left_input: Arc<PhysicalPlan>, 33 | right_input: Arc<PhysicalPlan>, 34 | schema: SchemaRef, 35 | ) -> Self { 36 | PhysicalNestedLoopJoin { 37 | join_type, 38 | condition, 39 | left_input, 40 | right_input, 41 | schema, 42 | left_tuple: RefCell::new(None), 43 | } 44 | } 45 | } 46 | impl VolcanoExecutor for PhysicalNestedLoopJoin { 47 | fn init(&self, context: &mut ExecutionContext) -> QuillSQLResult<()> { 48 | debug!("init nested loop join executor"); 49 | self.left_input.init(context)?; 50 | self.right_input.init(context)?; 51 | self.left_tuple.borrow_mut().take(); 52 | Ok(()) 53 | } 54 | fn next(&self, context: &mut ExecutionContext) -> QuillSQLResult<Option<Tuple>> { 55 | let mut left_next_tuple = if self.left_tuple.borrow().is_none() { 56 | self.left_input.next(context)? 57 | } else { 58 | self.left_tuple.borrow().clone() 59 | }; 60 | 61 | while left_next_tuple.is_some() { 62 | let left_tuple = left_next_tuple.clone().unwrap(); 63 | 64 | let mut right_next_tuple = self.right_input.next(context)?; 65 | while right_next_tuple.is_some() { 66 | let right_tuple = right_next_tuple.unwrap(); 67 | 68 | // TODO: honour join_type (only inner-join semantics are implemented) 69 | if let Some(condition) = &self.condition { 70 | let merged_tuple = 71 | Tuple::try_merge(vec![left_tuple.clone(), right_tuple.clone()])?; 72 | if context.eval_predicate(condition, &merged_tuple)?
{ 73 | self.left_tuple.borrow_mut().replace(left_tuple.clone()); 74 | return Ok(Some(Tuple::try_merge(vec![left_tuple, right_tuple])?)); 75 | } 76 | } else { 77 | // remember the current left tuple before returning 78 | self.left_tuple.borrow_mut().replace(left_tuple.clone()); 79 | 80 | return Ok(Some(Tuple::try_merge(vec![left_tuple, right_tuple])?)); 81 | } 82 | 83 | right_next_tuple = self.right_input.next(context)?; 84 | } 85 | 86 | // reset right executor 87 | self.right_input.init(context)?; 88 | left_next_tuple = self.left_input.next(context)?; 89 | } 90 | Ok(None) 91 | } 92 | 93 | fn output_schema(&self) -> SchemaRef { 94 | self.schema.clone() 95 | } 96 | } 97 | 98 | impl std::fmt::Display for PhysicalNestedLoopJoin { 99 | fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { 100 | write!(f, "NestedLoopJoin") 101 | } 102 | } 103 | -------------------------------------------------------------------------------- /src/tests/sql_test.rs: -------------------------------------------------------------------------------- 1 | use crate::database::Database; 2 | use crate::error::QuillSQLError; 3 | use crate::session::SessionContext; 4 | use crate::storage::tuple::Tuple; 5 | use regex::Regex; 6 | use sqllogictest::{DBOutput, DefaultColumnType}; 7 | use std::path::{Path, PathBuf}; 8 | 9 | pub struct QuillSQLDB { 10 | db: Database, 11 | session: SessionContext, 12 | } 13 | 14 | impl Default for QuillSQLDB { 15 | fn default() -> Self { 16 | Self::new() 17 | } 18 | } 19 | 20 | impl QuillSQLDB { 21 | pub fn new() -> Self { 22 | let db = Database::new_temp().unwrap(); 23 | let session = SessionContext::new(db.default_isolation()); 24 | Self { db, session } 25 | } 26 | } 27 | 28 | fn tuples_to_sqllogictest_string(tuples: Vec<Tuple>) -> Vec<Vec<String>> { 29 | let mut output = vec![]; 30 | for tuple in tuples.iter() { 31 | let mut row = vec![]; 32 | for value in tuple.data.iter() { 33 | row.push(format!("{value}")); 34 | } 35 | output.push(row); 36 | } 37 | output 38 | } 39 | 40 | impl sqllogictest::DB for QuillSQLDB { 41 | type Error = QuillSQLError; 42 | type ColumnType = DefaultColumnType; 43 | 44 | fn run(&mut self, sql: &str) -> Result<DBOutput<Self::ColumnType>, Self::Error> { 45 | let is_query_sql = { 46 | let lower_sql = sql.trim_start().to_ascii_lowercase(); 47 | lower_sql.starts_with("select") || lower_sql.starts_with("explain") 48 | }; 49 | let tuples = self.db.run_with_session(&mut self.session, sql)?; 50 | if tuples.is_empty() { 51 | if is_query_sql { 52 | return Ok(DBOutput::Rows { 53 | types: vec![], 54 | rows: vec![], 55 | }); 56 | } else { 57 | return Ok(DBOutput::StatementComplete(0)); 58 | } 59 | } 60 | let types = vec![DefaultColumnType::Any; tuples[0].schema.column_count()]; 61 | let rows = tuples_to_sqllogictest_string(tuples); 62 | Ok(DBOutput::Rows { types, rows }) 63 | } 64 | } 65 | 66 | #[test] 67 | fn sqllogictest() { 68 | let test_files = read_dir_recursive("src/tests/sql_example/"); 69 | println!("test_files: {:?}", test_files); 70 | 71 | for file in test_files { 72 | let db = QuillSQLDB::new(); 73 | let mut tester = sqllogictest::Runner::new(db); 74 | println!( 75 | "======== start to run file {} ========", 76 | file.to_str().unwrap() 77 | ); 78 | tester.run_file(file).unwrap(); 79 | } 80 | } 81 | 82 | #[allow(dead_code)] 83 | fn read_dir_recursive<P: AsRef<Path>>(path: P) -> Vec<PathBuf> { 84 | let mut dst = vec![]; 85 | read_dir_recursive_impl(&mut dst, path.as_ref()); 86 | dst 87 | } 88 | 89 | fn read_dir_recursive_impl(dst: &mut Vec<PathBuf>, path: &Path) { 90 | let push_file = |dst: &mut Vec<PathBuf>, path: PathBuf| { 91 | // skip _xxx.slt
file 92 | if Regex::new(r"/_.*\.slt") 93 | .unwrap() 94 | .is_match(path.to_str().unwrap()) 95 | { 96 | println!("skip file: {:?}", path); 97 | } else { 98 | dst.push(path); 99 | } 100 | }; 101 | 102 | if path.is_dir() { 103 | let entries = std::fs::read_dir(path).unwrap(); 104 | for entry in entries { 105 | let path = entry.unwrap().path(); 106 | 107 | if path.is_dir() { 108 | read_dir_recursive_impl(dst, &path); 109 | } else { 110 | push_file(dst, path); 111 | } 112 | } 113 | } else { 114 | push_file(dst, path.to_path_buf()); 115 | } 116 | } 117 | -------------------------------------------------------------------------------- /src/execution/physical_plan/seq_scan.rs: -------------------------------------------------------------------------------- 1 | //! Table sequential scan operator (full-table read with MVCC filtering). 2 | 3 | use std::cell::RefCell; 4 | use std::sync::OnceLock; 5 | 6 | use super::scan::ScanPrefetch; 7 | use crate::catalog::SchemaRef; 8 | use crate::execution::physical_plan::{resolve_table_binding, stream_not_ready}; 9 | use crate::storage::{ 10 | engine::{TableBinding, TupleStream}, 11 | page::{RecordId, TupleMeta}, 12 | }; 13 | use crate::transaction::LockMode; 14 | use crate::utils::table_ref::TableReference; 15 | use crate::{ 16 | error::QuillSQLResult, 17 | execution::{ExecutionContext, VolcanoExecutor}, 18 | storage::tuple::Tuple, 19 | }; 20 | 21 | const PREFETCH_BATCH: usize = 64; 22 | 23 | pub struct PhysicalSeqScan { 24 | pub table: TableReference, 25 | pub table_schema: SchemaRef, 26 | 27 | iterator: RefCell<Option<Box<dyn TupleStream>>>, 28 | prefetch: ScanPrefetch, 29 | table_binding: OnceLock<TableBinding>, 30 | } 31 | 32 | impl PhysicalSeqScan { 33 | pub fn new(table: TableReference, table_schema: SchemaRef) -> Self { 34 | PhysicalSeqScan { 35 | table, 36 | table_schema, 37 | iterator: RefCell::new(None), 38 | prefetch: ScanPrefetch::new(PREFETCH_BATCH), 39 | table_binding: OnceLock::new(), 40 | } 41 | } 42 | 43 | fn consume_row( 44 | &self, 45 | context: &mut ExecutionContext, 46 | rid: RecordId, 47 | meta: TupleMeta, 48 | tuple: Tuple, 49 | ) -> QuillSQLResult<Option<Tuple>> { 50 | context 51 | .txn_ctx_mut() 52 | .read_visible_tuple(&self.table, rid, &meta, tuple) 53 | } 54 | } 55 | 56 | impl VolcanoExecutor for PhysicalSeqScan { 57 | fn init(&self, context: &mut ExecutionContext) -> QuillSQLResult<()> { 58 | context 59 | .txn_ctx_mut() 60 | .lock_table(self.table.clone(), LockMode::IntentionShared)?; 61 | let binding = resolve_table_binding(&self.table_binding, context, &self.table)?; 62 | let stream = binding.scan()?; 63 | self.iterator.replace(Some(stream)); 64 | self.prefetch.clear(); 65 | Ok(()) 66 | } 67 | 68 | fn next(&self, context: &mut ExecutionContext) -> QuillSQLResult<Option<Tuple>> { 69 | loop { 70 | if let Some((rid, meta, tuple)) = self.prefetch.pop_front() { 71 | if let Some(result) = self.consume_row(context, rid, meta, tuple)? { 72 | return Ok(Some(result)); 73 | } 74 | continue; 75 | } 76 | 77 | if !self.prefetch.refill(|limit, out| { 78 | let mut guard = self.iterator.borrow_mut(); 79 | let stream = guard.as_mut().ok_or_else(|| stream_not_ready("SeqScan"))?; 80 | for _ in 0..limit { 81 | match stream.next()? { 82 | Some(entry) => out.push_back(entry), 83 | None => break, 84 | } 85 | } 86 | Ok(()) 87 | })?
{ 88 | return Ok(None); 89 | } 90 | } 91 | } 92 | 93 | fn output_schema(&self) -> SchemaRef { 94 | self.table_schema.clone() 95 | } 96 | } 97 | 98 | impl std::fmt::Display for PhysicalSeqScan { 99 | fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { 100 | write!(f, "SeqScan") 101 | } 102 | } 103 | 104 | impl std::fmt::Debug for PhysicalSeqScan { 105 | fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { 106 | f.debug_struct("PhysicalSeqScan") 107 | .field("table", &self.table) 108 | .field("table_schema", &self.table_schema) 109 | .finish() 110 | } 111 | } 112 | -------------------------------------------------------------------------------- /docs/src/modules/storage.md: -------------------------------------------------------------------------------- 1 | # Storage Engine 2 | 3 | The storage engine persists relational data, covering heap files, indexes, page formats, 4 | and the handles exposed to execution. Understanding this layer is key to reasoning about 5 | performance, MVCC, and recovery. 6 | 7 | --- 8 | 9 | ## Responsibilities 10 | 11 | - Manage `TableHeap` insert/delete/update paths and their MVCC metadata. 12 | - Maintain indexes (see the [Index module](./index.md) for details). 13 | - Expose the `StorageEngine` trait so execution can fetch `TableHandle` / `IndexHandle` 14 | instances per table. 15 | - Provide `TupleStream` so sequential and index scans share a unified interface. 16 | 17 | --- 18 | 19 | ## Directory Layout 20 | 21 | | Path | Purpose | Key Types | 22 | | ---- | ------- | --------- | 23 | | `engine.rs` | Default engine plus handle definitions. | `StorageEngine`, `TableHandle`, `TupleStream` | 24 | | `table_heap/` | Heap storage + MVCC logic. | `TableHeap`, `MvccHeap` | 25 | | `index/` | B+Tree implementation. | `BPlusTreeIndex` | 26 | | `page/` | Page, RID, tuple metadata. | `Page`, `RecordId`, `TupleMeta` | 27 | | `tuple/` | Row encoding and projection helpers. | `Tuple` | 28 | | `disk_manager.rs` | File layout and page I/O. | `DiskManager` | 29 | | `disk_scheduler.rs` | `io_uring`-backed async scheduler. | `DiskScheduler` | 30 | 31 | --- 32 | 33 | ## Core Abstractions 34 | 35 | ### StorageEngine Trait 36 | ```rust 37 | pub trait StorageEngine { 38 | fn table(&self, catalog: &Catalog, table: &TableReference) 39 | -> QuillSQLResult<Arc<dyn TableHandle>>; 40 | fn indexes(&self, catalog: &Catalog, table: &TableReference) 41 | -> QuillSQLResult<Vec<Arc<dyn IndexHandle>>>; 42 | } 43 | ``` 44 | The default implementation wraps the row-oriented heap + B+Tree combo, but the trait is 45 | ready for column stores, remote storage, or async engines. 46 | 47 | ### TableHandle 48 | Offers `full_scan()`, `insert`, `delete`, `update`, and 49 | `prepare_row_for_write`. MVCC, undo, and locking concerns live here so execution operators 50 | only describe intent. Every delete/update now receives the table’s index handles so 51 | `HeapTableHandle` can delete or re-insert keys in tandem with heap tuples—exactly the 52 | behaviour CMU 15-445’s buffer/heap projects walk you through. 53 | 54 | ### TupleStream 55 | Minimal iterator that returns `(RecordId, TupleMeta, Tuple)` triples. Index scans use 56 | `IndexScanRequest` to describe ranges. 57 |
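A compact sketch of how these two abstractions can fit together. The trait bodies and helper types below are illustrative assumptions (the real definitions live in `engine.rs`); the point is that an operator drains a `TupleStream` the same way whatever handle produced it.

```rust
// Simplified stand-ins so the sketch is self-contained.
pub struct RecordId { pub page_id: u32, pub slot: u16 }
pub struct TupleMeta { pub deleted: bool }
pub struct Tuple { pub data: Vec<u8> }
pub type QuillSQLResult<T> = Result<T, String>;

/// Assumed shape of a scan stream: pull one (rid, meta, tuple) at a time.
pub trait TupleStream {
    fn next(&mut self) -> QuillSQLResult<Option<(RecordId, TupleMeta, Tuple)>>;
}

/// Assumed shape of a table handle: intent-level mutations plus scans.
pub trait TableHandle {
    fn full_scan(&self) -> QuillSQLResult<Box<dyn TupleStream>>;
    fn insert(&self, tuple: Tuple) -> QuillSQLResult<RecordId>;
    fn delete(&self, rid: RecordId) -> QuillSQLResult<()>;
}

/// Any operator can drain a stream the same way, regardless of the source.
pub fn count_live(handle: &dyn TableHandle) -> QuillSQLResult<usize> {
    let mut stream = handle.full_scan()?;
    let mut live = 0;
    while let Some((_rid, meta, _tuple)) = stream.next()? {
        if !meta.deleted {
            live += 1;
        }
    }
    Ok(live)
}
```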
58 | --- 59 | 60 | ## Interactions 61 | 62 | - **Execution** – `ExecutionContext::table_stream` / `index_stream` delegate to handles. 63 | - **Transaction** – Handle methods call into `TxnContext` to acquire locks, record undo, 64 | and emit WAL. 65 | - **Buffer Manager** – `TableHeap`/`BPlusTreeIndex` access pages through the shared buffer 66 | pool. 67 | - **Recovery** – Heap/index mutations generate WAL records (`HeapInsert`, `HeapDelete`, 68 | `IndexInsert`, …) that ARIES replays. 69 | - **Background** – MVCC vacuum and index cleanup obtain handles and iterate tuples via 70 | the same abstractions as foreground scans. 71 | 72 | --- 73 | 74 | ## Teaching Ideas 75 | 76 | - Implement a toy columnar handle to show how the execution engine can stay agnostic to 77 | storage layout. 78 | - Extend the `TableHandle::full_scan` / `TableIterator` plumbing to accept projection hints 79 | so students can experiment with column pruning. 80 | - Enable `RUST_LOG=storage::table_heap=trace` and trace MVCC version chains as updates 81 | occur. 82 | - Follow the CMU 15-445 Lab 2 flow: instrument `TableBinding::delete` to print every RID 83 | + key pair, run an UPDATE with multiple indexes, and confirm the WAL stream contains the 84 | matching HeapInsert/HeapDelete + IndexLeafInsert/IndexLeafDelete entries. 85 | 86 | --- 87 | 88 | Further reading: [Disk I/O](../storage/disk_io.md), 89 | [Page & Tuple Layout](../storage/page_layouts.md), 90 | [Table Heap & MVCC](../storage/table_heap.md) 91 | -------------------------------------------------------------------------------- /src/utils/cache/tiny_lfu.rs: -------------------------------------------------------------------------------- 1 | use std::collections::hash_map::DefaultHasher; 2 | use std::hash::{Hash, Hasher}; 3 | 4 | /// TinyLFU-style admission filter: approximate frequency via 4-bit counters in a simple 5 | /// Count-Min Sketch. This is a minimal, lockless (external locking) and CPU-cheap version 6 | /// intended to bias admission decisions before the main replacer. 7 | #[derive(Debug)] 8 | pub struct TinyLFU { 9 | width: usize, 10 | depth: usize, 11 | tables: Vec<Vec<u8>>, // 4-bit per counter packed into u8 (2 counters per byte) 12 | } 13 | 14 | impl TinyLFU { 15 | pub fn new(width: usize, depth: usize) -> Self { 16 | let width = width.next_power_of_two(); 17 | let depth = depth.max(1).min(4); 18 | let tables = (0..depth).map(|_| vec![0u8; (width + 1) / 2]).collect(); 19 | Self { 20 | width, 21 | depth, 22 | tables, 23 | } 24 | } 25 | 26 | #[inline] 27 | fn hash_i(&self, key: u64, i: usize) -> usize { 28 | let mut h = DefaultHasher::new(); 29 | (key.wrapping_add((i as u64) << 32)).hash(&mut h); 30 | (h.finish() as usize) & (self.width - 1) 31 | } 32 | 33 | #[inline] 34 | fn load_counter(slot: &mut [u8], idx: usize) -> u8 { 35 | let byte = &mut slot[idx / 2]; 36 | if idx % 2 == 0 { 37 | *byte & 0x0F 38 | } else { 39 | (*byte >> 4) & 0x0F 40 | } 41 | } 42 | 43 | #[inline] 44 | fn store_counter(slot: &mut [u8], idx: usize, val: u8) { 45 | let b = &mut slot[idx / 2]; 46 | if idx % 2 == 0 { 47 | *b = (*b & 0xF0) | (val & 0x0F); 48 | } else { 49 | *b = (*b & 0x0F) | ((val & 0x0F) << 4); 50 | } 51 | } 52 | 53 | /// Record an access for the 64-bit key. 54 | pub fn admit_record(&mut self, key: u64) { 55 | for i in 0..self.depth { 56 | let idx = self.hash_i(key, i); 57 | let slot = &mut self.tables[i]; 58 | let mut c = Self::load_counter(slot, idx); 59 | if c < 15 { 60 | c += 1; 61 | } 62 | Self::store_counter(slot, idx, c); 63 | } 64 | } 65 | 66 | /// Estimate frequency for the key (min of counters).
67 | pub fn estimate(&self, key: u64) -> u8 { 68 | let mut minv = 15u8; 69 | for i in 0..self.depth { 70 | let idx = self.hash_i(key, i); 71 | let slot = &self.tables[i]; 72 | let c = if idx / 2 < slot.len() { 73 | if idx % 2 == 0 { 74 | slot[idx / 2] & 0x0F 75 | } else { 76 | (slot[idx / 2] >> 4) & 0x0F 77 | } 78 | } else { 79 | 0 80 | }; 81 | if c < minv { 82 | minv = c; 83 | } 84 | } 85 | minv 86 | } 87 | 88 | /// Periodic aging to prevent counter saturation. Halves all counters. 89 | pub fn age(&mut self) { 90 | for t in self.tables.iter_mut() { 91 | for b in t.iter_mut() { 92 | let lo = (*b & 0x0F) >> 1; 93 | let hi = ((*b >> 4) & 0x0F) >> 1; 94 | *b = (hi << 4) | lo; 95 | } 96 | } 97 | } 98 | } 99 | 100 | #[cfg(test)] 101 | mod tests { 102 | use super::*; 103 | 104 | #[test] 105 | fn tiny_lfu_basic() { 106 | let mut f = TinyLFU::new(1024, 4); 107 | let k1 = 123u64; 108 | let k2 = 456u64; 109 | for _ in 0..8 { 110 | f.admit_record(k1); 111 | } 112 | for _ in 0..2 { 113 | f.admit_record(k2); 114 | } 115 | assert!(f.estimate(k1) >= f.estimate(k2)); 116 | f.age(); 117 | assert!(f.estimate(k1) >= f.estimate(k2)); 118 | } 119 | } 120 | -------------------------------------------------------------------------------- /docs/src/storage/disk_io.md: -------------------------------------------------------------------------------- 1 | # Disk I/O — Scheduler, io_uring Data Pages & WAL Runtime 2 | 3 | ## 1. Architecture 4 | 5 | - **Request Path**: foreground components enqueue `DiskRequest` objects via `DiskScheduler::{schedule_read, schedule_write, …}`. A dispatcher thread drains the global channel and distributes work round-robin to N io_uring workers. Each worker owns its own ring and file-descriptor cache, so once a request is forwarded, execution proceeds entirely off the foreground thread. 6 | - **Stable APIs**: `schedule_read(page_id)`, `schedule_write(page_id, Bytes)`, `schedule_read_pages(Vec<PageId>)`, `schedule_allocate()`, `schedule_deallocate(page_id)` — every call returns a channel the caller can block on or poll (a toy model of this request/reply flow is sketched after this list). 7 | - **Batch Reads**: `ReadPages` fans out per-page SQEs while a shared `BatchState` tracks completions. Even if the kernel completes I/O out of order, the caller receives a `Vec<Bytes>` that preserves the original page order. 8 |
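The sketch below models the request/reply flow with plain threads and `mpsc` channels standing in for the dispatcher and io_uring workers. `ToyScheduler` and its internals are invented names for illustration; only the shape of `schedule_read(page_id)` returning a waitable channel mirrors the real API.

```rust
use std::sync::mpsc::{channel, Receiver, Sender};
use std::thread;

type PageId = u32;
type PageBuf = Vec<u8>;

enum Request {
    Read(PageId, Sender<Result<PageBuf, String>>),
    Shutdown,
}

struct ToyScheduler {
    tx: Sender<Request>,
    worker: Option<thread::JoinHandle<()>>,
}

impl ToyScheduler {
    fn new() -> Self {
        let (tx, rx) = channel::<Request>();
        // Single "worker" standing in for the dispatcher + io_uring rings.
        let worker = thread::spawn(move || {
            while let Ok(req) = rx.recv() {
                match req {
                    Request::Read(page_id, reply) => {
                        // Pretend we read the page; real code hits the disk here.
                        let _ = reply.send(Ok(vec![page_id as u8; 8]));
                    }
                    Request::Shutdown => break,
                }
            }
        });
        Self { tx, worker: Some(worker) }
    }

    /// Mirrors the schedule_read(page_id) shape: returns a channel to wait on.
    fn schedule_read(&self, page_id: PageId) -> Receiver<Result<PageBuf, String>> {
        let (reply_tx, reply_rx) = channel();
        self.tx.send(Request::Read(page_id, reply_tx)).unwrap();
        reply_rx
    }
}

fn main() {
    let mut sched = ToyScheduler::new();
    let rx = sched.schedule_read(42);
    let page = rx.recv().unwrap().unwrap(); // block until the worker replies
    assert_eq!(page.len(), 8);
    sched.tx.send(Request::Shutdown).unwrap();
    sched.worker.take().unwrap().join().unwrap();
}
```

The caller is free to hold several receivers at once and poll them, which is how scans overlap multiple in-flight page reads.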
9 | ## 2. WAL Runtime (buffered I/O) 10 | 11 | - Dedicated WAL runtime threads handle sequential WAL appends/reads using buffered I/O. They now keep a per-thread cache of open segment files, eliminating repeated `open()`/`close()` on every log record. 12 | - Worker count defaults to `max(1, available_parallelism / 2)` but is tunable through `IOSchedulerConfig`. 13 | - Optional `sync` on a request triggers `sync_data` / `fdatasync` so `WalManager` can honour synchronous commit or checkpoint barriers. Data pages stay on the io_uring dataplane; WAL always uses buffered writes. 14 | 15 | ## 3. io_uring Backend (Linux) 16 | 17 | - Each worker owns an `IoUring` with configurable `queue_depth`, optional SQPOLL idle timeout, and a pool of registered fixed buffers sized to `PAGE_SIZE`. Workers submit SQEs asynchronously and drain CQEs in small batches to keep the ring warm. 18 | - Read batching relies on shared `BatchState` instances (`Rc<RefCell<BatchState>>`) so multi-page callers see ordered results without blocking the kernel on serialization. 19 | - Writes keep their payload alive until completion; if a fixed buffer slot is available we reuse it, otherwise we fall back to heap buffers. A companion `WriteState` tracks an optional `fdatasync` so the caller still observes exactly one `Result<()>` once all CQEs land. 20 | - Errors (short read/write, errno) are normalised into `QuillSQLError` values that flow back on the original channel. 21 | 22 | ## 4. Configuration 23 | 24 | - `config::IOSchedulerConfig` controls (a sketch of this struct follows the list): 25 | - `workers`: number of io_uring workers (default = available parallelism). 26 | - `wal_workers`: WAL runtime threads (default workers / 2). 27 | - `iouring_queue_depth`, `iouring_fixed_buffers`, `iouring_sqpoll_idle_ms`. 28 | - `fsync_on_write`: whether data-page writes also issue `fdatasync` (WAL sync is managed separately by `WalManager`). 29 |
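To make the knob set concrete, here is a sketch of what such a config struct can look like, with defaults inferred from the text above (workers = available parallelism, wal_workers = half of that, at least one). The field names follow the list; the exact types and default values in `config.rs` may differ.

```rust
use std::thread;

/// Illustrative shape of the I/O scheduler configuration.
#[derive(Debug, Clone)]
pub struct IOSchedulerConfig {
    pub workers: usize,
    pub wal_workers: usize,
    pub iouring_queue_depth: u32,
    pub iouring_fixed_buffers: usize,
    pub iouring_sqpoll_idle_ms: Option<u32>, // None disables SQPOLL
    pub fsync_on_write: bool,
}

impl Default for IOSchedulerConfig {
    fn default() -> Self {
        let workers = thread::available_parallelism().map(|n| n.get()).unwrap_or(1);
        Self {
            workers,
            wal_workers: (workers / 2).max(1), // "max(1, available_parallelism / 2)"
            iouring_queue_depth: 256,          // assumed default
            iouring_fixed_buffers: 64,         // assumed default
            iouring_sqpoll_idle_ms: None,
            fsync_on_write: false,
        }
    }
}
```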
30 | ## 5. Concurrency & Safety 31 | 32 | - Worker-local file descriptors plus positional I/O remove shared mutable state on the hot path. The new per-worker handle cache further reduces syscall overhead. 33 | - Shutdown sequence: enqueue `Shutdown`, dispatcher forwards it to every worker, each worker drains outstanding SQEs/CQEs, and finally dispatcher + workers are joined. 34 | - BufferPool and TableHeap integrate via the same scheduler channels; inflight guards 35 | prevent duplicate page fetches even when multiple scans touch adjacent pages. 36 | 37 | ## 6. Performance Notes 38 | 39 | - Random page access benefits from fewer syscalls and deeper outstanding queue depth than the blocking fallback. 40 | - Only the io_uring backend currently ships (Linux x86_64). A portable fallback remains future work. 41 | - For large sequential scans, rely on the buffer pool's sequential access pattern or add 42 | a custom iterator on top of `ReadPages` if you want to experiment with direct I/O. 43 | 44 | ## 7. Future Work 45 | 46 | - Queue-depth aware scheduling and CQE bulk harvesting. 47 | - Optional group commit (aggregate writes, single fsync) behind configuration. 48 | - Metrics hooks (queue depth, submit/complete throughput, latency percentiles, error codes). 49 | - Cross-platform fallback backend and richer prioritisation/throttling policies. 50 | - Control-plane knobs for throttling individual background workers. 51 | -------------------------------------------------------------------------------- /src/recovery/resource_manager.rs: -------------------------------------------------------------------------------- 1 | use std::collections::HashMap; 2 | use std::sync::{Arc, RwLock}; 3 | 4 | use once_cell::sync::Lazy; 5 | use std::sync::OnceLock; 6 | 7 | use crate::buffer::BufferManager; 8 | use crate::error::{QuillSQLError, QuillSQLResult}; 9 | use crate::recovery::wal::codec::{decode_page_write, ResourceManagerId, WalFrame}; 10 | use crate::recovery::Lsn; 11 | use crate::storage::disk_scheduler::DiskScheduler; 12 | 13 | #[derive(Clone)] 14 | pub struct RedoContext { 15 | pub disk_scheduler: Arc<DiskScheduler>, 16 | pub buffer_pool: Option<Arc<BufferManager>>, 17 | } 18 | 19 | #[derive(Clone)] 20 | pub struct UndoContext { 21 | pub disk_scheduler: Arc<DiskScheduler>, 22 | pub buffer_pool: Option<Arc<BufferManager>>, 23 | } 24 | 25 | pub trait ResourceManager: Send + Sync { 26 | fn redo(&self, frame: &WalFrame, ctx: &RedoContext) -> QuillSQLResult<usize>; 27 | fn undo(&self, frame: &WalFrame, ctx: &UndoContext) -> QuillSQLResult<()>; 28 | 29 | fn transaction_id(&self, _frame: &WalFrame) -> Option<TransactionId> { 30 | None 31 | } 32 | } 33 | 34 | static REGISTRY: Lazy<RwLock<HashMap<ResourceManagerId, Arc<dyn ResourceManager>>>> = 35 | Lazy::new(|| RwLock::new(HashMap::new())); 36 | 37 | pub fn register_resource_manager(id: ResourceManagerId, manager: Arc<dyn ResourceManager>) { 38 | let mut guard = REGISTRY 39 | .write() 40 | .expect("resource manager registry poisoned"); 41 | guard.insert(id, manager); 42 | } 43 | 44 | pub fn get_resource_manager(id: ResourceManagerId) -> Option<Arc<dyn ResourceManager>> { 45 | let guard = REGISTRY.read().expect("resource manager registry poisoned"); 46 | guard.get(&id).cloned() 47 | } 48 | 49 | #[derive(Default)] 50 | struct PageResourceManager; 51 | 52 | impl PageResourceManager { 53 | fn page_requires_redo( 54 | &self, 55 | ctx: &RedoContext, 56 | page_id: u32, 57 | record_lsn: Lsn, 58 | ) -> QuillSQLResult<bool> { 59 | if let Some(bpm) = &ctx.buffer_pool { 60 | match bpm.fetch_page_read(page_id) { 61 | Ok(guard) => Ok(guard.lsn() < record_lsn), 62 | Err(_) => Ok(true), 63 | } 64 | } else { 65 | Ok(true) 66 | } 67 | } 68 | 69 | fn redo_page_write( 70 | &self, 71 | ctx: &RedoContext, 72 | payload: crate::recovery::wal::codec::PageWritePayload, 73 | ) -> QuillSQLResult<()> { 74 | debug_assert_eq!(payload.page_image.len(), crate::buffer::PAGE_SIZE); 75 | let bytes = bytes::Bytes::from(payload.page_image); 76 | let rx = ctx.disk_scheduler.schedule_write(payload.page_id, bytes)?; 77 | rx.recv().map_err(|e| { 78 | QuillSQLError::Internal(format!("WAL recovery write recv failed: {}", e)) 79 | })??; 80 | Ok(()) 81 | } 82 | } 83 | 84 | impl ResourceManager for PageResourceManager { 85 | fn redo(&self, frame: &WalFrame, ctx: &RedoContext) -> QuillSQLResult<usize> { 86 | if frame.info != 0 { 87 | return Err(QuillSQLError::Internal(format!( 88 | "Unknown Page info kind: {}", 89 | frame.info 90 | ))); 91 | } 92 | let payload = decode_page_write(&frame.body)?; 93 | if !self.page_requires_redo(ctx, payload.page_id, frame.lsn)?
{ 94 | return Ok(0); 95 | } 96 | self.redo_page_write(ctx, payload)?; 97 | Ok(1) 98 | } 99 | 100 | fn undo(&self, _frame: &WalFrame, _ctx: &UndoContext) -> QuillSQLResult<()> { 101 | Ok(()) 102 | } 103 | } 104 | 105 | static DEFAULT_RESOURCE_MANAGERS: OnceLock<()> = OnceLock::new(); 106 | 107 | pub fn ensure_default_resource_managers_registered() { 108 | DEFAULT_RESOURCE_MANAGERS.get_or_init(|| { 109 | register_resource_manager( 110 | ResourceManagerId::Page, 111 | Arc::new(PageResourceManager::default()), 112 | ); 113 | crate::storage::heap_recovery::ensure_heap_resource_manager_registered(); 114 | crate::storage::index::index_recovery::ensure_index_resource_manager_registered(); 115 | }); 116 | } 117 | -------------------------------------------------------------------------------- /src/recovery/wal/buffer.rs: -------------------------------------------------------------------------------- 1 | use crate::recovery::wal::Lsn; 2 | use crate::utils::ring_buffer::ConcurrentRingBuffer; 3 | 4 | use super::record::WalRecord; 5 | use std::sync::atomic::{AtomicU64, AtomicUsize, Ordering}; 6 | 7 | #[derive(Debug)] 8 | pub struct WalBuffer { 9 | queue: ConcurrentRingBuffer<WalRecord>, 10 | len: AtomicUsize, 11 | bytes: AtomicUsize, 12 | last_enqueued_end: AtomicU64, 13 | } 14 | 15 | impl WalBuffer { 16 | pub fn with_capacity(capacity: usize) -> Self { 17 | Self { 18 | queue: ConcurrentRingBuffer::with_capacity(capacity.max(1)), 19 | len: AtomicUsize::new(0), 20 | bytes: AtomicUsize::new(0), 21 | last_enqueued_end: AtomicU64::new(0), 22 | } 23 | } 24 | 25 | pub fn push(&self, record: WalRecord) { 26 | let encoded_len = record.encoded_len() as usize; 27 | let end_lsn = record.end_lsn; 28 | let mut pending = record; 29 | loop { 30 | match self.queue.try_push(pending) { 31 | Ok(()) => break, 32 | Err(returned) => { 33 | pending = returned; 34 | std::hint::spin_loop(); 35 | } 36 | } 37 | } 38 | self.len.fetch_add(1, Ordering::Release); 39 | self.bytes.fetch_add(encoded_len, Ordering::Release); 40 | self.last_enqueued_end.store(end_lsn, Ordering::Release); 41 | } 42 | 43 | #[inline] 44 | pub fn len(&self) -> usize { 45 | self.len.load(Ordering::Acquire) 46 | } 47 | 48 | #[inline] 49 | pub fn bytes(&self) -> usize { 50 | self.bytes.load(Ordering::Acquire) 51 | } 52 | 53 | #[inline] 54 | pub fn highest_end_lsn(&self) -> Lsn { 55 | self.last_enqueued_end.load(Ordering::Acquire) 56 | } 57 | 58 | #[inline] 59 | pub fn is_empty(&self) -> bool { 60 | self.len.load(Ordering::Acquire) == 0 61 | } 62 | 63 | pub fn drain_until(&self, upto: Lsn) -> (Vec<WalRecord>, usize) { 64 | let mut drained = Vec::new(); 65 | let mut released = 0usize; 66 | loop { 67 | let Some(front) = self.queue.peek_clone() else { 68 | break; 69 | }; 70 | if front.end_lsn > upto { 71 | break; 72 | } 73 | if let Some(record) = self.queue.pop() { 74 | released += record.encoded_len() as usize; 75 | drained.push(record); 76 | } else { 77 | break; 78 | } 79 | } 80 | if !drained.is_empty() { 81 | self.len.fetch_sub(drained.len(), Ordering::Release); 82 | self.bytes.fetch_sub(released, Ordering::Release); 83 | } 84 | (drained, released) 85 | } 86 | 87 | pub fn pending(&self) -> Vec<WalRecord> { 88 | self.queue.snapshot() 89 | } 90 | } 91 | 92 | #[cfg(test)] 93 | mod tests { 94 | use super::*; 95 | use bytes::Bytes; 96 | 97 | fn make_record(start: Lsn, len: usize) -> WalRecord { 98 | WalRecord { 99 | start_lsn: start, 100 | end_lsn: start + len as u64, 101 | payload: Bytes::from(vec![0u8; len]), 102 | } 103 | } 104 | 105 | #[test] 106 | fn push_updates_length_and_bytes() { 107 | let
buffer = WalBuffer::with_capacity(8); 108 | buffer.push(make_record(0, 16)); 109 | buffer.push(make_record(16, 32)); 110 | 111 | assert_eq!(buffer.len(), 2); 112 | assert_eq!(buffer.bytes(), 48); 113 | assert_eq!(buffer.highest_end_lsn(), 48); 114 | } 115 | 116 | #[test] 117 | fn drain_until_releases_records_and_bytes() { 118 | let buffer = WalBuffer::with_capacity(8); 119 | buffer.push(make_record(0, 10)); 120 | buffer.push(make_record(10, 20)); 121 | buffer.push(make_record(30, 5)); 122 | 123 | let (drained, released) = buffer.drain_until(30); 124 | assert_eq!(drained.len(), 2); 125 | assert_eq!(released, 30); 126 | assert_eq!(buffer.len(), 1); 127 | assert_eq!(buffer.bytes(), 5); 128 | assert_eq!(buffer.highest_end_lsn(), 35); 129 | } 130 | } 131 | -------------------------------------------------------------------------------- /src/recovery/wal/codec/checkpoint.rs: -------------------------------------------------------------------------------- 1 | use crate::buffer::PageId; 2 | use crate::error::{QuillSQLError, QuillSQLResult}; 3 | use crate::recovery::Lsn; 4 | use crate::transaction::TransactionId; 5 | 6 | #[derive(Debug, Clone)] 7 | pub struct CheckpointPayload { 8 | pub last_lsn: Lsn, 9 | pub dirty_pages: Vec<PageId>, 10 | pub active_transactions: Vec<TransactionId>, 11 | /// Dirty Page Table: (page_id, recLSN) 12 | pub dpt: Vec<(PageId, Lsn)>, 13 | } 14 | 15 | pub fn encode_checkpoint(body: &CheckpointPayload) -> Vec<u8> { 16 | // Checkpoint (rmid=Checkpoint, info=0) 17 | // body: last_lsn(8) + dirty_pages_count(4) + dirty_pages[] + active_txns_count(4) + active_txns[] + dpt_count(4) + dpt[] 18 | let mut buf = Vec::new(); 19 | buf.extend_from_slice(&body.last_lsn.to_le_bytes()); 20 | buf.extend_from_slice(&(body.dirty_pages.len() as u32).to_le_bytes()); 21 | for page_id in &body.dirty_pages { 22 | buf.extend_from_slice(&page_id.to_le_bytes()); 23 | } 24 | buf.extend_from_slice(&(body.active_transactions.len() as u32).to_le_bytes()); 25 | for txn_id in &body.active_transactions { 26 | buf.extend_from_slice(&txn_id.to_le_bytes()); 27 | } 28 | buf.extend_from_slice(&(body.dpt.len() as u32).to_le_bytes()); 29 | for (page_id, rec_lsn) in &body.dpt { 30 | buf.extend_from_slice(&page_id.to_le_bytes()); 31 | buf.extend_from_slice(&rec_lsn.to_le_bytes()); 32 | } 33 | buf 34 | } 35 | 36 | pub fn decode_checkpoint(bytes: &[u8]) -> QuillSQLResult<CheckpointPayload> { 37 | if bytes.len() < 8 + 4 + 4 + 4 { 38 | return Err(QuillSQLError::Internal( 39 | "Checkpoint payload too short".to_string(), 40 | )); 41 | } 42 | let last_lsn = u64::from_le_bytes(bytes[0..8].try_into().unwrap()); 43 | let mut offset = 8; 44 | let dirty_pages_len = 45 | u32::from_le_bytes(bytes[offset..offset + 4].try_into().unwrap()) as usize; 46 | offset += 4; 47 | let mut dirty_pages = Vec::with_capacity(dirty_pages_len); 48 | for _ in 0..dirty_pages_len { 49 | if bytes.len() < offset + 4 { 50 | return Err(QuillSQLError::Internal( 51 | "Checkpoint dirty pages truncated".to_string(), 52 | )); 53 | } 54 | let page_id = u32::from_le_bytes(bytes[offset..offset + 4].try_into().unwrap()); 55 | offset += 4; 56 | dirty_pages.push(page_id); 57 | } 58 | if bytes.len() < offset + 4 { 59 | return Err(QuillSQLError::Internal( 60 | "Checkpoint active transactions truncated".to_string(), 61 | )); 62 | } 63 | let active_txn_len = u32::from_le_bytes(bytes[offset..offset + 4].try_into().unwrap()) as usize; 64 | offset += 4; 65 | let mut active_transactions = Vec::with_capacity(active_txn_len); 66 | for _ in 0..active_txn_len { 67 | if bytes.len() < offset + 8 { 68 | return
Err(QuillSQLError::Internal( 69 | "Checkpoint active transactions truncated".to_string(), 70 | )); 71 | } 72 | let txn_id = u64::from_le_bytes(bytes[offset..offset + 8].try_into().unwrap()); 73 | offset += 8; 74 | active_transactions.push(txn_id); 75 | } 76 | if bytes.len() < offset + 4 { 77 | return Err(QuillSQLError::Internal( 78 | "Checkpoint DPT length missing".to_string(), 79 | )); 80 | } 81 | let dpt_len = u32::from_le_bytes(bytes[offset..offset + 4].try_into().unwrap()) as usize; 82 | offset += 4; 83 | let required_dpt = offset + dpt_len * (4 + 8); 84 | if bytes.len() < required_dpt { 85 | return Err(QuillSQLError::Internal( 86 | "Checkpoint DPT truncated".to_string(), 87 | )); 88 | } 89 | let mut dpt = Vec::with_capacity(dpt_len); 90 | let mut cur = offset; 91 | for _ in 0..dpt_len { 92 | let pid = u32::from_le_bytes(bytes[cur..cur + 4].try_into().unwrap()); 93 | cur += 4; 94 | let lsn = u64::from_le_bytes(bytes[cur..cur + 8].try_into().unwrap()); 95 | cur += 8; 96 | dpt.push((pid, lsn)); 97 | } 98 | Ok(CheckpointPayload { 99 | last_lsn, 100 | dirty_pages, 101 | active_transactions, 102 | dpt, 103 | }) 104 | } 105 | --------------------------------------------------------------------------------