├── .gitignore ├── Cargo.lock ├── Cargo.toml ├── README.md ├── config └── entangledb.yaml ├── husky └── cloud │ ├── build.sh │ ├── entangledb1 │ ├── data │ │ └── .gitkeep │ └── entangledb.yaml │ ├── entangledb2 │ ├── data │ │ └── .gitkeep │ └── entangledb.yaml │ ├── entangledb3 │ ├── data │ │ └── .gitkeep │ └── entangledb.yaml │ ├── entangledb4 │ ├── data │ │ └── .gitkeep │ └── entangledb.yaml │ └── entangledb5 │ ├── data │ └── .gitkeep │ └── entangledb.yaml ├── learning_resources.md └── src ├── bin ├── entangledb.rs └── entanglesql.rs ├── client.rs ├── error.rs ├── lib.rs ├── raft ├── log.rs ├── message.rs ├── mod.rs ├── node │ ├── candidate.rs │ ├── follower.rs │ ├── leader.rs │ └── mod.rs ├── server.rs └── state.rs ├── server.rs ├── sql ├── engine │ ├── kv.rs │ ├── mod.rs │ └── raft.rs ├── execution │ ├── aggregation.rs │ ├── join.rs │ ├── mod.rs │ ├── mutation.rs │ ├── query.rs │ ├── schema.rs │ └── source.rs ├── mod.rs ├── parser │ ├── ast.rs │ ├── lexer.rs │ └── mod.rs ├── plan │ ├── mod.rs │ ├── optimizer.rs │ └── planner.rs ├── schema.rs └── types │ ├── expression.rs │ └── mod.rs └── storage ├── bincode.rs ├── debug.rs ├── engine ├── bitcask.rs ├── memory.rs └── mod.rs ├── golden ├── bitcask │ ├── compact-after │ ├── compact-before │ └── log └── mvcc │ ├── anomaly_dirty_read │ ├── anomaly_dirty_write │ ├── anomaly_fuzzy_read │ ├── anomaly_lost_update │ ├── anomaly_phantom_read │ ├── anomaly_read_skew │ ├── anomaly_write_skew │ ├── begin │ ├── begin_as_of │ ├── begin_read_only │ ├── delete │ ├── delete_conflict │ ├── get │ ├── get_isolation │ ├── resume │ ├── rollback │ ├── scan │ ├── scan_isolation │ ├── scan_key_version_encoding │ ├── scan_prefix │ ├── set │ ├── set_conflict │ └── unversioned ├── keycode.rs ├── mod.rs └── mvcc.rs /.gitignore: -------------------------------------------------------------------------------- 1 | /clusters/*/entangledb*/data 2 | /data 3 | /target 4 | .vscode/ 5 | **/*.rs.bk 6 | .aider* 7 | 
-------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "entangledb" 3 | description = "A distributed SQL database" 4 | version = "0.1.0" 5 | edition = "2021" 6 | default-run = "entangledb" 7 | 8 | [lib] 9 | doctest = false 10 | 11 | [dependencies] 12 | bincode = "~1.3.3" 13 | clap = { version = "~4.4.2", features = ["cargo"] } 14 | config = "~0.13.3" 15 | derivative = "~2.2.0" 16 | fs4 = "~0.6.6" 17 | futures = "~0.3.15" 18 | futures-util = "~0.3.15" 19 | hex = "~0.4.3" 20 | lazy_static = "~1.4.0" 21 | log = "~0.4.14" 22 | names = "~0.14.0" 23 | rand = "~0.8.3" 24 | regex = "1.5.4" 25 | rustyline = "~12.0.0" 26 | rustyline-derive = "0.9.0" 27 | serde = "~1.0.126" 28 | serde_bytes = "~0.11.12" 29 | serde_derive = "~1.0.126" 30 | simplelog = "~0.12.1" 31 | tokio = { version = "~1.32.0", features = [ 32 | "macros", 33 | "rt", 34 | "rt-multi-thread", 35 | "net", 36 | "io-util", 37 | "time", 38 | "sync", 39 | ] } 40 | tokio-serde = { version = "~0.8", features = ["bincode"] } 41 | tokio-stream = { version = "~0.1.6", features = ["net"] } 42 | tokio-util = { version = "~0.7.8", features = ["codec"] } 43 | uuid = { version = "~1.4.1", features = ["v4"] } 44 | 45 | [dev-dependencies] 46 | goldenfile = "~1.5.2" 47 | paste = "~1.0.14" 48 | pretty_assertions = "~1.4.0" 49 | serial_test = "~2.0.0" 50 | tempdir = "~0.3.7" 51 | tempfile = "~3.8.0" 52 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Table of Contents 2 | - [Overview](#overview) 3 | - [Usage](#usage) 4 | - [TODO](#todo) 5 | - [MVCC in entangleDB](#mvcc-in-entangledb) 6 | - [SQL Query Execution in entangleDB](#sql-query-execution-in-entangledb) 7 | - [entangleDB Raft Consensus Engine](#entangledb-raft-consensus-engine) 8 | - [What I am trying 
to build](#what-i-am-trying-to-build) 9 | - [Distributed Consensus Engine](#1-distributed-consensus-engine) 10 | - [Transaction Engine](#2-transaction-engine) 11 | - [Storage Engine](#3-storage-engine) 12 | - [Query Engine](#4-query-engine) 13 | - [SQL Interface and PostgreSQL Compatibility](#5-sql-interface-and-postgresql-compatibility) 14 | - [Proposed Architecture](#proposed-architecture) 15 | - [SQL Engine](#sql-engine) 16 | - [Raft Engine](#raft-engine) 17 | - [Storage Engine](#storage-engine) 18 | - [entangleDB Peers](#entangledb-peers) 19 | - [Example SQL Queries that you will be able to execute in entangleDB](#example-sql-queries-that-you-will-be-able-to-execute-in-entangledb) 20 | - [Learning Resources I've been using for building the database](#learning-resources-ive-been-using-for-building-the-database) 21 | 22 | ## Overview 23 | 24 | I'm working on creating entangleDB, a project that's all about really getting to know how databases work from the inside out. My aim is to deeply understand everything about databases, from the big picture down to the small details. It's a way for me to build a strong foundation in database. 25 | 26 | The name "entangleDB" is special because it's in honor of a friend who loves databases just as much as I do. 27 | 28 | The plan is to write the database in Rust. My main goal is to create something that's not only useful for me to learn from but also helpful for others who are interested in diving deep into how databases work. I'm hoping to make it postgresSQL compatible. 
29 | 30 | ## Usage 31 | Pre-requisite is to have the Rust compiler; follow this doc to install the [Rust compiler](https://www.rust-lang.org/tools/install) 32 | 33 | entangledb cluster can be started on `localhost` ports `3201` to `3205`: 34 | 35 | ``` 36 | (cd husky/cloud && ./build.sh) 37 | ``` 38 | 39 | Client can be used to connect with the node on `localhost` port `3205`: 40 | 41 | ``` 42 | cargo run --release --bin entanglesql 43 | 44 | Connected to EntangleDB node "5". Enter !help for instructions. 45 | entangledb> SELECT * FROM dishes; 46 | poha 47 | breads 48 | korma 49 | ``` 50 | 51 | ## TODO 52 | 1. Make the isolation level configurable; currently, it is set to repeatable read (snapshot). 53 | 2. Implement partitions, both hash and range types. 54 | 3. Utilize generics throughout in Rust, thereby eliminating the need for std::fmt::Display + Send + Sync. 55 | 4. Consider the use of runtime assertions instead of employing Error::Internal ubiquitously. 56 | 5. Revisit the implementation of time-travel queries 57 | 58 | ## MVCC in entangleDB 59 | 60 | ![image](https://github.com/TypicalDefender/entangleDB/assets/106574498/0a923e2d-75fc-469e-9ce7-504af45c73c7) 61 | 62 | ## SQL Query Execution in entangleDB 63 | ![image](https://github.com/TypicalDefender/entangleDB/assets/106574498/a90fc90c-91e7-4ee8-a06f-887629a82401) 64 | 65 | ## entangleDB Raft Consensus Engine 66 | ![image](https://github.com/TypicalDefender/entangleDB/assets/106574498/a56f02b9-d172-4ab3-8883-230d7b1326b4) 67 | 68 | ## What I am trying to build 69 | 70 | ### 1. Distributed Consensus Engine 71 | 72 | The design for entangleDB centers around a custom-built consensus engine, intended for high availability in distributed settings. This engine will be crucial in maintaining consistent and reliable state management across various nodes. 
73 | 74 | A key focus will be on linearizable state machine replication, an essential feature for ensuring data consistency across all nodes, especially for applications that require strong consistency. 75 | 76 | ### 2. Transaction Engine 77 | 78 | The proposed transaction engine for entangleDB is committed to adhering to ACID properties, ensuring reliability and integrity in every transaction. 79 | 80 | The plan includes the implementation of Snapshot Isolation and Serializable Isolation, with the aim of optimizing transaction handling for enhanced concurrency and data integrity. 81 | 82 | ### 3. Storage Engine 83 | 84 | The planned storage engine for entangleDB will explore a variety of storage formats to find and utilize the most efficient methods for data storage and retrieval. 85 | 86 | The storage layer is being designed for flexibility, to support a range of backend technologies and meet diverse storage requirements. 87 | 88 | ### 4. Query Engine 89 | 90 | The development of the query engine will focus on rapid and effective query processing, utilizing advanced optimization algorithms. 91 | 92 | A distinctive feature of entangleDB will be its ability to handle time-travel queries, allowing users to access and analyze data from different historical states. 93 | 94 | ### 5. SQL Interface and PostgreSQL Compatibility 95 | 96 | The SQL interface for entangleDB is intended to support a wide array of SQL functionalities, including complex queries, joins, aggregates, and window functions. 97 | 98 | Compatibility with PostgreSQL’s wire protocol is a goal, to facilitate smooth integration with existing PostgreSQL setups and offer a solid alternative for database system upgrades or migrations. 99 | 100 | ## Proposed Architecture 101 | Screenshot 2023-12-02 at 1 26 15 PM 102 | 103 | ## SQL Engine 104 | 105 | The SQL Engine is responsible for the intake and processing of SQL queries. 
It consists of: 106 | 107 | - **SQL Session**: The processing pipeline within a session includes: 108 | - `Parser`: Interprets SQL queries and converts them into a machine-understandable format. 109 | - `Planner`: Devises an execution plan based on the parsed input. 110 | - `Executor`: Carries out the plan, accessing and modifying the database. 111 | 112 | Adjacent to the session is the: 113 | 114 | - **SQL Storage Raft Backend**: This component integrates with the Raft consensus protocol to ensure distributed transactions are consistent and resilient. 115 | 116 | ## Raft Engine 117 | 118 | The Raft Engine is crucial for maintaining a consistent state across the distributed system: 119 | 120 | - **Raft Node**: This consensus node confirms that all database transactions are in sync across the network. 121 | - **Raft Log**: A record of all transactions agreed upon by the Raft consensus algorithm, which is crucial for data integrity and fault tolerance. 122 | 123 | ## Storage Engine 124 | 125 | The Storage Engine is where the actual data is stored and managed: 126 | 127 | - **State Machine Driver**: Comprising of: 128 | - `State Machine Interface`: An intermediary that conveys state changes from the Raft log to the storage layer. 129 | - `Key Value Backend`: The primary storage layer, consisting of: 130 | - `Bitcask Engine`: A simple, fast on-disk storage system for key-value data. 131 | - `MVCC Storage`: Handles multiple versions of data for read-write concurrency control. 132 | 133 | ## entangleDB Peers 134 | 135 | - interaction between multiple database instances or "peers". 
136 | 137 | ## Example SQL Queries that you will be able to execute in entangleDB 138 | 139 | ```sql 140 | -- Transaction example with a table creation, data insertion, and selection 141 | BEGIN; 142 | 143 | CREATE TABLE employees (id INT PRIMARY KEY, name VARCHAR, department VARCHAR); 144 | INSERT INTO employees VALUES (1, 'Alice', 'Engineering'), (2, 'Bob', 'HR'); 145 | SELECT * FROM employees; 146 | 147 | COMMIT; 148 | 149 | -- Aggregation query with JOIN 150 | SELECT department, AVG(salary) FROM employees JOIN salaries ON employees.id = salaries.emp_id GROUP BY department; 151 | 152 | -- Time-travel query 153 | SELECT * FROM employees AS OF SYSTEM TIME '-5m'; 154 | ``` 155 | 156 | ## Learning Resources I've been using for building the database 157 | 158 | For a comprehensive list of resources that have been learning what to build in a distributed database, check out the [Learning Resources](https://github.com/TypicalDefender/entangleDB/blob/main/learning_resources.md) page. 159 | 160 | 161 | 162 | 163 | -------------------------------------------------------------------------------- /config/entangledb.yaml: -------------------------------------------------------------------------------- 1 | # The node ID, peer ID/address map (empty for single node), and log level. 2 | id: 1 3 | peers: {} 4 | log_level: INFO 5 | 6 | # Network addresses to bind the SQL and Raft servers to. 
#!/usr/bin/env bash
#
# Builds entangledb in release mode and launches a five-node local cluster,
# one background process per node, prefixing each node's output with its name.
# All nodes are killed when this script exits.

set -euo pipefail

cargo build --release --bin entangledb

# Install the cleanup trap BEFORE spawning any nodes, so an early failure or
# interrupt can't leave orphaned background processes behind. Suppress kill's
# error output in case the jobs have already exited.
trap 'kill $(jobs -p) 2>/dev/null || true' EXIT

for ID in 1 2 3 4 5; do
    (cargo run -q --release -- -c entangledb$ID/entangledb.yaml 2>&1 \
        | sed -e "s/\\(.*\\)/entangledb$ID \\1/g") &
done

# Block until all node processes have exited.
wait
-------------------------------------------------------------------------------- 1 | id: 2 2 | data_dir: entangledb2/data 3 | sync: false 4 | listen_sql: 0.0.0.0:3202 5 | listen_raft: 0.0.0.0:3302 6 | peers: 7 | '1': 127.0.0.1:3301 8 | '3': 127.0.0.1:3303 9 | '4': 127.0.0.1:3304 10 | '5': 127.0.0.1:3305 -------------------------------------------------------------------------------- /husky/cloud/entangledb3/data/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TypicalDefender/entangleDB/beaf75098d2c936bf841c34ccb9241a144058380/husky/cloud/entangledb3/data/.gitkeep -------------------------------------------------------------------------------- /husky/cloud/entangledb3/entangledb.yaml: -------------------------------------------------------------------------------- 1 | id: 3 2 | data_dir: entangledb3/data 3 | sync: false 4 | listen_sql: 0.0.0.0:3203 5 | listen_raft: 0.0.0.0:3303 6 | peers: 7 | '1': 127.0.0.1:3301 8 | '2': 127.0.0.1:3302 9 | '4': 127.0.0.1:3304 10 | '5': 127.0.0.1:3305 -------------------------------------------------------------------------------- /husky/cloud/entangledb4/data/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TypicalDefender/entangleDB/beaf75098d2c936bf841c34ccb9241a144058380/husky/cloud/entangledb4/data/.gitkeep -------------------------------------------------------------------------------- /husky/cloud/entangledb4/entangledb.yaml: -------------------------------------------------------------------------------- 1 | id: 4 2 | data_dir: entangledb4/data 3 | sync: false 4 | listen_sql: 0.0.0.0:3204 5 | listen_raft: 0.0.0.0:3304 6 | peers: 7 | '1': 127.0.0.1:3301 8 | '2': 127.0.0.1:3302 9 | '3': 127.0.0.1:3303 10 | '5': 127.0.0.1:3305 -------------------------------------------------------------------------------- /husky/cloud/entangledb5/data/.gitkeep: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/TypicalDefender/entangleDB/beaf75098d2c936bf841c34ccb9241a144058380/husky/cloud/entangledb5/data/.gitkeep -------------------------------------------------------------------------------- /husky/cloud/entangledb5/entangledb.yaml: -------------------------------------------------------------------------------- 1 | id: 5 2 | data_dir: entangledb5/data 3 | sync: false 4 | listen_sql: 0.0.0.0:3205 5 | listen_raft: 0.0.0.0:3305 6 | peers: 7 | '1': 127.0.0.1:3301 8 | '2': 127.0.0.1:3302 9 | '3': 127.0.0.1:3303 10 | '4': 127.0.0.1:3304 -------------------------------------------------------------------------------- /learning_resources.md: -------------------------------------------------------------------------------- 1 | # Learning Resources I've been using for building the database 2 | 3 | ### Introductory Materials 4 | 5 | **1. Lectures by Andy Pavlo** 6 | - **CMU 15-445 Intro to Database Systems**: [YouTube Playlist](https://www.youtube.com/playlist?list=PLSE8ODhjZXjbohkNBWQs_otTrBTrjyohi) (A Pavlo 2019) 7 | - **CMU 15-721 Advanced Database Systems**: [YouTube Playlist](https://www.youtube.com/playlist?list=PLSE8ODhjZXjasmrEd2_Yi1deeE360zv5O) (A Pavlo 2020) 8 | 9 | **2. Books by Martin Kleppman and Alex Petrov** 10 | - **Designing Data-Intensive Applications**: [Link to Book](https://dataintensive.net/) (M Kleppmann 2017) 11 | - **Database Internals**: [Link to Book](https://www.databass.dev) (A Petrov 2019) 12 | 13 | ### Raft Algorithm 14 | 15 | **1. Original Paper and Talks** 16 | - **In Search of an Understandable Consensus Algorithm**: [Raft Paper](https://raft.github.io/raft.pdf) (D Ongaro, J Ousterhout 2014) 17 | - **Designing for Understandability: The Raft Consensus Algorithm**: [YouTube Video](https://www.youtube.com/watch?v=vYp4LYbnnW8) (J Ousterhout 2016) 18 | 19 | **2. 
Student Guide** 20 | - **Students' Guide to Raft**: [Blog Post](https://thesquareplanet.com/blog/students-guide-to-raft/) (J Gjengset 2016) 21 | 22 | ### Parsing Techniques 23 | 24 | **1. Books by Thorsten Ball** 25 | - **Writing An Interpreter In Go**: [Link to Book](https://interpreterbook.com) (T Ball 2016) 26 | - **Writing A Compiler In Go**: [Link to Book](https://compilerbook.com) (T Ball 2018) 27 | 28 | **2. Blog Post** 29 | - **Parsing Expressions by Precedence Climbing**: [Blog Post](https://eli.thegreenplace.net/2012/08/02/parsing-expressions-by-precedence-climbing) (E Bendersky 2012) 30 | 31 | ### Transactions and Consistency 32 | 33 | **1. Overviews and Classic Papers** 34 | - **Consistency Models**: [Jepsen Article](https://jepsen.io/consistency) (Jepsen 2016) 35 | - **A Critique of ANSI SQL Isolation Levels**: [Research Paper](https://www.microsoft.com/en-us/research/wp-content/uploads/2016/02/tr-95-51.pdf) (H Berenson et al 1995) 36 | - **Generalized Isolation Level Definitions**: [Research Paper](http://pmg.csail.mit.edu/papers/icde00.pdf) (A Adya, B Liskov, P O'Neil 2000) 37 | 38 | **2. Blog Posts on MVCC Implementation** 39 | - **Implementing Your Own Transactions with MVCC**: [Blog Post](https://levelup.gitconnected.com/implementing-your-own-transactions-with-mvcc-bba11cab8e70) (E Chance 2015) 40 | - **How Postgres Makes Transactions Atomic**: [Blog Post](https://brandur.org/postgres-atomicity) (B Leach 2017) 41 | -------------------------------------------------------------------------------- /src/bin/entangledb.rs: -------------------------------------------------------------------------------- 1 | /* 2 | * entangledb is the entangledb server. It takes configuration via a configuration file, command-line 3 | * parameters, and environment variables, then starts up a entangledb TCP server that communicates with 4 | * SQL clients (port 3205) and Raft peers (port 3305). 
5 | */ 6 | 7 | #![warn(clippy::all)] 8 | 9 | use serde_derive::Deserialize; 10 | use std::collections::HashMap; 11 | use entangledb::error::{Error, Result}; 12 | use entangledb::raft; 13 | use entangledb::sql; 14 | use entangledb::storage; 15 | use entangledb::Server; 16 | 17 | #[tokio::main] 18 | async fn main() -> Result<()> { 19 | let args = clap::command!() 20 | .arg( 21 | clap::Arg::new("config") 22 | .short('c') 23 | .long("config") 24 | .help("Configuration file path") 25 | .default_value("config/entangledb.yaml"), 26 | ) 27 | .get_matches(); 28 | let cfg = Config::new(args.get_one::("config").unwrap().as_ref())?; 29 | 30 | let loglevel = cfg.log_level.parse::()?; 31 | let mut logconfig = simplelog::ConfigBuilder::new(); 32 | if loglevel != simplelog::LevelFilter::Debug { 33 | logconfig.add_filter_allow_str("entangledb"); 34 | } 35 | simplelog::SimpleLogger::init(loglevel, logconfig.build())?; 36 | 37 | let path = std::path::Path::new(&cfg.data_dir); 38 | let raft_log = match cfg.storage_raft.as_str() { 39 | "bitcask" | "" => raft::Log::new( 40 | Box::new(storage::engine::BitCask::new_compact( 41 | path.join("log"), 42 | cfg.compact_threshold, 43 | )?), 44 | cfg.sync, 45 | )?, 46 | "memory" => raft::Log::new(Box::new(storage::engine::Memory::new()), false)?, 47 | name => return Err(Error::Config(format!("Unknown Raft storage engine {}", name))), 48 | }; 49 | let raft_state: Box = match cfg.storage_sql.as_str() { 50 | "bitcask" | "" => { 51 | let engine = 52 | storage::engine::BitCask::new_compact(path.join("state"), cfg.compact_threshold)?; 53 | Box::new(sql::engine::Raft::new_state(engine)?) 54 | } 55 | "memory" => { 56 | let engine = storage::engine::Memory::new(); 57 | Box::new(sql::engine::Raft::new_state(engine)?) 58 | } 59 | name => return Err(Error::Config(format!("Unknown SQL storage engine {}", name))), 60 | }; 61 | 62 | Server::new(cfg.id, cfg.peers, raft_log, raft_state) 63 | .await? 64 | .listen(&cfg.listen_sql, &cfg.listen_raft) 65 | .await? 
66 | .serve() 67 | .await 68 | } 69 | 70 | #[derive(Debug, Deserialize)] 71 | struct Config { 72 | id: raft::NodeID, 73 | peers: HashMap, 74 | listen_sql: String, 75 | listen_raft: String, 76 | log_level: String, 77 | data_dir: String, 78 | compact_threshold: f64, 79 | sync: bool, 80 | storage_raft: String, 81 | storage_sql: String, 82 | } 83 | 84 | impl Config { 85 | fn new(file: &str) -> Result { 86 | Ok(config::Config::builder() 87 | .set_default("id", "entangledb")? 88 | .set_default("listen_sql", "0.0.0.0:3205")? 89 | .set_default("listen_raft", "0.0.0.0:3305")? 90 | .set_default("log_level", "info")? 91 | .set_default("data_dir", "data")? 92 | .set_default("compact_threshold", 0.2)? 93 | .set_default("sync", true)? 94 | .set_default("storage_raft", "bitcask")? 95 | .set_default("storage_sql", "bitcask")? 96 | .add_source(config::File::with_name(file)) 97 | .add_source(config::Environment::with_prefix("entangledb")) 98 | .build()? 99 | .try_deserialize()?) 100 | } 101 | } 102 | -------------------------------------------------------------------------------- /src/bin/entanglesql.rs: -------------------------------------------------------------------------------- 1 | #![warn(clippy::all)] 2 | 3 | use rustyline::history::DefaultHistory; 4 | use rustyline::validate::{ValidationContext, ValidationResult, Validator}; 5 | use rustyline::{error::ReadlineError, Editor, Modifiers}; 6 | use rustyline_derive::{Completer, Helper, Highlighter, Hinter}; 7 | use entangledb::error::{Error, Result}; 8 | use entangledb::sql::execution::ResultSet; 9 | use entangledb::sql::parser::{Lexer, Token}; 10 | use entangledb::Client; 11 | 12 | #[tokio::main] 13 | async fn main() -> Result<()> { 14 | let opts = clap::command!() 15 | .name("entanglesql") 16 | .about("An EntangleDB client.") 17 | .args([ 18 | clap::Arg::new("command"), 19 | clap::Arg::new("host") 20 | .short('H') 21 | .long("host") 22 | .help("Host to connect to") 23 | .default_value("127.0.0.1"), 24 | clap::Arg::new("port") 
25 | .short('p') 26 | .long("port") 27 | .help("Port number to connect to") 28 | .value_parser(clap::value_parser!(u16)) 29 | .default_value("3205"), 30 | ]) 31 | .get_matches(); 32 | 33 | let mut entanglesql = 34 | EntangleSQL::new(opts.get_one::("host").unwrap(), *opts.get_one("port").unwrap()) 35 | .await?; 36 | 37 | if let Some(command) = opts.get_one::("command") { 38 | entanglesql.execute(command).await 39 | } else { 40 | entanglesql.run().await 41 | } 42 | } 43 | 44 | /// The EntangleSQL REPL 45 | struct EntangleSQL { 46 | client: Client, 47 | editor: Editor, 48 | history_path: Option, 49 | show_headers: bool, 50 | } 51 | 52 | impl EntangleSQL { 53 | async fn new(host: &str, port: u16) -> Result { 54 | Ok(Self { 55 | client: Client::new((host, port)).await?, 56 | editor: Editor::new()?, 57 | history_path: std::env::var_os("HOME") 58 | .map(|home| std::path::Path::new(&home).join(".entanglesql.history")), 59 | show_headers: false, 60 | }) 61 | } 62 | 63 | /// Executes a line of input 64 | async fn execute(&mut self, input: &str) -> Result<()> { 65 | if input.starts_with('!') { 66 | self.execute_command(input).await 67 | } else if !input.is_empty() { 68 | self.execute_query(input).await 69 | } else { 70 | Ok(()) 71 | } 72 | } 73 | 74 | /// Handles a REPL command (prefixed by !, e.g. 
    /// Handles a REPL command (prefixed by !, e.g. !help).
    async fn execute_command(&mut self, input: &str) -> Result<()> {
        let mut input = input.split_ascii_whitespace();
        let command = input.next().ok_or_else(|| Error::Parse("Expected command.".to_string()))?;

        // Consumes the remaining whitespace-separated tokens as arguments,
        // erroring unless exactly n of them were given.
        let getargs = |n| {
            let args: Vec<&str> = input.collect();
            if args.len() != n {
                Err(Error::Parse(format!("{}: expected {} args, got {}", command, n, args.len())))
            } else {
                Ok(args)
            }
        };

        match command {
            "!headers" => match getargs(1)?[0] {
                "on" => {
                    self.show_headers = true;
                    println!("Headers enabled");
                }
                "off" => {
                    self.show_headers = false;
                    println!("Headers disabled");
                }
                v => return Err(Error::Parse(format!("Invalid value {}, expected on or off", v))),
            },
            "!help" => println!(
                r#"
Enter a SQL statement terminated by a semicolon (;) to execute it and display the result.
The following commands are also available:

    !headers       Enable or disable column headers
    !help          This help message
    !status        Display server status
    !table [table] Display table schema, if it exists
    !tables        List tables
"#
            ),
            "!status" => {
                let status = self.client.status().await?;
                // Collect per-node last log indexes as "id:index", sorted by node.
                let mut node_logs = status
                    .raft
                    .node_last_index
                    .iter()
                    .map(|(id, index)| format!("{}:{}", id, index))
                    .collect::<Vec<String>>();
                node_logs.sort();
                println!(
                    r#"
Server: {server} (leader {leader} in term {term} with {nodes} nodes)
Raft log: {committed} committed, {applied} applied, {raft_size} MB ({raft_storage} storage)
Node logs: {logs}
MVCC: {active_txns} active txns, {versions} versions
Storage: {keys} keys, {logical_size} MB logical, {nodes}x {disk_size} MB disk, {garbage_percent}% garbage ({sql_storage} engine)
"#,
                    server = status.raft.server,
                    leader = status.raft.leader,
                    term = status.raft.term,
                    nodes = status.raft.node_last_index.len(),
                    committed = status.raft.commit_index,
                    applied = status.raft.apply_index,
                    raft_storage = status.raft.storage,
                    // Sizes are reported in MB (base 1000) with 3 decimals.
                    raft_size =
                        format_args!("{:.3}", status.raft.storage_size as f64 / 1000.0 / 1000.0),
                    logs = node_logs.join(" "),
                    versions = status.mvcc.versions,
                    active_txns = status.mvcc.active_txns,
                    keys = status.mvcc.storage.keys,
                    logical_size =
                        format_args!("{:.3}", status.mvcc.storage.size as f64 / 1000.0 / 1000.0),
                    // Guard against division by zero when the store is empty.
                    garbage_percent = format_args!(
                        "{:.0}",
                        if status.mvcc.storage.total_disk_size > 0 {
                            status.mvcc.storage.garbage_disk_size as f64
                                / status.mvcc.storage.total_disk_size as f64
                                * 100.0
                        } else {
                            0.0
                        }
                    ),
                    disk_size = format_args!(
                        "{:.3}",
                        status.mvcc.storage.total_disk_size as f64 / 1000.0 / 1000.0
                    ),
                    sql_storage = status.mvcc.storage.name,
                )
            }
            "!table" => {
                let args = getargs(1)?;
                println!("{}", self.client.get_table(args[0]).await?);
            }
            "!tables" => {
                getargs(0)?;
                for table in self.client.list_tables().await? {
                    println!("{}", table)
                }
            }
            c => return Err(Error::Parse(format!("Unknown command {}", c))),
        }
        Ok(())
    }

    /// Runs a query and displays the results, one pipe-separated row per line.
    async fn execute_query(&mut self, query: &str) -> Result<()> {
        match self.client.execute(query).await? {
            ResultSet::Begin { version, read_only } => match read_only {
                false => println!("Began transaction at new version {}", version),
                true => println!("Began read-only transaction at version {}", version),
            },
            ResultSet::Commit { version: id } => println!("Committed transaction {}", id),
            ResultSet::Rollback { version: id } => println!("Rolled back transaction {}", id),
            ResultSet::Create { count } => println!("Created {} rows", count),
            ResultSet::Delete { count } => println!("Deleted {} rows", count),
            ResultSet::Update { count } => println!("Updated {} rows", count),
            ResultSet::CreateTable { name } => println!("Created table {}", name),
            ResultSet::DropTable { name } => println!("Dropped table {}", name),
            ResultSet::Explain(plan) => println!("{}", plan),
            ResultSet::Query { columns, mut rows } => {
                // Optionally print a header line with column names ("?" when unnamed).
                if self.show_headers {
                    println!(
                        "{}",
                        columns
                            .iter()
                            .map(|c| c.name.as_deref().unwrap_or("?"))
                            .collect::<Vec<_>>()
                            .join("|")
                    );
                }
                // Stream rows one at a time, propagating any row-level errors.
                while let Some(row) = rows.next().transpose()? {
                    println!(
                        "{}",
                        row.into_iter().map(|v| format!("{}", v)).collect::<Vec<_>>().join("|")
                    );
                }
            }
        }
        Ok(())
    }

    /// Prompts the user for input, returning None on EOF or interrupt.
    fn prompt(&mut self) -> Result<Option<String>> {
        // The prompt reflects transaction state: `:` marks a read-write
        // transaction, `@` a read-only one.
        let prompt = match self.client.txn() {
            Some((version, false)) => format!("entangledb:{}> ", version),
            Some((version, true)) => format!("entangledb@{}> ", version),
            None => "entangledb> ".into(),
        };
        match self.editor.readline(&prompt) {
            Ok(input) => {
                self.editor.add_history_entry(&input)?;
                Ok(Some(input.trim().to_string()))
            }
            Err(ReadlineError::Eof) | Err(ReadlineError::Interrupted) => Ok(None),
            Err(err) => Err(err.into()),
        }
    }

    /// Runs the EntangleSQL REPL until EOF, interrupt, or an internal error.
    async fn run(&mut self) -> Result<()> {
        if let Some(path) = &self.history_path {
            match self.editor.load_history(path) {
                Ok(_) => {}
                // A missing history file is expected on first run.
                Err(ReadlineError::Io(ref err)) if err.kind() == std::io::ErrorKind::NotFound => {}
                Err(err) => return Err(err.into()),
            };
        }
        self.editor.set_helper(Some(EntangleInputValidator));
        // Make sure multiline pastes are interpreted as normal inputs.
        self.editor.bind_sequence(
            rustyline::KeyEvent(rustyline::KeyCode::BracketedPasteStart, Modifiers::NONE),
            rustyline::Cmd::Noop,
        );

        let status = self.client.status().await?;
        println!(
            "Connected to EntangleDB node \"{}\". Enter !help for instructions.",
            status.raft.server
        );

        while let Some(input) = self.prompt()? {
            match self.execute(&input).await {
                Ok(()) => {}
                // Internal errors may leave the session in an unknown state, so bail out.
                error @ Err(Error::Internal(_)) => return error,
                Err(error) => println!("Error: {}", error),
            }
        }

        if let Some(path) = &self.history_path {
            self.editor.save_history(path)?;
        }
        Ok(())
    }
}

/// A Rustyline helper for multiline editing. It parses input lines and
/// determines if they make up a complete command or not.
#[derive(Completer, Helper, Highlighter, Hinter)]
struct EntangleInputValidator;

impl Validator for EntangleInputValidator {
    /// Accepts empty lines, `!` commands, and any input containing a semicolon
    /// or a lexer error; otherwise asks Rustyline to wait for more input.
    fn validate(&self, ctx: &mut ValidationContext) -> rustyline::Result<ValidationResult> {
        let input = ctx.input();

        // Empty lines and ! commands are fine.
        if input.is_empty() || input.starts_with('!') || input == ";" {
            return Ok(ValidationResult::Valid(None));
        }

        // For SQL statements, just look for any semicolon or lexer error and if found accept the
        // input and rely on the server to do further validation and error handling. Otherwise,
        // wait for more input.
        for result in Lexer::new(ctx.input()) {
            match result {
                Ok(Token::Semicolon) => return Ok(ValidationResult::Valid(None)),
                Err(_) => return Ok(ValidationResult::Valid(None)),
                _ => {}
            }
        }
        Ok(ValidationResult::Incomplete)
    }

    /// Only validate on Enter, not on every keystroke.
    fn validate_while_typing(&self) -> bool {
        false
    }
}
51 | self.call_locked(&mut conn, request).await 52 | } 53 | 54 | /// Call a server method while holding the mutex lock 55 | async fn call_locked( 56 | &self, 57 | conn: &mut MutexGuard<'_, Connection>, 58 | request: Request, 59 | ) -> Result { 60 | conn.send(request).await?; 61 | match conn.try_next().await? { 62 | Some(result) => result, 63 | None => Err(Error::Internal("Server disconnected".into())), 64 | } 65 | } 66 | 67 | /// Executes a query 68 | pub async fn execute(&self, query: &str) -> Result { 69 | let mut conn = self.conn.lock().await; 70 | let mut resultset = 71 | match self.call_locked(&mut conn, Request::Execute(query.into())).await? { 72 | Response::Execute(rs) => rs, 73 | resp => return Err(Error::Internal(format!("Unexpected response {:?}", resp))), 74 | }; 75 | if let ResultSet::Query { columns, .. } = resultset { 76 | // FIXME We buffer rows for now to avoid lifetime hassles 77 | let mut rows = Vec::new(); 78 | while let Some(result) = conn.try_next().await? { 79 | match result? { 80 | Response::Row(Some(row)) => rows.push(row), 81 | Response::Row(None) => break, 82 | response => { 83 | return Err(Error::Internal(format!("Unexpected response {:?}", response))) 84 | } 85 | } 86 | } 87 | resultset = ResultSet::Query { columns, rows: Box::new(rows.into_iter().map(Ok)) } 88 | }; 89 | match &resultset { 90 | ResultSet::Begin { version, read_only } => self.txn.set(Some((*version, *read_only))), 91 | ResultSet::Commit { .. } => self.txn.set(None), 92 | ResultSet::Rollback { .. } => self.txn.set(None), 93 | _ => {} 94 | } 95 | Ok(resultset) 96 | } 97 | 98 | /// Fetches the table schema as SQL 99 | pub async fn get_table(&self, table: &str) -> Result { 100 | match self.call(Request::GetTable(table.into())).await? 
{
            Response::GetTable(t) => Ok(t),
            resp => Err(Error::Value(format!("Unexpected response: {:?}", resp))),
        }
    }

    /// Lists database tables.
    pub async fn list_tables(&self) -> Result<Vec<String>> {
        match self.call(Request::ListTables).await? {
            Response::ListTables(t) => Ok(t),
            resp => Err(Error::Value(format!("Unexpected response: {:?}", resp))),
        }
    }

    /// Checks server status.
    pub async fn status(&self) -> Result<Status> {
        match self.call(Request::Status).await? {
            Response::Status(s) => Ok(s),
            resp => Err(Error::Value(format!("Unexpected response: {:?}", resp))),
        }
    }

    /// Returns the version and read-only state of the txn, if one is open.
    pub fn txn(&self) -> Option<(u64, bool)> {
        self.txn.get()
    }

    /// Runs a query in a transaction, automatically retrying serialization failures with
    /// exponential backoff.
    ///
    /// The closure is invoked with a clone of this client inside a BEGIN/COMMIT
    /// pair. On Error::Serialization or Error::Abort the transaction is rolled
    /// back and retried up to WITH_TXN_RETRIES times; other errors roll back
    /// and return immediately.
    pub async fn with_txn<W, F, T>(&self, mut with: W) -> Result<T>
    where
        W: FnMut(Client) -> F,
        F: Future<Output = Result<T>>,
    {
        for i in 0..WITH_TXN_RETRIES {
            if i > 0 {
                // Exponential backoff with jitter: 2^(i-1) * rand(25..=75) ms.
                tokio::time::sleep(std::time::Duration::from_millis(
                    2_u64.pow(i as u32 - 1) * rand::thread_rng().gen_range(25..=75),
                ))
                .await;
            }
            let result = async {
                self.execute("BEGIN").await?;
                let result = with(self.clone()).await?;
                self.execute("COMMIT").await?;
                Ok(result)
            }
            .await;
            if result.is_err() {
                // Best-effort rollback; a failed rollback is ignored since the
                // original error is what matters to the caller.
                self.execute("ROLLBACK").await.ok();
                if matches!(result, Err(Error::Serialization) | Err(Error::Abort)) {
                    continue;
                }
            }
            return result;
        }
        Err(Error::Serialization)
    }
}

/// A entangledb client pool, handing out clients protected by per-client mutexes.
pub struct Pool {
    clients: Vec<Mutex<Client>>,
}

impl Pool {
    /// Creates a new connection pool for the given servers, eagerly connecting clients.
167 | pub async fn new(addrs: Vec, size: u64) -> Result { 168 | let mut addrs = addrs.into_iter().cycle(); 169 | let clients = futures::future::try_join_all( 170 | std::iter::from_fn(|| { 171 | Some(Client::new(addrs.next().unwrap()).map(|r| r.map(Mutex::new))) 172 | }) 173 | .take(size as usize), 174 | ) 175 | .await?; 176 | Ok(Self { clients }) 177 | } 178 | 179 | /// Fetches a client from the pool. It is reset (i.e. any open txns are rolled back) and 180 | /// returned when it goes out of scope. 181 | pub async fn get(&self) -> PoolClient<'_> { 182 | let (client, index, _) = 183 | futures::future::select_all(self.clients.iter().map(|m| m.lock().boxed())).await; 184 | PoolClient::new(index, client) 185 | } 186 | 187 | /// Returns the size of the pool 188 | pub fn size(&self) -> usize { 189 | self.clients.len() 190 | } 191 | } 192 | 193 | /// A client returned from the pool 194 | pub struct PoolClient<'a> { 195 | id: usize, 196 | client: MutexGuard<'a, Client>, 197 | } 198 | 199 | impl<'a> PoolClient<'a> { 200 | /// Creates a new PoolClient 201 | fn new(id: usize, client: MutexGuard<'a, Client>) -> Self { 202 | Self { id, client } 203 | } 204 | 205 | /// Returns the ID of the client in the pool 206 | pub fn id(&self) -> usize { 207 | self.id 208 | } 209 | } 210 | 211 | impl<'a> Deref for PoolClient<'a> { 212 | type Target = MutexGuard<'a, Client>; 213 | 214 | fn deref(&self) -> &Self::Target { 215 | &self.client 216 | } 217 | } 218 | 219 | impl<'a> Drop for PoolClient<'a> { 220 | fn drop(&mut self) { 221 | if self.txn().is_some() { 222 | // FIXME This should disconnect or destroy the client if it errors. 
223 | futures::executor::block_on(self.client.execute("ROLLBACK")).ok(); 224 | } 225 | } 226 | } 227 | -------------------------------------------------------------------------------- /src/error.rs: -------------------------------------------------------------------------------- 1 | use serde_derive::{Deserialize, Serialize}; 2 | use std::fmt::{self, Display}; 3 | 4 | /// Result returning Error 5 | pub type Result = std::result::Result; 6 | 7 | /// entangledb errors. All except Internal are considered user-facing. 8 | #[derive(Clone, Debug, PartialEq, Serialize, Deserialize)] 9 | pub enum Error { 10 | Abort, 11 | Config(String), 12 | Internal(String), 13 | Parse(String), 14 | ReadOnly, 15 | Serialization, 16 | Value(String), 17 | } 18 | 19 | impl std::error::Error for Error {} 20 | 21 | impl Display for Error { 22 | fn fmt(&self, f: &mut std::fmt::Formatter) -> fmt::Result { 23 | match self { 24 | Error::Config(s) | Error::Internal(s) | Error::Parse(s) | Error::Value(s) => { 25 | write!(f, "{}", s) 26 | } 27 | Error::Abort => write!(f, "Operation aborted"), 28 | Error::Serialization => write!(f, "Serialization failure, retry transaction"), 29 | Error::ReadOnly => write!(f, "Read-only transaction"), 30 | } 31 | } 32 | } 33 | 34 | impl serde::ser::Error for Error { 35 | fn custom(msg: T) -> Self { 36 | Error::Internal(msg.to_string()) 37 | } 38 | } 39 | 40 | impl serde::de::Error for Error { 41 | fn custom(msg: T) -> Self { 42 | Error::Internal(msg.to_string()) 43 | } 44 | } 45 | 46 | impl From> for Error { 47 | fn from(err: Box) -> Self { 48 | Error::Internal(err.to_string()) 49 | } 50 | } 51 | 52 | impl From for Error { 53 | fn from(err: config::ConfigError) -> Self { 54 | Error::Config(err.to_string()) 55 | } 56 | } 57 | 58 | impl From for Error { 59 | fn from(err: hex::FromHexError) -> Self { 60 | Error::Internal(err.to_string()) 61 | } 62 | } 63 | 64 | impl From for Error { 65 | fn from(err: log::ParseLevelError) -> Self { 66 | Error::Config(err.to_string()) 67 
| } 68 | } 69 | 70 | impl From for Error { 71 | fn from(err: log::SetLoggerError) -> Self { 72 | Error::Config(err.to_string()) 73 | } 74 | } 75 | 76 | impl From for Error { 77 | fn from(err: regex::Error) -> Self { 78 | Error::Value(err.to_string()) 79 | } 80 | } 81 | 82 | impl From for Error { 83 | fn from(err: rustyline::error::ReadlineError) -> Self { 84 | Error::Internal(err.to_string()) 85 | } 86 | } 87 | 88 | impl From for Error { 89 | fn from(err: std::array::TryFromSliceError) -> Self { 90 | Error::Internal(err.to_string()) 91 | } 92 | } 93 | 94 | impl From for Error { 95 | fn from(err: std::num::TryFromIntError) -> Self { 96 | Error::Value(err.to_string()) 97 | } 98 | } 99 | 100 | impl From for Error { 101 | fn from(err: std::io::Error) -> Self { 102 | Error::Internal(err.to_string()) 103 | } 104 | } 105 | 106 | impl From for Error { 107 | fn from(err: std::net::AddrParseError) -> Self { 108 | Error::Internal(err.to_string()) 109 | } 110 | } 111 | 112 | impl From for Error { 113 | fn from(err: std::num::ParseFloatError) -> Self { 114 | Error::Parse(err.to_string()) 115 | } 116 | } 117 | 118 | impl From for Error { 119 | fn from(err: std::num::ParseIntError) -> Self { 120 | Error::Parse(err.to_string()) 121 | } 122 | } 123 | 124 | impl From for Error { 125 | fn from(err: std::string::FromUtf8Error) -> Self { 126 | Error::Internal(err.to_string()) 127 | } 128 | } 129 | 130 | impl From> for Error { 131 | fn from(err: std::sync::PoisonError) -> Self { 132 | Error::Internal(err.to_string()) 133 | } 134 | } 135 | 136 | impl From for Error { 137 | fn from(err: tokio::task::JoinError) -> Self { 138 | Error::Internal(err.to_string()) 139 | } 140 | } 141 | 142 | impl From for Error { 143 | fn from(err: tokio::sync::mpsc::error::TryRecvError) -> Self { 144 | Error::Internal(err.to_string()) 145 | } 146 | } 147 | 148 | impl From> for Error { 149 | fn from(err: tokio::sync::mpsc::error::SendError) -> Self { 150 | Error::Internal(err.to_string()) 151 | } 152 | } 153 | 
154 | impl From> for Error { 155 | fn from(err: tokio::sync::mpsc::error::TrySendError) -> Self { 156 | Error::Internal(err.to_string()) 157 | } 158 | } 159 | 160 | impl From for Error { 161 | fn from(err: tokio::sync::oneshot::error::RecvError) -> Self { 162 | Error::Internal(err.to_string()) 163 | } 164 | } 165 | -------------------------------------------------------------------------------- /src/lib.rs: -------------------------------------------------------------------------------- 1 | #![warn(clippy::all)] 2 | #![allow(clippy::new_without_default)] 3 | #![allow(clippy::unneeded_field_pattern)] 4 | 5 | pub mod client; 6 | pub mod error; 7 | pub mod raft; 8 | pub mod server; 9 | pub mod sql; 10 | pub mod storage; 11 | 12 | pub use client::Client; 13 | pub use server::Server; 14 | -------------------------------------------------------------------------------- /src/raft/message.rs: -------------------------------------------------------------------------------- 1 | use super::{Entry, Index, NodeID, Status, Term}; 2 | use crate::error::Result; 3 | 4 | use serde_derive::{Deserialize, Serialize}; 5 | 6 | /// A message address. 7 | #[derive(Clone, Debug, Eq, Hash, PartialEq, Serialize, Deserialize)] 8 | pub enum Address { 9 | /// Broadcast to all peers. Only valid as an outbound recipient (to). 10 | Broadcast, 11 | /// A node with the specified node ID (local or remote). Valid both as 12 | /// sender and recipient. 13 | Node(NodeID), 14 | /// A local client. Can only send ClientRequest messages, and receive 15 | /// ClientResponse messages. 16 | Client, 17 | } 18 | 19 | impl Address { 20 | /// Unwraps the node ID, or panics if address is not of kind Node. 21 | pub fn unwrap(&self) -> NodeID { 22 | match self { 23 | Self::Node(id) => *id, 24 | _ => panic!("unwrap called on non-Node address {:?}", self), 25 | } 26 | } 27 | } 28 | 29 | /// A message passed between Raft nodes. 
30 | #[derive(Clone, Debug, PartialEq, Serialize, Deserialize)] 31 | pub struct Message { 32 | /// The current term of the sender. Must be set, unless the sender is 33 | /// Address::Client, in which case it must be 0. 34 | pub term: Term, 35 | /// The sender address. 36 | pub from: Address, 37 | /// The recipient address. 38 | pub to: Address, 39 | /// The message payload. 40 | pub event: Event, 41 | } 42 | 43 | /// An event contained within messages. 44 | #[derive(Clone, Debug, PartialEq, Serialize, Deserialize)] 45 | pub enum Event { 46 | /// Leaders send periodic heartbeats to its followers. 47 | Heartbeat { 48 | /// The index of the leader's last committed log entry. 49 | commit_index: Index, 50 | /// The term of the leader's last committed log entry. 51 | commit_term: Term, 52 | }, 53 | /// Followers confirm loyalty to leader after heartbeats. 54 | ConfirmLeader { 55 | /// The commit_index of the original leader heartbeat, to confirm 56 | /// read requests. 57 | commit_index: Index, 58 | /// If false, the follower does not have the entry at commit_index 59 | /// and would like the leader to replicate it. 60 | has_committed: bool, 61 | }, 62 | 63 | /// Candidates solicit votes from all peers when campaigning for leadership. 64 | SolicitVote { 65 | // The index of the candidate's last stored log entry 66 | last_index: Index, 67 | // The term of the candidate's last stored log entry 68 | last_term: Term, 69 | }, 70 | 71 | /// Followers may grant a single vote to a candidate per term, on a 72 | /// first-come basis. Candidates implicitly vote for themselves. 73 | GrantVote, 74 | 75 | /// Leaders replicate log entries to followers by appending it to their log. 76 | AppendEntries { 77 | /// The index of the log entry immediately preceding the submitted commands. 78 | base_index: Index, 79 | /// The term of the log entry immediately preceding the submitted commands. 80 | base_term: Term, 81 | /// Commands to replicate. 
82 | entries: Vec, 83 | }, 84 | /// Followers may accept a set of log entries from a leader. 85 | AcceptEntries { 86 | /// The index of the last log entry. 87 | last_index: Index, 88 | }, 89 | /// Followers may also reject a set of log entries from a leader. 90 | RejectEntries, 91 | 92 | /// A client request. This can be submitted to the leader, or to a follower 93 | /// which will forward it to its leader. If there is no leader, or the 94 | /// leader or term changes, the request is aborted with an Error::Abort 95 | /// ClientResponse and the client must retry. 96 | ClientRequest { 97 | /// The request ID. This is arbitrary, but must be globally unique for 98 | /// the duration of the request. 99 | id: RequestID, 100 | /// The request. 101 | request: Request, 102 | }, 103 | 104 | /// A client response. 105 | ClientResponse { 106 | /// The response ID. This matches the ID of the ClientRequest. 107 | id: RequestID, 108 | /// The response, or an error. 109 | response: Result, 110 | }, 111 | } 112 | 113 | /// A client request ID. 114 | pub type RequestID = Vec; 115 | 116 | /// A client request. 117 | #[derive(Clone, Debug, PartialEq, Serialize, Deserialize)] 118 | pub enum Request { 119 | Query(Vec), 120 | Mutate(Vec), 121 | Status, 122 | } 123 | 124 | /// A client response. 
125 | #[derive(Clone, Debug, PartialEq, Serialize, Deserialize)] 126 | pub enum Response { 127 | Query(Vec), 128 | Mutate(Vec), 129 | Status(Status), 130 | } 131 | -------------------------------------------------------------------------------- /src/raft/mod.rs: -------------------------------------------------------------------------------- 1 | mod log; 2 | mod message; 3 | mod node; 4 | mod server; 5 | mod state; 6 | 7 | pub use self::log::{Engine, Entry, Index, Log}; 8 | pub use message::{Address, Event, Message, Request, RequestID, Response}; 9 | pub use node::{Node, NodeID, Status, Term}; 10 | pub use server::Server; 11 | pub use state::{Driver, Instruction, State}; 12 | -------------------------------------------------------------------------------- /src/raft/server.rs: -------------------------------------------------------------------------------- 1 | use super::{Address, Event, Log, Message, Node, NodeID, Request, Response, State}; 2 | use crate::error::{Error, Result}; 3 | 4 | use ::log::{debug, error}; 5 | use futures::{sink::SinkExt as _, FutureExt as _}; 6 | use std::collections::HashMap; 7 | use std::time::Duration; 8 | use tokio::net::{TcpListener, TcpStream}; 9 | use tokio::sync::{mpsc, oneshot}; 10 | use tokio_stream::wrappers::{ReceiverStream, TcpListenerStream, UnboundedReceiverStream}; 11 | use tokio_stream::StreamExt as _; 12 | use tokio_util::codec::{Framed, LengthDelimitedCodec}; 13 | use uuid::Uuid; 14 | 15 | /// The interval between Raft ticks, the unit of time for e.g. heartbeats and 16 | /// elections. 17 | const TICK_INTERVAL: Duration = Duration::from_millis(100); 18 | 19 | /// A Raft server. 
20 | pub struct Server { 21 | node: Node, 22 | peers: HashMap, 23 | node_rx: mpsc::UnboundedReceiver, 24 | } 25 | 26 | impl Server { 27 | /// Creates a new Raft cluster 28 | pub async fn new( 29 | id: NodeID, 30 | peers: HashMap, 31 | log: Log, 32 | state: Box, 33 | ) -> Result { 34 | let (node_tx, node_rx) = mpsc::unbounded_channel(); 35 | Ok(Self { 36 | node: Node::new(id, peers.keys().copied().collect(), log, state, node_tx).await?, 37 | peers, 38 | node_rx, 39 | }) 40 | } 41 | 42 | /// Connects to peers and serves requests. 43 | pub async fn serve( 44 | self, 45 | listener: TcpListener, 46 | client_rx: mpsc::UnboundedReceiver<(Request, oneshot::Sender>)>, 47 | ) -> Result<()> { 48 | let (tcp_in_tx, tcp_in_rx) = mpsc::unbounded_channel::(); 49 | let (tcp_out_tx, tcp_out_rx) = mpsc::unbounded_channel::(); 50 | let (task, tcp_receiver) = Self::tcp_receive(listener, tcp_in_tx).remote_handle(); 51 | tokio::spawn(task); 52 | let (task, tcp_sender) = Self::tcp_send(self.peers, tcp_out_rx).remote_handle(); 53 | tokio::spawn(task); 54 | let (task, eventloop) = 55 | Self::eventloop(self.node, self.node_rx, client_rx, tcp_in_rx, tcp_out_tx) 56 | .remote_handle(); 57 | tokio::spawn(task); 58 | 59 | tokio::try_join!(tcp_receiver, tcp_sender, eventloop)?; 60 | Ok(()) 61 | } 62 | 63 | /// Runs the event loop. 64 | async fn eventloop( 65 | mut node: Node, 66 | node_rx: mpsc::UnboundedReceiver, 67 | client_rx: mpsc::UnboundedReceiver<(Request, oneshot::Sender>)>, 68 | tcp_rx: mpsc::UnboundedReceiver, 69 | tcp_tx: mpsc::UnboundedSender, 70 | ) -> Result<()> { 71 | let mut node_rx = UnboundedReceiverStream::new(node_rx); 72 | let mut tcp_rx = UnboundedReceiverStream::new(tcp_rx); 73 | let mut client_rx = UnboundedReceiverStream::new(client_rx); 74 | 75 | let mut ticker = tokio::time::interval(TICK_INTERVAL); 76 | let mut requests = HashMap::, oneshot::Sender>>::new(); 77 | loop { 78 | tokio::select! 
{ 79 | _ = ticker.tick() => node = node.tick()?, 80 | 81 | Some(msg) = tcp_rx.next() => node = node.step(msg)?, 82 | 83 | Some(msg) = node_rx.next() => { 84 | match msg { 85 | Message{to: Address::Node(_), ..} => tcp_tx.send(msg)?, 86 | Message{to: Address::Broadcast, ..} => tcp_tx.send(msg)?, 87 | Message{to: Address::Client, event: Event::ClientResponse{ id, response }, ..} => { 88 | if let Some(response_tx) = requests.remove(&id) { 89 | response_tx 90 | .send(response) 91 | .map_err(|e| Error::Internal(format!("Failed to send response {:?}", e)))?; 92 | } 93 | } 94 | _ => return Err(Error::Internal(format!("Unexpected message {:?}", msg))), 95 | } 96 | } 97 | 98 | Some((request, response_tx)) = client_rx.next() => { 99 | let id = Uuid::new_v4().as_bytes().to_vec(); 100 | let msg = Message{ 101 | from: Address::Client, 102 | to: Address::Node(node.id()), 103 | term: 0, 104 | event: Event::ClientRequest{id: id.clone(), request}, 105 | }; 106 | node = node.step(msg)?; 107 | requests.insert(id, response_tx); 108 | } 109 | } 110 | } 111 | } 112 | 113 | /// Receives inbound messages from peers via TCP. 114 | async fn tcp_receive( 115 | listener: TcpListener, 116 | in_tx: mpsc::UnboundedSender, 117 | ) -> Result<()> { 118 | let mut listener = TcpListenerStream::new(listener); 119 | while let Some(socket) = listener.try_next().await? { 120 | let peer = socket.peer_addr()?; 121 | let peer_in_tx = in_tx.clone(); 122 | tokio::spawn(async move { 123 | debug!("Raft peer {} connected", peer); 124 | match Self::tcp_receive_peer(socket, peer_in_tx).await { 125 | Ok(()) => debug!("Raft peer {} disconnected", peer), 126 | Err(err) => error!("Raft peer {} error: {}", peer, err.to_string()), 127 | }; 128 | }); 129 | } 130 | Ok(()) 131 | } 132 | 133 | /// Receives inbound messages from a peer via TCP. 
134 | async fn tcp_receive_peer( 135 | socket: TcpStream, 136 | in_tx: mpsc::UnboundedSender, 137 | ) -> Result<()> { 138 | let mut stream = tokio_serde::SymmetricallyFramed::<_, Message, _>::new( 139 | Framed::new(socket, LengthDelimitedCodec::new()), 140 | tokio_serde::formats::SymmetricalBincode::::default(), 141 | ); 142 | while let Some(message) = stream.try_next().await? { 143 | in_tx.send(message)?; 144 | } 145 | Ok(()) 146 | } 147 | 148 | /// Sends outbound messages to peers via TCP. 149 | async fn tcp_send( 150 | peers: HashMap, 151 | out_rx: mpsc::UnboundedReceiver, 152 | ) -> Result<()> { 153 | let mut out_rx = UnboundedReceiverStream::new(out_rx); 154 | let mut peer_txs: HashMap> = HashMap::new(); 155 | 156 | for (id, addr) in peers.into_iter() { 157 | let (tx, rx) = mpsc::channel::(1000); 158 | peer_txs.insert(id, tx); 159 | tokio::spawn(Self::tcp_send_peer(addr, rx)); 160 | } 161 | 162 | while let Some(message) = out_rx.next().await { 163 | let to = match message.to { 164 | Address::Broadcast => peer_txs.keys().copied().collect(), 165 | Address::Node(peer) => vec![peer], 166 | addr => { 167 | error!("Received outbound message for non-TCP address {:?}", addr); 168 | continue; 169 | } 170 | }; 171 | for id in to { 172 | match peer_txs.get_mut(&id) { 173 | Some(tx) => match tx.try_send(message.clone()) { 174 | Ok(()) => {} 175 | Err(mpsc::error::TrySendError::Full(_)) => { 176 | debug!("Full send buffer for peer {}, discarding message", id) 177 | } 178 | Err(error) => return Err(error.into()), 179 | }, 180 | None => error!("Received outbound message for unknown peer {}", id), 181 | } 182 | } 183 | } 184 | Ok(()) 185 | } 186 | 187 | /// Sends outbound messages to a peer, continuously reconnecting. 
188 | async fn tcp_send_peer(addr: String, out_rx: mpsc::Receiver) { 189 | let mut out_rx = ReceiverStream::new(out_rx); 190 | loop { 191 | match TcpStream::connect(&addr).await { 192 | Ok(socket) => { 193 | debug!("Connected to Raft peer {}", addr); 194 | match Self::tcp_send_peer_session(socket, &mut out_rx).await { 195 | Ok(()) => break, 196 | Err(err) => error!("Failed sending to Raft peer {}: {}", addr, err), 197 | } 198 | } 199 | Err(err) => error!("Failed connecting to Raft peer {}: {}", addr, err), 200 | } 201 | tokio::time::sleep(Duration::from_millis(1000)).await; 202 | } 203 | debug!("Disconnected from Raft peer {}", addr); 204 | } 205 | 206 | /// Sends outbound messages to a peer via a TCP session. 207 | async fn tcp_send_peer_session( 208 | socket: TcpStream, 209 | out_rx: &mut ReceiverStream, 210 | ) -> Result<()> { 211 | let mut stream = tokio_serde::SymmetricallyFramed::<_, Message, _>::new( 212 | Framed::new(socket, LengthDelimitedCodec::new()), 213 | tokio_serde::formats::SymmetricalBincode::::default(), 214 | ); 215 | while let Some(message) = out_rx.next().await { 216 | stream.send(message).await?; 217 | } 218 | Ok(()) 219 | } 220 | } 221 | -------------------------------------------------------------------------------- /src/server.rs: -------------------------------------------------------------------------------- 1 | use crate::error::{Error, Result}; 2 | use crate::raft; 3 | use crate::sql; 4 | use crate::sql::engine::Engine as _; 5 | use crate::sql::execution::ResultSet; 6 | use crate::sql::schema::{Catalog as _, Table}; 7 | use crate::sql::types::Row; 8 | 9 | use ::log::{debug, error, info}; 10 | use futures::sink::SinkExt as _; 11 | use serde_derive::{Deserialize, Serialize}; 12 | use std::collections::HashMap; 13 | use tokio::net::{TcpListener, TcpStream}; 14 | use tokio::sync::mpsc; 15 | use tokio_stream::wrappers::TcpListenerStream; 16 | use tokio_stream::StreamExt as _; 17 | use tokio_util::codec::{Framed, LengthDelimitedCodec}; 18 | 
19 | /// A entangledb server. 20 | /// It encapsulates the Raft consensus server and SQL server functionalities. 21 | /// The server manages both Raft and SQL client connections, processing incoming 22 | /// requests and dispatching them to the appropriate internal components. 23 | pub struct Server { 24 | raft: raft::Server, 25 | raft_listener: Option, 26 | sql_listener: Option, 27 | } 28 | 29 | impl Server { 30 | /// Creates a new entangledb server. 31 | /// Initializes a new server instance with the provided Raft configuration. 32 | /// 33 | /// # Arguments 34 | /// * `id` - The unique identifier for the Raft node. 35 | /// * `peers` - A map of peer node IDs to their associated network addresses. 36 | /// * `raft_log` - The Raft log implementation. 37 | /// * `raft_state` - The persistent state storage for the Raft consensus algorithm. 38 | /// 39 | /// # Returns 40 | /// A result containing the new server instance or an error if initialization fails. 41 | pub async fn new( 42 | id: raft::NodeID, 43 | peers: HashMap, 44 | raft_log: raft::Log, 45 | raft_state: Box, 46 | ) -> Result { 47 | Ok(Server { 48 | raft: raft::Server::new(id, peers, raft_log, raft_state).await?, 49 | raft_listener: None, 50 | sql_listener: None, 51 | }) 52 | } 53 | 54 | /// Starts listening on the given ports. Must be called before serve. 55 | /// Sets up the TCP listeners for both SQL and Raft communication. 56 | /// 57 | /// # Arguments 58 | /// * `sql_addr` - The address to listen for SQL client connections. 59 | /// * `raft_addr` - The address to listen for Raft peer connections. 60 | /// 61 | /// # Returns 62 | /// A result containing the server instance with listeners configured or an error if listening fails. 
63 | pub async fn listen(mut self, sql_addr: &str, raft_addr: &str) -> Result { 64 | let (sql, raft) = 65 | tokio::try_join!(TcpListener::bind(sql_addr), TcpListener::bind(raft_addr),)?; 66 | info!("Listening on {} (SQL) and {} (Raft)", sql.local_addr()?, raft.local_addr()?); 67 | self.sql_listener = Some(sql); 68 | self.raft_listener = Some(raft); 69 | Ok(self) 70 | } 71 | 72 | /// Serves Raft and SQL requests until the returned future is dropped. Consumes the server. 73 | /// Starts the event loop for handling incoming Raft and SQL connections. 74 | /// This function will run indefinitely until the server is shut down. 75 | /// 76 | /// # Returns 77 | /// A result indicating the success or failure of the server event loop. 78 | pub async fn serve(self) -> Result<()> { 79 | let sql_listener = self 80 | .sql_listener 81 | .ok_or_else(|| Error::Internal("Must listen before serving".into()))?; 82 | let raft_listener = self 83 | .raft_listener 84 | .ok_or_else(|| Error::Internal("Must listen before serving".into()))?; 85 | let (raft_tx, raft_rx) = mpsc::unbounded_channel(); 86 | let sql_engine = sql::engine::Raft::new(raft_tx); 87 | 88 | tokio::try_join!( 89 | self.raft.serve(raft_listener, raft_rx), 90 | Self::serve_sql(sql_listener, sql_engine), 91 | )?; 92 | Ok(()) 93 | } 94 | 95 | /// Serves SQL clients. 96 | /// Accepts incoming SQL client connections and handles their requests in separate tasks. 97 | /// 98 | /// # Arguments 99 | /// * `listener` - The TCP listener for SQL client connections. 100 | /// * `engine` - The SQL engine instance used for executing SQL commands. 101 | /// 102 | /// # Returns 103 | /// A result indicating the success or failure of serving SQL clients. 104 | async fn serve_sql(listener: TcpListener, engine: sql::engine::Raft) -> Result<()> { 105 | let mut listener = TcpListenerStream::new(listener); 106 | while let Some(socket) = listener.try_next().await? 
{ 107 | let peer = socket.peer_addr()?; 108 | let session = Session::new(engine.clone())?; 109 | tokio::spawn(async move { 110 | info!("Client {} connected", peer); 111 | match session.handle(socket).await { 112 | Ok(()) => info!("Client {} disconnected", peer), 113 | Err(err) => error!("Client {} error: {}", peer, err), 114 | } 115 | }); 116 | } 117 | Ok(()) 118 | } 119 | } 120 | 121 | /// A client request. 122 | /// Enumerates the different types of requests that a client can send to the server. 123 | #[derive(Debug, Serialize, Deserialize)] 124 | pub enum Request { 125 | Execute(String), 126 | GetTable(String), 127 | ListTables, 128 | Status, 129 | } 130 | 131 | /// A server response. 132 | /// Enumerates the different types of responses that the server can send back to the client. 133 | #[derive(Debug, Serialize, Deserialize)] 134 | pub enum Response { 135 | Execute(ResultSet), 136 | Row(Option), 137 | GetTable(Table), 138 | ListTables(Vec), 139 | Status(sql::engine::Status), 140 | } 141 | 142 | /// A client session coupled to a SQL session. 143 | /// Manages the state and communication for a single client's connection to the SQL server. 144 | pub struct Session { 145 | engine: sql::engine::Raft, 146 | sql: sql::engine::Session, 147 | } 148 | 149 | impl Session { 150 | /// Creates a new client session. 151 | /// Initializes a new session for a client connected to the SQL server. 152 | /// 153 | /// # Arguments 154 | /// * `engine` - The SQL engine instance used for executing SQL commands. 155 | /// 156 | /// # Returns 157 | /// A result containing the new session instance or an error if initialization fails. 158 | fn new(engine: sql::engine::Raft) -> Result { 159 | Ok(Self { sql: engine.session()?, engine }) 160 | } 161 | 162 | /// Handles a client connection. 163 | /// Processes incoming requests from the client and sends appropriate responses. 164 | /// 165 | /// # Arguments 166 | /// * `socket` - The TCP stream representing the client's connection. 
167 | /// 168 | /// # Returns 169 | /// A result indicating the success or failure of handling the client connection. 170 | async fn handle(mut self, socket: TcpStream) -> Result<()> { 171 | let mut stream = tokio_serde::Framed::new( 172 | Framed::new(socket, LengthDelimitedCodec::new()), 173 | tokio_serde::formats::Bincode::default(), 174 | ); 175 | while let Some(request) = stream.try_next().await? { 176 | let mut response = tokio::task::block_in_place(|| self.request(request)); 177 | let mut rows: Box> + Send> = 178 | Box::new(std::iter::empty()); 179 | if let Ok(Response::Execute(ResultSet::Query { rows: ref mut resultrows, .. })) = 180 | &mut response 181 | { 182 | rows = Box::new( 183 | std::mem::replace(resultrows, Box::new(std::iter::empty())) 184 | .map(|result| result.map(|row| Response::Row(Some(row)))) 185 | .chain(std::iter::once(Ok(Response::Row(None)))) 186 | .scan(false, |err_sent, response| match (&err_sent, &response) { 187 | (true, _) => None, 188 | (_, Err(error)) => { 189 | *err_sent = true; 190 | Some(Err(error.clone())) 191 | } 192 | _ => Some(response), 193 | }) 194 | .fuse(), 195 | ); 196 | } 197 | stream.send(response).await?; 198 | stream.send_all(&mut tokio_stream::iter(rows.map(Ok))).await?; 199 | } 200 | Ok(()) 201 | } 202 | 203 | /// Executes a request. 204 | /// Processes a single request from the client and generates the corresponding response. 205 | /// 206 | /// # Arguments 207 | /// * `request` - The client request to be processed. 208 | /// 209 | /// # Returns 210 | /// A result containing the server response to the request or an error if processing fails. 211 | pub fn request(&mut self, request: Request) -> Result { 212 | debug!("Processing request {:?}", request); 213 | let response = match request { 214 | Request::Execute(query) => Response::Execute(self.sql.execute(&query)?), 215 | Request::GetTable(table) => { 216 | Response::GetTable(self.sql.read_with_txn(|txn| txn.must_read_table(&table))?) 
217 | } 218 | Request::ListTables => Response::ListTables( 219 | self.sql.read_with_txn(|txn| Ok(txn.scan_tables()?.map(|t| t.name).collect()))?, 220 | ), 221 | Request::Status => Response::Status(self.engine.status()?), 222 | }; 223 | debug!("Returning response {:?}", response); 224 | Ok(response) 225 | } 226 | } 227 | 228 | impl Drop for Session { 229 | fn drop(&mut self) { 230 | /// Automatically rolls back any active transaction when the session is dropped. 231 | tokio::task::block_in_place(|| self.sql.execute("ROLLBACK").ok()); 232 | } 233 | } 234 | -------------------------------------------------------------------------------- /src/sql/engine/mod.rs: -------------------------------------------------------------------------------- 1 | //! The SQL engine provides fundamental CRUD storage operations. 2 | mod kv; 3 | pub mod raft; 4 | pub use kv::KV; 5 | pub use raft::{Raft, Status}; 6 | 7 | use super::execution::ResultSet; 8 | use super::parser::{ast, Parser}; 9 | use super::plan::Plan; 10 | use super::schema::Catalog; 11 | use super::types::{Expression, Row, Value}; 12 | use crate::error::{Error, Result}; 13 | 14 | use std::collections::HashSet; 15 | 16 | /// The SQL engine interface 17 | pub trait Engine: Clone { 18 | /// The transaction type 19 | type Transaction: Transaction; 20 | 21 | /// Begins a read-write transaction. 22 | fn begin(&self) -> Result; 23 | 24 | /// Begins a read-only transaction. 25 | fn begin_read_only(&self) -> Result; 26 | 27 | /// Begins a read-only transaction as of a historical version. 
28 | fn begin_as_of(&self, version: u64) -> Result; 29 | 30 | /// Begins a session for executing individual statements 31 | fn session(&self) -> Result> { 32 | Ok(Session { engine: self.clone(), txn: None }) 33 | } 34 | } 35 | 36 | /// An SQL transaction 37 | pub trait Transaction: Catalog { 38 | /// The transaction's version 39 | fn version(&self) -> u64; 40 | /// Whether the transaction is read-only 41 | fn read_only(&self) -> bool; 42 | 43 | /// Commits the transaction 44 | fn commit(self) -> Result<()>; 45 | /// Rolls back the transaction 46 | fn rollback(self) -> Result<()>; 47 | 48 | /// Creates a new table row 49 | fn create(&mut self, table: &str, row: Row) -> Result<()>; 50 | /// Deletes a table row 51 | fn delete(&mut self, table: &str, id: &Value) -> Result<()>; 52 | /// Reads a table row, if it exists 53 | fn read(&self, table: &str, id: &Value) -> Result>; 54 | /// Reads an index entry, if it exists 55 | fn read_index(&self, table: &str, column: &str, value: &Value) -> Result>; 56 | /// Scans a table's rows 57 | fn scan(&self, table: &str, filter: Option) -> Result; 58 | /// Scans a column's index entries 59 | fn scan_index(&self, table: &str, column: &str) -> Result; 60 | /// Updates a table row 61 | fn update(&mut self, table: &str, id: &Value, row: Row) -> Result<()>; 62 | } 63 | 64 | /// An SQL session, which handles transaction control and simplified query execution 65 | pub struct Session { 66 | /// The underlying engine 67 | engine: E, 68 | /// The current session transaction, if any 69 | txn: Option, 70 | } 71 | 72 | impl Session { 73 | /// Executes a query, managing transaction status for the session 74 | pub fn execute(&mut self, query: &str) -> Result { 75 | // FIXME We should match on self.txn as well, but get this error: 76 | // error[E0009]: cannot bind by-move and by-ref in the same pattern 77 | // ...which seems like an arbitrary compiler limitation 78 | match Parser::new(query).parse()? { 79 | ast::Statement::Begin { .. 
} if self.txn.is_some() => { 80 | Err(Error::Value("Already in a transaction".into())) 81 | } 82 | ast::Statement::Begin { read_only: true, as_of: None } => { 83 | let txn = self.engine.begin_read_only()?; 84 | let result = ResultSet::Begin { version: txn.version(), read_only: true }; 85 | self.txn = Some(txn); 86 | Ok(result) 87 | } 88 | ast::Statement::Begin { read_only: true, as_of: Some(version) } => { 89 | let txn = self.engine.begin_as_of(version)?; 90 | let result = ResultSet::Begin { version, read_only: true }; 91 | self.txn = Some(txn); 92 | Ok(result) 93 | } 94 | ast::Statement::Begin { read_only: false, as_of: Some(_) } => { 95 | Err(Error::Value("Can't start read-write transaction in a given version".into())) 96 | } 97 | ast::Statement::Begin { read_only: false, as_of: None } => { 98 | let txn = self.engine.begin()?; 99 | let result = ResultSet::Begin { version: txn.version(), read_only: false }; 100 | self.txn = Some(txn); 101 | Ok(result) 102 | } 103 | ast::Statement::Commit | ast::Statement::Rollback if self.txn.is_none() => { 104 | Err(Error::Value("Not in a transaction".into())) 105 | } 106 | ast::Statement::Commit => { 107 | let txn = self.txn.take().unwrap(); 108 | let version = txn.version(); 109 | txn.commit()?; 110 | Ok(ResultSet::Commit { version }) 111 | } 112 | ast::Statement::Rollback => { 113 | let txn = self.txn.take().unwrap(); 114 | let version = txn.version(); 115 | txn.rollback()?; 116 | Ok(ResultSet::Rollback { version }) 117 | } 118 | ast::Statement::Explain(statement) => self.read_with_txn(|txn| { 119 | Ok(ResultSet::Explain(Plan::build(*statement, txn)?.optimize(txn)?.0)) 120 | }), 121 | statement if self.txn.is_some() => Plan::build(statement, self.txn.as_mut().unwrap())? 122 | .optimize(self.txn.as_mut().unwrap())? 123 | .execute(self.txn.as_mut().unwrap()), 124 | statement @ ast::Statement::Select { .. 
} => { 125 | let mut txn = self.engine.begin_read_only()?; 126 | let result = 127 | Plan::build(statement, &mut txn)?.optimize(&mut txn)?.execute(&mut txn); 128 | txn.rollback()?; 129 | result 130 | } 131 | statement => { 132 | let mut txn = self.engine.begin()?; 133 | match Plan::build(statement, &mut txn)?.optimize(&mut txn)?.execute(&mut txn) { 134 | Ok(result) => { 135 | txn.commit()?; 136 | Ok(result) 137 | } 138 | Err(error) => { 139 | txn.rollback()?; 140 | Err(error) 141 | } 142 | } 143 | } 144 | } 145 | } 146 | 147 | /// Runs a read-only closure in the session's transaction, or a new 148 | /// transaction if none is active. 149 | /// 150 | /// TODO: reconsider this 151 | pub fn read_with_txn(&mut self, f: F) -> Result 152 | where 153 | F: FnOnce(&mut E::Transaction) -> Result, 154 | { 155 | if let Some(ref mut txn) = self.txn { 156 | return f(txn); 157 | } 158 | let mut txn = self.engine.begin_read_only()?; 159 | let result = f(&mut txn); 160 | txn.rollback()?; 161 | result 162 | } 163 | } 164 | 165 | /// A row scan iterator 166 | pub type Scan = Box> + Send>; 167 | 168 | /// An index scan iterator 169 | pub type IndexScan = Box)>> + Send>; 170 | -------------------------------------------------------------------------------- /src/sql/engine/raft.rs: -------------------------------------------------------------------------------- 1 | use super::super::schema::{Catalog, Table, Tables}; 2 | use super::super::types::{Expression, Row, Value}; 3 | use super::{Engine as _, IndexScan, Scan, Transaction as _}; 4 | use crate::error::{Error, Result}; 5 | use crate::raft::{self, Entry}; 6 | use crate::storage::{self, bincode, mvcc::TransactionState}; 7 | 8 | use serde::{de::DeserializeOwned, Deserialize, Serialize}; 9 | use std::collections::HashSet; 10 | use tokio::sync::{mpsc, oneshot}; 11 | 12 | /// A Raft state machine mutation. 13 | /// 14 | /// TODO: use Cows for these. 
15 | #[derive(Clone, Serialize, Deserialize)] 16 | enum Mutation { 17 | /// Begins a transaction 18 | Begin { read_only: bool, as_of: Option }, 19 | /// Commits the given transaction 20 | Commit(TransactionState), 21 | /// Rolls back the given transaction 22 | Rollback(TransactionState), 23 | 24 | /// Creates a new row 25 | Create { txn: TransactionState, table: String, row: Row }, 26 | /// Deletes a row 27 | Delete { txn: TransactionState, table: String, id: Value }, 28 | /// Updates a row 29 | Update { txn: TransactionState, table: String, id: Value, row: Row }, 30 | 31 | /// Creates a table 32 | CreateTable { txn: TransactionState, schema: Table }, 33 | /// Deletes a table 34 | DeleteTable { txn: TransactionState, table: String }, 35 | } 36 | 37 | /// A Raft state machine query. 38 | /// 39 | /// TODO: use Cows for these. 40 | #[derive(Clone, Serialize, Deserialize)] 41 | enum Query { 42 | /// Fetches engine status 43 | Status, 44 | 45 | /// Reads a row 46 | Read { txn: TransactionState, table: String, id: Value }, 47 | /// Reads an index entry 48 | ReadIndex { txn: TransactionState, table: String, column: String, value: Value }, 49 | /// Scans a table's rows 50 | Scan { txn: TransactionState, table: String, filter: Option }, 51 | /// Scans an index 52 | ScanIndex { txn: TransactionState, table: String, column: String }, 53 | 54 | /// Scans the tables 55 | ScanTables { txn: TransactionState }, 56 | /// Reads a table 57 | ReadTable { txn: TransactionState, table: String }, 58 | } 59 | 60 | /// Status for the Raft SQL engine. 61 | #[derive(Clone, Debug, PartialEq, Serialize, Deserialize)] 62 | pub struct Status { 63 | pub raft: raft::Status, 64 | pub mvcc: storage::mvcc::Status, 65 | } 66 | 67 | /// A client for the local Raft node. 68 | #[derive(Clone)] 69 | struct Client { 70 | tx: mpsc::UnboundedSender<(raft::Request, oneshot::Sender>)>, 71 | } 72 | 73 | impl Client { 74 | /// Creates a new Raft client. 
75 | fn new( 76 | tx: mpsc::UnboundedSender<(raft::Request, oneshot::Sender>)>, 77 | ) -> Self { 78 | Self { tx } 79 | } 80 | 81 | /// Executes a request against the Raft cluster. 82 | fn execute(&self, request: raft::Request) -> Result { 83 | let (response_tx, response_rx) = oneshot::channel(); 84 | self.tx.send((request, response_tx))?; 85 | futures::executor::block_on(response_rx)? 86 | } 87 | 88 | /// Mutates the Raft state machine, deserializing the response into the 89 | /// return type. 90 | fn mutate(&self, mutation: Mutation) -> Result { 91 | match self.execute(raft::Request::Mutate(bincode::serialize(&mutation)?))? { 92 | raft::Response::Mutate(response) => Ok(bincode::deserialize(&response)?), 93 | resp => Err(Error::Internal(format!("Unexpected Raft mutation response {:?}", resp))), 94 | } 95 | } 96 | 97 | /// Queries the Raft state machine, deserializing the response into the 98 | /// return type. 99 | fn query(&self, query: Query) -> Result { 100 | match self.execute(raft::Request::Query(bincode::serialize(&query)?))? { 101 | raft::Response::Query(response) => Ok(bincode::deserialize(&response)?), 102 | resp => Err(Error::Internal(format!("Unexpected Raft query response {:?}", resp))), 103 | } 104 | } 105 | 106 | /// Fetches Raft node status. 107 | fn status(&self) -> Result { 108 | match self.execute(raft::Request::Status)? { 109 | raft::Response::Status(status) => Ok(status), 110 | resp => Err(Error::Internal(format!("Unexpected Raft status response {:?}", resp))), 111 | } 112 | } 113 | } 114 | 115 | /// A SQL engine using a Raft state machine. 116 | #[derive(Clone)] 117 | pub struct Raft { 118 | client: Client, 119 | } 120 | 121 | impl Raft { 122 | /// Creates a new Raft-based SQL engine. 123 | pub fn new( 124 | tx: mpsc::UnboundedSender<(raft::Request, oneshot::Sender>)>, 125 | ) -> Self { 126 | Self { client: Client::new(tx) } 127 | } 128 | 129 | /// Creates an underlying state machine for a Raft engine. 
130 | pub fn new_state(engine: E) -> Result> { 131 | State::new(engine) 132 | } 133 | 134 | /// Returns Raft SQL engine status. 135 | pub fn status(&self) -> Result { 136 | Ok(Status { raft: self.client.status()?, mvcc: self.client.query(Query::Status)? }) 137 | } 138 | } 139 | 140 | impl super::Engine for Raft { 141 | type Transaction = Transaction; 142 | 143 | fn begin(&self) -> Result { 144 | Transaction::begin(self.client.clone(), false, None) 145 | } 146 | 147 | fn begin_read_only(&self) -> Result { 148 | Transaction::begin(self.client.clone(), true, None) 149 | } 150 | 151 | fn begin_as_of(&self, version: u64) -> Result { 152 | Transaction::begin(self.client.clone(), true, Some(version)) 153 | } 154 | } 155 | 156 | /// A Raft-based SQL transaction. 157 | #[derive(Clone)] 158 | pub struct Transaction { 159 | client: Client, 160 | state: TransactionState, 161 | } 162 | 163 | impl Transaction { 164 | /// Starts a transaction in the given mode. 165 | fn begin(client: Client, read_only: bool, as_of: Option) -> Result { 166 | let state = client.mutate(Mutation::Begin { read_only, as_of })?; 167 | Ok(Self { client, state }) 168 | } 169 | } 170 | 171 | impl super::Transaction for Transaction { 172 | fn version(&self) -> u64 { 173 | self.state.version 174 | } 175 | 176 | fn read_only(&self) -> bool { 177 | self.state.read_only 178 | } 179 | 180 | fn commit(self) -> Result<()> { 181 | self.client.mutate(Mutation::Commit(self.state.clone())) 182 | } 183 | 184 | fn rollback(self) -> Result<()> { 185 | self.client.mutate(Mutation::Rollback(self.state.clone())) 186 | } 187 | 188 | fn create(&mut self, table: &str, row: Row) -> Result<()> { 189 | self.client.mutate(Mutation::Create { 190 | txn: self.state.clone(), 191 | table: table.to_string(), 192 | row, 193 | }) 194 | } 195 | 196 | fn delete(&mut self, table: &str, id: &Value) -> Result<()> { 197 | self.client.mutate(Mutation::Delete { 198 | txn: self.state.clone(), 199 | table: table.to_string(), 200 | id: id.clone(), 
201 | }) 202 | } 203 | 204 | fn read(&self, table: &str, id: &Value) -> Result> { 205 | self.client.query(Query::Read { 206 | txn: self.state.clone(), 207 | table: table.to_string(), 208 | id: id.clone(), 209 | }) 210 | } 211 | 212 | fn read_index(&self, table: &str, column: &str, value: &Value) -> Result> { 213 | self.client.query(Query::ReadIndex { 214 | txn: self.state.clone(), 215 | table: table.to_string(), 216 | column: column.to_string(), 217 | value: value.clone(), 218 | }) 219 | } 220 | 221 | fn scan(&self, table: &str, filter: Option) -> Result { 222 | Ok(Box::new( 223 | self.client 224 | .query::>(Query::Scan { 225 | txn: self.state.clone(), 226 | table: table.to_string(), 227 | filter, 228 | })? 229 | .into_iter() 230 | .map(Ok), 231 | )) 232 | } 233 | 234 | fn scan_index(&self, table: &str, column: &str) -> Result { 235 | Ok(Box::new( 236 | self.client 237 | .query::>(Query::ScanIndex { 238 | txn: self.state.clone(), 239 | table: table.to_string(), 240 | column: column.to_string(), 241 | })? 
242 | .into_iter() 243 | .map(Ok), 244 | )) 245 | } 246 | 247 | fn update(&mut self, table: &str, id: &Value, row: Row) -> Result<()> { 248 | self.client.mutate(Mutation::Update { 249 | txn: self.state.clone(), 250 | table: table.to_string(), 251 | id: id.clone(), 252 | row, 253 | }) 254 | } 255 | } 256 | 257 | impl Catalog for Transaction { 258 | fn create_table(&mut self, table: Table) -> Result<()> { 259 | self.client.mutate(Mutation::CreateTable { txn: self.state.clone(), schema: table }) 260 | } 261 | 262 | fn delete_table(&mut self, table: &str) -> Result<()> { 263 | self.client 264 | .mutate(Mutation::DeleteTable { txn: self.state.clone(), table: table.to_string() }) 265 | } 266 | 267 | fn read_table(&self, table: &str) -> Result> { 268 | self.client.query(Query::ReadTable { txn: self.state.clone(), table: table.to_string() }) 269 | } 270 | 271 | fn scan_tables(&self) -> Result { 272 | Ok(Box::new( 273 | self.client.query::>(Query::ScanTables { txn: self.state.clone() })?.into_iter(), 274 | )) 275 | } 276 | } 277 | 278 | /// The Raft state machine for the Raft-based SQL engine, using a KV SQL engine 279 | pub struct State { 280 | /// The underlying KV SQL engine 281 | engine: super::KV, 282 | /// The last applied index 283 | applied_index: u64, 284 | } 285 | 286 | impl State { 287 | /// Creates a new Raft state maching using the given storage engine. 288 | pub fn new(engine: E) -> Result { 289 | let engine = super::KV::new(engine); 290 | let applied_index = engine 291 | .get_metadata(b"applied_index")? 292 | .map(|b| bincode::deserialize(&b)) 293 | .unwrap_or(Ok(0))?; 294 | Ok(State { engine, applied_index }) 295 | } 296 | 297 | /// Mutates the state machine. 298 | fn mutate(&mut self, mutation: Mutation) -> Result> { 299 | match mutation { 300 | Mutation::Begin { read_only, as_of } => { 301 | let txn = if !read_only { 302 | self.engine.begin()? 303 | } else if let Some(version) = as_of { 304 | self.engine.begin_as_of(version)? 
305 | } else { 306 | self.engine.begin_read_only()? 307 | }; 308 | bincode::serialize(&txn.state()) 309 | } 310 | Mutation::Commit(txn) => bincode::serialize(&self.engine.resume(txn)?.commit()?), 311 | Mutation::Rollback(txn) => bincode::serialize(&self.engine.resume(txn)?.rollback()?), 312 | 313 | Mutation::Create { txn, table, row } => { 314 | bincode::serialize(&self.engine.resume(txn)?.create(&table, row)?) 315 | } 316 | Mutation::Delete { txn, table, id } => { 317 | bincode::serialize(&self.engine.resume(txn)?.delete(&table, &id)?) 318 | } 319 | Mutation::Update { txn, table, id, row } => { 320 | bincode::serialize(&self.engine.resume(txn)?.update(&table, &id, row)?) 321 | } 322 | 323 | Mutation::CreateTable { txn, schema } => { 324 | bincode::serialize(&self.engine.resume(txn)?.create_table(schema)?) 325 | } 326 | Mutation::DeleteTable { txn, table } => { 327 | bincode::serialize(&self.engine.resume(txn)?.delete_table(&table)?) 328 | } 329 | } 330 | } 331 | } 332 | 333 | impl raft::State for State { 334 | fn get_applied_index(&self) -> u64 { 335 | self.applied_index 336 | } 337 | 338 | fn apply(&mut self, entry: Entry) -> Result> { 339 | assert_eq!(entry.index, self.applied_index + 1, "entry index not after applied index"); 340 | 341 | let result = match &entry.command { 342 | Some(command) => match self.mutate(bincode::deserialize(command)?) { 343 | error @ Err(Error::Internal(_)) => return error, // don't record as applied 344 | result => result, 345 | }, 346 | None => Ok(Vec::new()), 347 | }; 348 | self.applied_index = entry.index; 349 | self.engine.set_metadata(b"applied_index", bincode::serialize(&entry.index)?)?; 350 | result 351 | } 352 | 353 | fn query(&self, command: Vec) -> Result> { 354 | match bincode::deserialize(&command)? { 355 | Query::Read { txn, table, id } => { 356 | bincode::serialize(&self.engine.resume(txn)?.read(&table, &id)?) 
357 | } 358 | Query::ReadIndex { txn, table, column, value } => { 359 | bincode::serialize(&self.engine.resume(txn)?.read_index(&table, &column, &value)?) 360 | } 361 | // FIXME These need to stream rows somehow 362 | Query::Scan { txn, table, filter } => bincode::serialize( 363 | &self.engine.resume(txn)?.scan(&table, filter)?.collect::>>()?, 364 | ), 365 | Query::ScanIndex { txn, table, column } => bincode::serialize( 366 | &self 367 | .engine 368 | .resume(txn)? 369 | .scan_index(&table, &column)? 370 | .collect::>>()?, 371 | ), 372 | Query::Status => bincode::serialize(&self.engine.kv.status()?), 373 | 374 | Query::ReadTable { txn, table } => { 375 | bincode::serialize(&self.engine.resume(txn)?.read_table(&table)?) 376 | } 377 | Query::ScanTables { txn } => { 378 | bincode::serialize(&self.engine.resume(txn)?.scan_tables()?.collect::>()) 379 | } 380 | } 381 | } 382 | } 383 | -------------------------------------------------------------------------------- /src/sql/execution/aggregation.rs: -------------------------------------------------------------------------------- 1 | use super::super::engine::Transaction; 2 | use super::super::plan::Aggregate; 3 | use super::super::types::{Column, Value}; 4 | use super::{Executor, ResultSet}; 5 | use crate::error::{Error, Result}; 6 | 7 | use std::cmp::Ordering; 8 | use std::collections::HashMap; 9 | 10 | /// An aggregation executor 11 | pub struct Aggregation { 12 | source: Box>, 13 | aggregates: Vec, 14 | accumulators: HashMap, Vec>>, 15 | } 16 | 17 | impl Aggregation { 18 | pub fn new(source: Box>, aggregates: Vec) -> Box { 19 | Box::new(Self { source, aggregates, accumulators: HashMap::new() }) 20 | } 21 | } 22 | 23 | impl Executor for Aggregation { 24 | #[allow(clippy::or_fun_call)] 25 | fn execute(mut self: Box, txn: &mut T) -> Result { 26 | let agg_count = self.aggregates.len(); 27 | match self.source.execute(txn)? 
{ 28 | ResultSet::Query { columns, mut rows } => { 29 | while let Some(mut row) = rows.next().transpose()? { 30 | self.accumulators 31 | .entry(row.split_off(self.aggregates.len())) 32 | .or_insert(self.aggregates.iter().map(::from).collect()) 33 | .iter_mut() 34 | .zip(row) 35 | .try_for_each(|(acc, value)| acc.accumulate(&value))? 36 | } 37 | // If there were no rows and no group-by columns, return a row of empty accumulators: 38 | // SELECT COUNT(*) FROM t WHERE FALSE 39 | if self.accumulators.is_empty() && self.aggregates.len() == columns.len() { 40 | self.accumulators.insert( 41 | Vec::new(), 42 | self.aggregates.iter().map(::from).collect(), 43 | ); 44 | } 45 | Ok(ResultSet::Query { 46 | columns: columns 47 | .into_iter() 48 | .enumerate() 49 | .map(|(i, c)| if i < agg_count { Column { name: None } } else { c }) 50 | .collect(), 51 | rows: Box::new(self.accumulators.into_iter().map(|(bucket, accs)| { 52 | Ok(accs 53 | .into_iter() 54 | .map(|acc| acc.aggregate()) 55 | .chain(bucket.into_iter()) 56 | .collect()) 57 | })), 58 | }) 59 | } 60 | r => Err(Error::Internal(format!("Unexpected result {:?}", r))), 61 | } 62 | } 63 | } 64 | 65 | // An accumulator 66 | pub trait Accumulator: std::fmt::Debug + Send { 67 | // Accumulates a value 68 | fn accumulate(&mut self, value: &Value) -> Result<()>; 69 | 70 | // Calculates a final aggregate 71 | fn aggregate(&self) -> Value; 72 | } 73 | 74 | impl dyn Accumulator { 75 | fn from(aggregate: &Aggregate) -> Box { 76 | match aggregate { 77 | Aggregate::Average => Box::new(Average::new()), 78 | Aggregate::Count => Box::new(Count::new()), 79 | Aggregate::Max => Box::new(Max::new()), 80 | Aggregate::Min => Box::new(Min::new()), 81 | Aggregate::Sum => Box::new(Sum::new()), 82 | } 83 | } 84 | } 85 | 86 | // Count non-null values 87 | #[derive(Debug)] 88 | pub struct Count { 89 | count: u64, 90 | } 91 | 92 | impl Count { 93 | pub fn new() -> Self { 94 | Self { count: 0 } 95 | } 96 | } 97 | 98 | impl Accumulator for Count { 99 | 
fn accumulate(&mut self, value: &Value) -> Result<()> { 100 | match value { 101 | Value::Null => {} 102 | _ => self.count += 1, 103 | } 104 | Ok(()) 105 | } 106 | 107 | fn aggregate(&self) -> Value { 108 | Value::Integer(self.count as i64) 109 | } 110 | } 111 | 112 | // Average value 113 | #[derive(Debug)] 114 | pub struct Average { 115 | count: Count, 116 | sum: Sum, 117 | } 118 | 119 | impl Average { 120 | pub fn new() -> Self { 121 | Self { count: Count::new(), sum: Sum::new() } 122 | } 123 | } 124 | 125 | impl Accumulator for Average { 126 | fn accumulate(&mut self, value: &Value) -> Result<()> { 127 | self.count.accumulate(value)?; 128 | self.sum.accumulate(value)?; 129 | Ok(()) 130 | } 131 | 132 | fn aggregate(&self) -> Value { 133 | match (self.sum.aggregate(), self.count.aggregate()) { 134 | (Value::Integer(s), Value::Integer(c)) => Value::Integer(s / c), 135 | (Value::Float(s), Value::Integer(c)) => Value::Float(s / c as f64), 136 | _ => Value::Null, 137 | } 138 | } 139 | } 140 | 141 | // Maximum value 142 | #[derive(Debug)] 143 | pub struct Max { 144 | max: Option, 145 | } 146 | 147 | impl Max { 148 | pub fn new() -> Self { 149 | Self { max: None } 150 | } 151 | } 152 | 153 | impl Accumulator for Max { 154 | fn accumulate(&mut self, value: &Value) -> Result<()> { 155 | if let Some(max) = &mut self.max { 156 | match value.partial_cmp(max) { 157 | _ if max.datatype() != value.datatype() => *max = Value::Null, 158 | None => *max = Value::Null, 159 | Some(Ordering::Greater) => *max = value.clone(), 160 | Some(Ordering::Equal) | Some(Ordering::Less) => {} 161 | }; 162 | } else { 163 | self.max = Some(value.clone()) 164 | } 165 | Ok(()) 166 | } 167 | 168 | fn aggregate(&self) -> Value { 169 | match &self.max { 170 | Some(value) => value.clone(), 171 | None => Value::Null, 172 | } 173 | } 174 | } 175 | 176 | // Minimum value 177 | #[derive(Debug)] 178 | pub struct Min { 179 | min: Option, 180 | } 181 | 182 | impl Min { 183 | pub fn new() -> Self { 184 | Self { 
min: None } 185 | } 186 | } 187 | 188 | impl Accumulator for Min { 189 | fn accumulate(&mut self, value: &Value) -> Result<()> { 190 | if let Some(min) = &mut self.min { 191 | match value.partial_cmp(min) { 192 | _ if min.datatype() != value.datatype() => *min = Value::Null, 193 | None => *min = Value::Null, 194 | Some(Ordering::Less) => *min = value.clone(), 195 | Some(Ordering::Equal) | Some(Ordering::Greater) => {} 196 | }; 197 | } else { 198 | self.min = Some(value.clone()) 199 | } 200 | Ok(()) 201 | } 202 | 203 | fn aggregate(&self) -> Value { 204 | match &self.min { 205 | Some(value) => value.clone(), 206 | None => Value::Null, 207 | } 208 | } 209 | } 210 | 211 | // Sum of values 212 | #[derive(Debug)] 213 | pub struct Sum { 214 | sum: Option, 215 | } 216 | 217 | impl Sum { 218 | pub fn new() -> Self { 219 | Self { sum: None } 220 | } 221 | } 222 | 223 | impl Accumulator for Sum { 224 | fn accumulate(&mut self, value: &Value) -> Result<()> { 225 | self.sum = match (&self.sum, value) { 226 | (Some(Value::Integer(s)), Value::Integer(i)) => Some(Value::Integer(s + i)), 227 | (Some(Value::Float(s)), Value::Float(f)) => Some(Value::Float(s + f)), 228 | (None, Value::Integer(i)) => Some(Value::Integer(*i)), 229 | (None, Value::Float(f)) => Some(Value::Float(*f)), 230 | _ => Some(Value::Null), 231 | }; 232 | Ok(()) 233 | } 234 | 235 | fn aggregate(&self) -> Value { 236 | match &self.sum { 237 | Some(value) => value.clone(), 238 | None => Value::Null, 239 | } 240 | } 241 | } 242 | -------------------------------------------------------------------------------- /src/sql/execution/join.rs: -------------------------------------------------------------------------------- 1 | use super::super::engine::Transaction; 2 | use super::super::types::{Expression, Rows}; 3 | use super::{Executor, ResultSet, Row, Value}; 4 | use crate::error::{Error, Result}; 5 | 6 | use std::collections::HashMap; 7 | 8 | /// A nested loop join executor, which checks each row in the left source 
against every row in 9 | /// the right source using the given predicate. 10 | pub struct NestedLoopJoin { 11 | left: Box>, 12 | right: Box>, 13 | predicate: Option, 14 | outer: bool, 15 | } 16 | 17 | impl NestedLoopJoin { 18 | pub fn new( 19 | left: Box>, 20 | right: Box>, 21 | predicate: Option, 22 | outer: bool, 23 | ) -> Box { 24 | Box::new(Self { left, right, predicate, outer }) 25 | } 26 | } 27 | 28 | impl Executor for NestedLoopJoin { 29 | fn execute(self: Box, txn: &mut T) -> Result { 30 | if let ResultSet::Query { mut columns, rows } = self.left.execute(txn)? { 31 | if let ResultSet::Query { columns: rcolumns, rows: rrows } = self.right.execute(txn)? { 32 | let right_width = rcolumns.len(); 33 | columns.extend(rcolumns); 34 | // FIXME Since making the iterators or sources clonable is non-trivial (requiring 35 | // either avoiding Rust standard iterators or making sources generic), we simply 36 | // fetch the entire right result as a vector. 37 | return Ok(ResultSet::Query { 38 | rows: Box::new(NestedLoopRows::new( 39 | rows, 40 | rrows.collect::>>()?, 41 | right_width, 42 | self.predicate, 43 | self.outer, 44 | )), 45 | columns, 46 | }); 47 | } 48 | } 49 | Err(Error::Internal("Unexpected result set".into())) 50 | } 51 | } 52 | 53 | struct NestedLoopRows { 54 | left: Rows, 55 | left_row: Option>, 56 | right: Box + Send>, 57 | right_vec: Vec, 58 | right_empty: Vec, 59 | right_hit: bool, 60 | predicate: Option, 61 | outer: bool, 62 | } 63 | 64 | impl NestedLoopRows { 65 | fn new( 66 | mut left: Rows, 67 | right: Vec, 68 | right_width: usize, 69 | predicate: Option, 70 | outer: bool, 71 | ) -> Self { 72 | Self { 73 | left_row: left.next(), 74 | left, 75 | right: Box::new(right.clone().into_iter()), 76 | right_vec: right, 77 | right_empty: std::iter::repeat(Value::Null).take(right_width).collect(), 78 | right_hit: false, 79 | predicate, 80 | outer, 81 | } 82 | } 83 | 84 | // Tries to get the next joined row, with error handling. 
85 | fn try_next(&mut self) -> Result> { 86 | // While there is a valid left row, look for a right-hand match to return. 87 | while let Some(Ok(left_row)) = self.left_row.clone() { 88 | // If there is a hit in the remaining right rows, return it. 89 | if let Some(row) = self.try_next_hit(&left_row)? { 90 | self.right_hit = true; 91 | return Ok(Some(row)); 92 | } 93 | 94 | // Otherwise, continue with the next left row and reset the right source. 95 | self.left_row = self.left.next(); 96 | self.right = Box::new(self.right_vec.clone().into_iter()); 97 | 98 | // If this is an outer join, when we reach the end of the right items without a hit, 99 | // we should return a row with nulls for the right fields. 100 | if self.outer && !self.right_hit { 101 | let mut row = left_row; 102 | row.extend(self.right_empty.clone()); 103 | return Ok(Some(row)); 104 | } 105 | self.right_hit = false; 106 | } 107 | self.left_row.clone().transpose() 108 | } 109 | 110 | /// Tries to find the next combined row that matches the predicate in the remaining right rows. 111 | fn try_next_hit(&mut self, left_row: &[Value]) -> Result> { 112 | for right_row in &mut self.right { 113 | let mut row = left_row.to_vec(); 114 | row.extend(right_row); 115 | if let Some(predicate) = &self.predicate { 116 | match predicate.evaluate(Some(&row))? 
{ 117 | Value::Boolean(true) => return Ok(Some(row)), 118 | Value::Boolean(false) => {} 119 | Value::Null => {} 120 | value => { 121 | return Err(Error::Value(format!( 122 | "Join predicate returned {}, expected boolean", 123 | value 124 | ))) 125 | } 126 | } 127 | } else { 128 | return Ok(Some(row)); 129 | } 130 | } 131 | Ok(None) 132 | } 133 | } 134 | 135 | impl Iterator for NestedLoopRows { 136 | type Item = Result; 137 | 138 | fn next(&mut self) -> Option { 139 | self.try_next().transpose() 140 | } 141 | } 142 | 143 | /// A hash join executor 144 | pub struct HashJoin { 145 | left: Box>, 146 | left_field: usize, 147 | right: Box>, 148 | right_field: usize, 149 | outer: bool, 150 | } 151 | 152 | impl HashJoin { 153 | pub fn new( 154 | left: Box>, 155 | left_field: usize, 156 | right: Box>, 157 | right_field: usize, 158 | outer: bool, 159 | ) -> Box { 160 | Box::new(Self { left, left_field, right, right_field, outer }) 161 | } 162 | } 163 | 164 | impl Executor for HashJoin { 165 | fn execute(self: Box, txn: &mut T) -> Result { 166 | if let ResultSet::Query { mut columns, rows } = self.left.execute(txn)? { 167 | if let ResultSet::Query { columns: rcolumns, rows: rrows } = self.right.execute(txn)? 
{ 168 | let (l, r, outer) = (self.left_field, self.right_field, self.outer); 169 | let right: HashMap = rrows 170 | .map(|res| match res { 171 | Ok(row) if row.len() <= r => { 172 | Err(Error::Internal(format!("Right index {} out of bounds", r))) 173 | } 174 | Ok(row) => Ok((row[r].clone(), row)), 175 | Err(err) => Err(err), 176 | }) 177 | .collect::>()?; 178 | let empty = std::iter::repeat(Value::Null).take(rcolumns.len()); 179 | columns.extend(rcolumns); 180 | let rows = Box::new(rows.filter_map(move |res| match res { 181 | Ok(row) if row.len() <= l => { 182 | Some(Err(Error::Value(format!("Left index {} out of bounds", l)))) 183 | } 184 | Ok(mut row) => match right.get(&row[l]) { 185 | Some(hit) => { 186 | row.extend(hit.clone()); 187 | Some(Ok(row)) 188 | } 189 | None if outer => { 190 | row.extend(empty.clone()); 191 | Some(Ok(row)) 192 | } 193 | None => None, 194 | }, 195 | Err(err) => Some(Err(err)), 196 | })); 197 | return Ok(ResultSet::Query { columns, rows }); 198 | } 199 | } 200 | Err(Error::Internal("Unexpected result set".into())) 201 | } 202 | } 203 | -------------------------------------------------------------------------------- /src/sql/execution/mod.rs: -------------------------------------------------------------------------------- 1 | mod aggregation; 2 | mod join; 3 | mod mutation; 4 | mod query; 5 | mod schema; 6 | mod source; 7 | 8 | use aggregation::Aggregation; 9 | use join::{HashJoin, NestedLoopJoin}; 10 | use mutation::{Delete, Insert, Update}; 11 | use query::{Filter, Limit, Offset, Order, Projection}; 12 | use schema::{CreateTable, DropTable}; 13 | use source::{IndexLookup, KeyLookup, Nothing, Scan}; 14 | 15 | use super::engine::Transaction; 16 | use super::plan::Node; 17 | use super::types::{Columns, Row, Rows, Value}; 18 | use crate::error::{Error, Result}; 19 | 20 | use derivative::Derivative; 21 | use serde_derive::{Deserialize, Serialize}; 22 | 23 | /// A plan executor 24 | pub trait Executor { 25 | /// Executes the executor, 
consuming it and returning a result set 26 | fn execute(self: Box, txn: &mut T) -> Result; 27 | } 28 | 29 | impl dyn Executor { 30 | /// Builds an executor for a plan node, consuming it 31 | pub fn build(node: Node) -> Box> { 32 | match node { 33 | Node::Aggregation { source, aggregates } => { 34 | Aggregation::new(Self::build(*source), aggregates) 35 | } 36 | Node::CreateTable { schema } => CreateTable::new(schema), 37 | Node::Delete { table, source } => Delete::new(table, Self::build(*source)), 38 | Node::DropTable { table } => DropTable::new(table), 39 | Node::Filter { source, predicate } => Filter::new(Self::build(*source), predicate), 40 | Node::HashJoin { left, left_field, right, right_field, outer } => HashJoin::new( 41 | Self::build(*left), 42 | left_field.0, 43 | Self::build(*right), 44 | right_field.0, 45 | outer, 46 | ), 47 | Node::IndexLookup { table, alias: _, column, values } => { 48 | IndexLookup::new(table, column, values) 49 | } 50 | Node::Insert { table, columns, expressions } => { 51 | Insert::new(table, columns, expressions) 52 | } 53 | Node::KeyLookup { table, alias: _, keys } => KeyLookup::new(table, keys), 54 | Node::Limit { source, limit } => Limit::new(Self::build(*source), limit), 55 | Node::NestedLoopJoin { left, left_size: _, right, predicate, outer } => { 56 | NestedLoopJoin::new(Self::build(*left), Self::build(*right), predicate, outer) 57 | } 58 | Node::Nothing => Nothing::new(), 59 | Node::Offset { source, offset } => Offset::new(Self::build(*source), offset), 60 | Node::Order { source, orders } => Order::new(Self::build(*source), orders), 61 | Node::Projection { source, expressions } => { 62 | Projection::new(Self::build(*source), expressions) 63 | } 64 | Node::Scan { table, filter, alias: _ } => Scan::new(table, filter), 65 | Node::Update { table, source, expressions } => Update::new( 66 | table, 67 | Self::build(*source), 68 | expressions.into_iter().map(|(i, _, e)| (i, e)).collect(), 69 | ), 70 | } 71 | } 72 | } 73 | 74 | /// An 
executor result set 75 | #[derive(Derivative, Serialize, Deserialize)] 76 | #[derivative(Debug, PartialEq)] 77 | pub enum ResultSet { 78 | // Transaction started 79 | Begin { 80 | version: u64, 81 | read_only: bool, 82 | }, 83 | // Transaction committed 84 | Commit { 85 | version: u64, 86 | }, 87 | // Transaction rolled back 88 | Rollback { 89 | version: u64, 90 | }, 91 | // Rows created 92 | Create { 93 | count: u64, 94 | }, 95 | // Rows deleted 96 | Delete { 97 | count: u64, 98 | }, 99 | // Rows updated 100 | Update { 101 | count: u64, 102 | }, 103 | // Table created 104 | CreateTable { 105 | name: String, 106 | }, 107 | // Table dropped 108 | DropTable { 109 | name: String, 110 | }, 111 | // Query result 112 | Query { 113 | columns: Columns, 114 | #[derivative(Debug = "ignore")] 115 | #[derivative(PartialEq = "ignore")] 116 | #[serde(skip, default = "ResultSet::empty_rows")] 117 | rows: Rows, 118 | }, 119 | // Explain result 120 | Explain(Node), 121 | } 122 | 123 | impl ResultSet { 124 | /// Creates an empty row iterator, for use by serde(default). 125 | fn empty_rows() -> Rows { 126 | Box::new(std::iter::empty()) 127 | } 128 | 129 | /// Converts the ResultSet into a row, or errors if not a query result with rows. 130 | pub fn into_row(self) -> Result { 131 | if let ResultSet::Query { mut rows, .. } = self { 132 | rows.next().transpose()?.ok_or_else(|| Error::Value("No rows returned".into())) 133 | } else { 134 | Err(Error::Value(format!("Not a query result: {:?}", self))) 135 | } 136 | } 137 | 138 | /// Converts the ResultSet into a value, if possible. 
139 | pub fn into_value(self) -> Result { 140 | self.into_row()?.into_iter().next().ok_or_else(|| Error::Value("No value returned".into())) 141 | } 142 | } 143 | -------------------------------------------------------------------------------- /src/sql/execution/mutation.rs: -------------------------------------------------------------------------------- 1 | use super::super::engine::Transaction; 2 | use super::super::schema::Table; 3 | use super::super::types::{Expression, Row, Value}; 4 | use super::{Executor, ResultSet}; 5 | use crate::error::{Error, Result}; 6 | 7 | use std::collections::{HashMap, HashSet}; 8 | 9 | /// An INSERT executor 10 | pub struct Insert { 11 | table: String, 12 | columns: Vec, 13 | rows: Vec>, 14 | } 15 | 16 | impl Insert { 17 | pub fn new(table: String, columns: Vec, rows: Vec>) -> Box { 18 | Box::new(Self { table, columns, rows }) 19 | } 20 | 21 | // Builds a row from a set of column names and values, padding it with default values. 22 | pub fn make_row(table: &Table, columns: &[String], values: Vec) -> Result { 23 | if columns.len() != values.len() { 24 | return Err(Error::Value("Column and value counts do not match".into())); 25 | } 26 | let mut inputs = HashMap::new(); 27 | for (c, v) in columns.iter().zip(values.into_iter()) { 28 | table.get_column(c)?; 29 | if inputs.insert(c.clone(), v).is_some() { 30 | return Err(Error::Value(format!("Column {} given multiple times", c))); 31 | } 32 | } 33 | let mut row = Row::new(); 34 | for column in table.columns.iter() { 35 | if let Some(value) = inputs.get(&column.name) { 36 | row.push(value.clone()) 37 | } else if let Some(value) = &column.default { 38 | row.push(value.clone()) 39 | } else { 40 | return Err(Error::Value(format!("No value given for column {}", column.name))); 41 | } 42 | } 43 | Ok(row) 44 | } 45 | 46 | /// Pads a row with default values where possible. 
47 | fn pad_row(table: &Table, mut row: Row) -> Result { 48 | for column in table.columns.iter().skip(row.len()) { 49 | if let Some(default) = &column.default { 50 | row.push(default.clone()) 51 | } else { 52 | return Err(Error::Value(format!("No default value for column {}", column.name))); 53 | } 54 | } 55 | Ok(row) 56 | } 57 | } 58 | 59 | impl Executor for Insert { 60 | fn execute(self: Box, txn: &mut T) -> Result { 61 | let table = txn.must_read_table(&self.table)?; 62 | let mut count = 0; 63 | for expressions in self.rows { 64 | let mut row = 65 | expressions.into_iter().map(|expr| expr.evaluate(None)).collect::>()?; 66 | if self.columns.is_empty() { 67 | row = Self::pad_row(&table, row)?; 68 | } else { 69 | row = Self::make_row(&table, &self.columns, row)?; 70 | } 71 | txn.create(&table.name, row)?; 72 | count += 1; 73 | } 74 | Ok(ResultSet::Create { count }) 75 | } 76 | } 77 | 78 | /// An UPDATE executor 79 | pub struct Update { 80 | table: String, 81 | source: Box>, 82 | expressions: Vec<(usize, Expression)>, 83 | } 84 | 85 | impl Update { 86 | pub fn new( 87 | table: String, 88 | source: Box>, 89 | expressions: Vec<(usize, Expression)>, 90 | ) -> Box { 91 | Box::new(Self { table, source, expressions }) 92 | } 93 | } 94 | 95 | impl Executor for Update { 96 | fn execute(self: Box, txn: &mut T) -> Result { 97 | match self.source.execute(txn)? { 98 | ResultSet::Query { mut rows, .. } => { 99 | let table = txn.must_read_table(&self.table)?; 100 | 101 | // The iterator will see our changes, such that the same item may be iterated over 102 | // multiple times. We keep track of the primary keys here to avoid that, althought 103 | // it may cause ballooning memory usage for large updates. 104 | // 105 | // FIXME This is not safe for primary key updates, which may still be processed 106 | // multiple times - it should be possible to come up with a pathological case that 107 | // loops forever (e.g. UPDATE test SET id = id + 1). 
108 | let mut updated = HashSet::new(); 109 | while let Some(row) = rows.next().transpose()? { 110 | let id = table.get_row_key(&row)?; 111 | if updated.contains(&id) { 112 | continue; 113 | } 114 | let mut new = row.clone(); 115 | for (field, expr) in &self.expressions { 116 | new[*field] = expr.evaluate(Some(&row))?; 117 | } 118 | txn.update(&table.name, &id, new)?; 119 | updated.insert(id); 120 | } 121 | Ok(ResultSet::Update { count: updated.len() as u64 }) 122 | } 123 | r => Err(Error::Internal(format!("Unexpected response {:?}", r))), 124 | } 125 | } 126 | } 127 | 128 | /// A DELETE executor 129 | pub struct Delete { 130 | table: String, 131 | source: Box>, 132 | } 133 | 134 | impl Delete { 135 | pub fn new(table: String, source: Box>) -> Box { 136 | Box::new(Self { table, source }) 137 | } 138 | } 139 | 140 | impl Executor for Delete { 141 | fn execute(self: Box, txn: &mut T) -> Result { 142 | let table = txn.must_read_table(&self.table)?; 143 | let mut count = 0; 144 | match self.source.execute(txn)? { 145 | ResultSet::Query { mut rows, .. } => { 146 | while let Some(row) = rows.next().transpose()? 
{ 147 | txn.delete(&table.name, &table.get_row_key(&row)?)?; 148 | count += 1 149 | } 150 | Ok(ResultSet::Delete { count }) 151 | } 152 | r => Err(Error::Internal(format!("Unexpected result {:?}", r))), 153 | } 154 | } 155 | } 156 | -------------------------------------------------------------------------------- /src/sql/execution/query.rs: -------------------------------------------------------------------------------- 1 | use super::super::engine::Transaction; 2 | use super::super::plan::Direction; 3 | use super::super::types::{Column, Expression, Row, Value}; 4 | use super::{Executor, ResultSet}; 5 | use crate::error::{Error, Result}; 6 | 7 | /// A filter executor 8 | pub struct Filter { 9 | source: Box>, 10 | predicate: Expression, 11 | } 12 | 13 | impl Filter { 14 | pub fn new(source: Box>, predicate: Expression) -> Box { 15 | Box::new(Self { source, predicate }) 16 | } 17 | } 18 | 19 | impl Executor for Filter { 20 | fn execute(self: Box, txn: &mut T) -> Result { 21 | if let ResultSet::Query { columns, rows } = self.source.execute(txn)? { 22 | let predicate = self.predicate; 23 | Ok(ResultSet::Query { 24 | columns, 25 | rows: Box::new(rows.filter_map(move |r| { 26 | r.and_then(|row| match predicate.evaluate(Some(&row))? 
{ 27 | Value::Boolean(true) => Ok(Some(row)), 28 | Value::Boolean(false) => Ok(None), 29 | Value::Null => Ok(None), 30 | value => Err(Error::Value(format!( 31 | "Filter returned {}, expected boolean", 32 | value 33 | ))), 34 | }) 35 | .transpose() 36 | })), 37 | }) 38 | } else { 39 | Err(Error::Internal("Unexpected result".into())) 40 | } 41 | } 42 | } 43 | 44 | /// A projection executor 45 | pub struct Projection { 46 | source: Box>, 47 | expressions: Vec<(Expression, Option)>, 48 | } 49 | 50 | impl Projection { 51 | pub fn new( 52 | source: Box>, 53 | expressions: Vec<(Expression, Option)>, 54 | ) -> Box { 55 | Box::new(Self { source, expressions }) 56 | } 57 | } 58 | 59 | impl Executor for Projection { 60 | fn execute(self: Box, txn: &mut T) -> Result { 61 | if let ResultSet::Query { columns, rows } = self.source.execute(txn)? { 62 | let (expressions, labels): (Vec, Vec>) = 63 | self.expressions.into_iter().unzip(); 64 | let columns = expressions 65 | .iter() 66 | .enumerate() 67 | .map(|(i, e)| { 68 | if let Some(Some(label)) = labels.get(i) { 69 | Column { name: Some(label.clone()) } 70 | } else if let Expression::Field(i, _) = e { 71 | columns.get(*i).cloned().unwrap_or(Column { name: None }) 72 | } else { 73 | Column { name: None } 74 | } 75 | }) 76 | .collect(); 77 | let rows = Box::new(rows.map(move |r| { 78 | r.and_then(|row| { 79 | expressions.iter().map(|e| e.evaluate(Some(&row))).collect::>() 80 | }) 81 | })); 82 | Ok(ResultSet::Query { columns, rows }) 83 | } else { 84 | Err(Error::Internal("Unexpected result".into())) 85 | } 86 | } 87 | } 88 | 89 | /// An ORDER BY executor 90 | pub struct Order { 91 | source: Box>, 92 | order: Vec<(Expression, Direction)>, 93 | } 94 | 95 | impl Order { 96 | pub fn new(source: Box>, order: Vec<(Expression, Direction)>) -> Box { 97 | Box::new(Self { source, order }) 98 | } 99 | } 100 | 101 | impl Executor for Order { 102 | fn execute(self: Box, txn: &mut T) -> Result { 103 | match self.source.execute(txn)? 
{ 104 | ResultSet::Query { columns, mut rows } => { 105 | // FIXME Since we can't return errors from the sort_by closure, we have to 106 | // pre-evaluate all values. This means that we can't short-circuit evaluation, 107 | // and have to temporarily store evaluated values, which is bad for performance 108 | // and memory usage respectively 109 | struct Item { 110 | row: Row, 111 | values: Vec, 112 | } 113 | 114 | let mut items = Vec::new(); 115 | while let Some(row) = rows.next().transpose()? { 116 | let mut values = Vec::new(); 117 | for (expr, _) in self.order.iter() { 118 | values.push(expr.evaluate(Some(&row))?); 119 | } 120 | items.push(Item { row, values }) 121 | } 122 | 123 | let order = &self.order; 124 | items.sort_by(|a, b| { 125 | for (i, (_, order)) in order.iter().enumerate() { 126 | let value_a = &a.values[i]; 127 | let value_b = &b.values[i]; 128 | match value_a.partial_cmp(value_b) { 129 | Some(std::cmp::Ordering::Equal) => {} 130 | Some(o) => { 131 | return if *order == Direction::Ascending { o } else { o.reverse() } 132 | } 133 | None => {} 134 | } 135 | } 136 | std::cmp::Ordering::Equal 137 | }); 138 | 139 | Ok(ResultSet::Query { 140 | columns, 141 | rows: Box::new(items.into_iter().map(|i| Ok(i.row))), 142 | }) 143 | } 144 | r => Err(Error::Internal(format!("Unexpected result {:?}", r))), 145 | } 146 | } 147 | } 148 | 149 | /// A LIMIT executor 150 | pub struct Limit { 151 | source: Box>, 152 | limit: u64, 153 | } 154 | 155 | impl Limit { 156 | pub fn new(source: Box>, limit: u64) -> Box { 157 | Box::new(Self { source, limit }) 158 | } 159 | } 160 | 161 | impl Executor for Limit { 162 | fn execute(self: Box, txn: &mut T) -> Result { 163 | if let ResultSet::Query { columns, rows } = self.source.execute(txn)? 
{ 164 | Ok(ResultSet::Query { columns, rows: Box::new(rows.take(self.limit as usize)) }) 165 | } else { 166 | Err(Error::Internal("Unexpected result".into())) 167 | } 168 | } 169 | } 170 | 171 | /// An OFFSET executor 172 | pub struct Offset { 173 | source: Box>, 174 | offset: u64, 175 | } 176 | 177 | impl Offset { 178 | pub fn new(source: Box>, offset: u64) -> Box { 179 | Box::new(Self { source, offset }) 180 | } 181 | } 182 | 183 | impl Executor for Offset { 184 | fn execute(self: Box, txn: &mut T) -> Result { 185 | if let ResultSet::Query { columns, rows } = self.source.execute(txn)? { 186 | Ok(ResultSet::Query { columns, rows: Box::new(rows.skip(self.offset as usize)) }) 187 | } else { 188 | Err(Error::Internal("Unexpected result".into())) 189 | } 190 | } 191 | } 192 | -------------------------------------------------------------------------------- /src/sql/execution/schema.rs: -------------------------------------------------------------------------------- 1 | use super::super::engine::Transaction; 2 | use super::super::schema::Table; 3 | use super::{Executor, ResultSet}; 4 | use crate::error::Result; 5 | 6 | /// A CREATE TABLE executor 7 | pub struct CreateTable { 8 | table: Table, 9 | } 10 | 11 | impl CreateTable { 12 | pub fn new(table: Table) -> Box { 13 | Box::new(Self { table }) 14 | } 15 | } 16 | 17 | impl Executor for CreateTable { 18 | fn execute(self: Box, txn: &mut T) -> Result { 19 | let name = self.table.name.clone(); 20 | txn.create_table(self.table)?; 21 | Ok(ResultSet::CreateTable { name }) 22 | } 23 | } 24 | 25 | /// A DROP TABLE executor 26 | pub struct DropTable { 27 | table: String, 28 | } 29 | 30 | impl DropTable { 31 | pub fn new(table: String) -> Box { 32 | Box::new(Self { table }) 33 | } 34 | } 35 | 36 | impl Executor for DropTable { 37 | fn execute(self: Box, txn: &mut T) -> Result { 38 | txn.delete_table(&self.table)?; 39 | Ok(ResultSet::DropTable { name: self.table }) 40 | } 41 | } 42 | 
-------------------------------------------------------------------------------- /src/sql/execution/source.rs: -------------------------------------------------------------------------------- 1 | use super::super::engine::Transaction; 2 | use super::super::types::{Column, Expression, Row, Value}; 3 | use super::{Executor, ResultSet}; 4 | use crate::error::Result; 5 | 6 | use std::collections::HashSet; 7 | 8 | /// A table scan executor 9 | pub struct Scan { 10 | table: String, 11 | filter: Option, 12 | } 13 | 14 | impl Scan { 15 | pub fn new(table: String, filter: Option) -> Box { 16 | Box::new(Self { table, filter }) 17 | } 18 | } 19 | 20 | impl Executor for Scan { 21 | fn execute(self: Box, txn: &mut T) -> Result { 22 | let table = txn.must_read_table(&self.table)?; 23 | Ok(ResultSet::Query { 24 | columns: table.columns.iter().map(|c| Column { name: Some(c.name.clone()) }).collect(), 25 | rows: Box::new(txn.scan(&table.name, self.filter)?), 26 | }) 27 | } 28 | } 29 | 30 | /// A primary key lookup executor 31 | pub struct KeyLookup { 32 | table: String, 33 | keys: Vec, 34 | } 35 | 36 | impl KeyLookup { 37 | pub fn new(table: String, keys: Vec) -> Box { 38 | Box::new(Self { table, keys }) 39 | } 40 | } 41 | 42 | impl Executor for KeyLookup { 43 | fn execute(self: Box, txn: &mut T) -> Result { 44 | let table = txn.must_read_table(&self.table)?; 45 | 46 | // FIXME Is there a way to pass the txn into an iterator closure instead? 
47 | let rows = self 48 | .keys 49 | .into_iter() 50 | .filter_map(|key| txn.read(&table.name, &key).transpose()) 51 | .collect::>>()?; 52 | 53 | Ok(ResultSet::Query { 54 | columns: table.columns.iter().map(|c| Column { name: Some(c.name.clone()) }).collect(), 55 | rows: Box::new(rows.into_iter().map(Ok)), 56 | }) 57 | } 58 | } 59 | 60 | /// An index value lookup executor 61 | pub struct IndexLookup { 62 | table: String, 63 | column: String, 64 | values: Vec, 65 | } 66 | 67 | impl IndexLookup { 68 | pub fn new(table: String, column: String, values: Vec) -> Box { 69 | Box::new(Self { table, column, values }) 70 | } 71 | } 72 | 73 | impl Executor for IndexLookup { 74 | fn execute(self: Box, txn: &mut T) -> Result { 75 | let table = txn.must_read_table(&self.table)?; 76 | 77 | let mut pks: HashSet = HashSet::new(); 78 | for value in self.values { 79 | pks.extend(txn.read_index(&self.table, &self.column, &value)?); 80 | } 81 | 82 | // FIXME Is there a way to pass the txn into an iterator closure instead? 
83 | let rows = pks 84 | .into_iter() 85 | .filter_map(|pk| txn.read(&table.name, &pk).transpose()) 86 | .collect::>>()?; 87 | 88 | Ok(ResultSet::Query { 89 | columns: table.columns.iter().map(|c| Column { name: Some(c.name.clone()) }).collect(), 90 | rows: Box::new(rows.into_iter().map(Ok)), 91 | }) 92 | } 93 | } 94 | 95 | /// An executor that produces a single empty row 96 | pub struct Nothing; 97 | 98 | impl Nothing { 99 | pub fn new() -> Box { 100 | Box::new(Self) 101 | } 102 | } 103 | 104 | impl Executor for Nothing { 105 | fn execute(self: Box, _: &mut T) -> Result { 106 | Ok(ResultSet::Query { 107 | columns: Vec::new(), 108 | rows: Box::new(std::iter::once(Ok(Row::new()))), 109 | }) 110 | } 111 | } 112 | -------------------------------------------------------------------------------- /src/sql/mod.rs: -------------------------------------------------------------------------------- 1 | pub mod engine; 2 | pub mod execution; 3 | pub mod parser; 4 | pub mod plan; 5 | pub mod schema; 6 | pub mod types; 7 | -------------------------------------------------------------------------------- /src/sql/parser/ast.rs: -------------------------------------------------------------------------------- 1 | use super::super::types::DataType; 2 | use crate::error::Result; 3 | 4 | use std::collections::BTreeMap; 5 | use std::mem::replace; 6 | 7 | /// Statements 8 | #[derive(Clone, Debug, PartialEq)] 9 | #[allow(clippy::large_enum_variant)] 10 | pub enum Statement { 11 | Begin { 12 | read_only: bool, 13 | as_of: Option, 14 | }, 15 | Commit, 16 | Rollback, 17 | Explain(Box), 18 | 19 | CreateTable { 20 | name: String, 21 | columns: Vec, 22 | }, 23 | DropTable(String), 24 | 25 | Delete { 26 | table: String, 27 | r#where: Option, 28 | }, 29 | Insert { 30 | table: String, 31 | columns: Option>, 32 | values: Vec>, 33 | }, 34 | Update { 35 | table: String, 36 | set: BTreeMap, 37 | r#where: Option, 38 | }, 39 | 40 | Select { 41 | select: Vec<(Expression, Option)>, 42 | from: Vec, 43 | 
r#where: Option, 44 | group_by: Vec, 45 | having: Option, 46 | order: Vec<(Expression, Order)>, 47 | offset: Option, 48 | limit: Option, 49 | }, 50 | } 51 | 52 | /// A FROM item 53 | #[derive(Clone, Debug, PartialEq)] 54 | pub enum FromItem { 55 | Table { 56 | name: String, 57 | alias: Option, 58 | }, 59 | Join { 60 | left: Box, 61 | right: Box, 62 | r#type: JoinType, 63 | predicate: Option, 64 | }, 65 | } 66 | 67 | /// A JOIN type 68 | #[derive(Clone, Debug, PartialEq)] 69 | pub enum JoinType { 70 | Cross, 71 | Inner, 72 | Left, 73 | Right, 74 | } 75 | 76 | /// A column 77 | #[derive(Clone, Debug, PartialEq)] 78 | pub struct Column { 79 | pub name: String, 80 | pub datatype: DataType, 81 | pub primary_key: bool, 82 | pub nullable: Option, 83 | pub default: Option, 84 | pub unique: bool, 85 | pub index: bool, 86 | pub references: Option, 87 | } 88 | 89 | /// Sort orders 90 | #[derive(Clone, Debug, PartialEq)] 91 | pub enum Order { 92 | Ascending, 93 | Descending, 94 | } 95 | 96 | /// Expressions 97 | #[derive(Clone, Debug, PartialEq)] 98 | pub enum Expression { 99 | Field(Option, String), 100 | Column(usize), // only used during plan building to break off expression subtrees 101 | Literal(Literal), 102 | Function(String, Vec), 103 | Operation(Operation), 104 | } 105 | 106 | impl From for Expression { 107 | fn from(literal: Literal) -> Self { 108 | Self::Literal(literal) 109 | } 110 | } 111 | 112 | impl From for Expression { 113 | fn from(op: Operation) -> Self { 114 | Self::Operation(op) 115 | } 116 | } 117 | 118 | /// Literals 119 | #[derive(Clone, Debug, PartialEq)] 120 | pub enum Literal { 121 | Null, 122 | Boolean(bool), 123 | Integer(i64), 124 | Float(f64), 125 | String(String), 126 | } 127 | 128 | /// Operations (done by operators) 129 | #[derive(Clone, Debug, PartialEq)] 130 | pub enum Operation { 131 | // Logical operators 132 | And(Box, Box), 133 | Not(Box), 134 | Or(Box, Box), 135 | 136 | // Comparison operators 137 | Equal(Box, Box), 138 | 
GreaterThan(Box, Box), 139 | GreaterThanOrEqual(Box, Box), 140 | IsNull(Box), 141 | LessThan(Box, Box), 142 | LessThanOrEqual(Box, Box), 143 | NotEqual(Box, Box), 144 | 145 | // Mathematical operators 146 | Add(Box, Box), 147 | Assert(Box), 148 | Divide(Box, Box), 149 | Exponentiate(Box, Box), 150 | Factorial(Box), 151 | Modulo(Box, Box), 152 | Multiply(Box, Box), 153 | Negate(Box), 154 | Subtract(Box, Box), 155 | 156 | // String operators 157 | Like(Box, Box), 158 | } 159 | 160 | impl Expression { 161 | /// Walks the expression tree while calling a closure. Returns true as soon as the closure 162 | /// returns true. This is the inverse of walk(). 163 | pub fn contains bool>(&self, visitor: &F) -> bool { 164 | !self.walk(&|e| !visitor(e)) 165 | } 166 | 167 | /// Replaces the expression with result of the closure. Helper function for transform(). 168 | fn replace_with Result>(&mut self, mut f: F) -> Result<()> { 169 | // Temporarily replace expression with a null value, in case closure panics. May consider 170 | // replace_with crate if this hampers performance. 171 | let expr = replace(self, Expression::Literal(Literal::Null)); 172 | *self = f(expr)?; 173 | Ok(()) 174 | } 175 | 176 | /// Transforms the expression tree by applying a closure before and after descending. 
177 | pub fn transform(mut self, before: &mut B, after: &mut A) -> Result 178 | where 179 | B: FnMut(Self) -> Result, 180 | A: FnMut(Self) -> Result, 181 | { 182 | use Operation::*; 183 | self = before(self)?; 184 | match &mut self { 185 | Self::Operation(Add(lhs, rhs)) 186 | | Self::Operation(And(lhs, rhs)) 187 | | Self::Operation(Divide(lhs, rhs)) 188 | | Self::Operation(Equal(lhs, rhs)) 189 | | Self::Operation(Exponentiate(lhs, rhs)) 190 | | Self::Operation(GreaterThan(lhs, rhs)) 191 | | Self::Operation(GreaterThanOrEqual(lhs, rhs)) 192 | | Self::Operation(LessThan(lhs, rhs)) 193 | | Self::Operation(LessThanOrEqual(lhs, rhs)) 194 | | Self::Operation(Like(lhs, rhs)) 195 | | Self::Operation(Modulo(lhs, rhs)) 196 | | Self::Operation(Multiply(lhs, rhs)) 197 | | Self::Operation(NotEqual(lhs, rhs)) 198 | | Self::Operation(Or(lhs, rhs)) 199 | | Self::Operation(Subtract(lhs, rhs)) => { 200 | Self::replace_with(lhs, |e| e.transform(before, after))?; 201 | Self::replace_with(rhs, |e| e.transform(before, after))?; 202 | } 203 | 204 | Self::Operation(Assert(expr)) 205 | | Self::Operation(Factorial(expr)) 206 | | Self::Operation(IsNull(expr)) 207 | | Self::Operation(Negate(expr)) 208 | | Self::Operation(Not(expr)) => { 209 | Self::replace_with(expr, |e| e.transform(before, after))? 210 | } 211 | 212 | Self::Function(_, exprs) => { 213 | for expr in exprs { 214 | Self::replace_with(expr, |e| e.transform(before, after))?; 215 | } 216 | } 217 | 218 | Self::Literal(_) | Self::Field(_, _) | Self::Column(_) => {} 219 | }; 220 | after(self) 221 | } 222 | 223 | /// Transforms an expression using a mutable reference. 224 | pub fn transform_mut(&mut self, before: &mut B, after: &mut A) -> Result<()> 225 | where 226 | B: FnMut(Self) -> Result, 227 | A: FnMut(Self) -> Result, 228 | { 229 | self.replace_with(|e| e.transform(before, after)) 230 | } 231 | 232 | /// Walks the expression tree, calling a closure for every node. Halts if closure returns false. 
233 | pub fn walk bool>(&self, visitor: &F) -> bool { 234 | use Operation::*; 235 | visitor(self) 236 | && match self { 237 | Self::Operation(Add(lhs, rhs)) 238 | | Self::Operation(And(lhs, rhs)) 239 | | Self::Operation(Divide(lhs, rhs)) 240 | | Self::Operation(Equal(lhs, rhs)) 241 | | Self::Operation(Exponentiate(lhs, rhs)) 242 | | Self::Operation(GreaterThan(lhs, rhs)) 243 | | Self::Operation(GreaterThanOrEqual(lhs, rhs)) 244 | | Self::Operation(LessThan(lhs, rhs)) 245 | | Self::Operation(LessThanOrEqual(lhs, rhs)) 246 | | Self::Operation(Like(lhs, rhs)) 247 | | Self::Operation(Modulo(lhs, rhs)) 248 | | Self::Operation(Multiply(lhs, rhs)) 249 | | Self::Operation(NotEqual(lhs, rhs)) 250 | | Self::Operation(Or(lhs, rhs)) 251 | | Self::Operation(Subtract(lhs, rhs)) => lhs.walk(visitor) && rhs.walk(visitor), 252 | 253 | Self::Operation(Assert(expr)) 254 | | Self::Operation(Factorial(expr)) 255 | | Self::Operation(IsNull(expr)) 256 | | Self::Operation(Negate(expr)) 257 | | Self::Operation(Not(expr)) => expr.walk(visitor), 258 | 259 | Self::Function(_, exprs) => { 260 | for expr in exprs { 261 | if !expr.walk(visitor) { 262 | return false; 263 | } 264 | } 265 | true 266 | } 267 | 268 | Self::Literal(_) | Self::Field(_, _) | Self::Column(_) => true, 269 | } 270 | } 271 | } 272 | -------------------------------------------------------------------------------- /src/sql/plan/optimizer.rs: -------------------------------------------------------------------------------- 1 | use super::super::schema::Catalog; 2 | use super::super::types::{Expression, Value}; 3 | use super::Node; 4 | use crate::error::Result; 5 | 6 | use std::mem::replace; 7 | 8 | /// A plan optimizer 9 | pub trait Optimizer { 10 | fn optimize(&self, node: Node) -> Result; 11 | } 12 | 13 | /// A constant folding optimizer, which replaces constant expressions with their evaluated value, to 14 | /// prevent it from being re-evaluated over and over again during plan execution. 
15 | pub struct ConstantFolder; 16 | 17 | impl Optimizer for ConstantFolder { 18 | fn optimize(&self, node: Node) -> Result { 19 | node.transform(&Ok, &|n| { 20 | n.transform_expressions( 21 | &|e| { 22 | if !e.contains(&|expr| matches!(expr, Expression::Field(_, _))) { 23 | Ok(Expression::Constant(e.evaluate(None)?)) 24 | } else { 25 | Ok(e) 26 | } 27 | }, 28 | &Ok, 29 | ) 30 | }) 31 | } 32 | } 33 | 34 | /// A filter pushdown optimizer, which moves filter predicates into or closer to the source node. 35 | pub struct FilterPushdown; 36 | 37 | impl Optimizer for FilterPushdown { 38 | fn optimize(&self, node: Node) -> Result { 39 | node.transform( 40 | &|n| match n { 41 | Node::Filter { mut source, predicate } => { 42 | // We don't replace the filter node here, since doing so would cause transform() 43 | // to skip the source as it won't reapply the transform to the "same" node. 44 | // We leave a noop filter node instead, which will be cleaned up by NoopCleaner. 45 | if let Some(remainder) = self.pushdown(predicate, &mut source) { 46 | Ok(Node::Filter { source, predicate: remainder }) 47 | } else { 48 | Ok(Node::Filter { 49 | source, 50 | predicate: Expression::Constant(Value::Boolean(true)), 51 | }) 52 | } 53 | } 54 | Node::NestedLoopJoin { 55 | mut left, 56 | left_size, 57 | mut right, 58 | predicate: Some(predicate), 59 | outer, 60 | } => { 61 | let predicate = self.pushdown_join(predicate, &mut left, &mut right, left_size); 62 | Ok(Node::NestedLoopJoin { left, left_size, right, predicate, outer }) 63 | } 64 | n => Ok(n), 65 | }, 66 | &Ok, 67 | ) 68 | } 69 | } 70 | 71 | impl FilterPushdown { 72 | /// Attempts to push an expression down into a target node, returns any remaining expression. 73 | fn pushdown(&self, mut expression: Expression, target: &mut Node) -> Option { 74 | match target { 75 | Node::Scan { ref mut filter, .. 
} => { 76 | if let Some(filter) = filter.take() { 77 | expression = Expression::And(Box::new(expression), Box::new(filter)) 78 | } 79 | filter.replace(expression) 80 | } 81 | Node::NestedLoopJoin { ref mut predicate, .. } => { 82 | if let Some(predicate) = predicate.take() { 83 | expression = Expression::And(Box::new(expression), Box::new(predicate)); 84 | } 85 | predicate.replace(expression) 86 | } 87 | Node::Filter { ref mut predicate, .. } => { 88 | let p = replace(predicate, Expression::Constant(Value::Null)); 89 | *predicate = Expression::And(Box::new(p), Box::new(expression)); 90 | None 91 | } 92 | _ => Some(expression), 93 | } 94 | } 95 | 96 | /// Attempts to partition a join predicate and push parts of it down into either source, 97 | /// returning any remaining expression. 98 | fn pushdown_join( 99 | &self, 100 | predicate: Expression, 101 | left: &mut Node, 102 | right: &mut Node, 103 | boundary: usize, 104 | ) -> Option { 105 | // Convert the predicate into conjunctive normal form, and partition into expressions 106 | // only referencing the left or right sources, leaving cross-source expressions. 107 | let cnf = predicate.into_cnf_vec(); 108 | let (mut push_left, cnf): (Vec, Vec) = 109 | cnf.into_iter().partition(|e| { 110 | // Partition only if no expressions reference the right-hand source. 111 | !e.contains(&|e| matches!(e, Expression::Field(i, _) if i >= &boundary)) 112 | }); 113 | let (mut push_right, mut cnf): (Vec, Vec) = 114 | cnf.into_iter().partition(|e| { 115 | // Partition only if no expressions reference the left-hand source. 116 | !e.contains(&|e| matches!(e, Expression::Field(i, _) if i < &boundary)) 117 | }); 118 | 119 | // Look for equijoins that have constant lookups on either side, and transfer the constants 120 | // to the other side of the join as well. This allows index lookup optimization in both 121 | // sides. We already know that the remaining cnf expressions span both sources. 
122 | for e in &cnf { 123 | if let Expression::Equal(ref lhs, ref rhs) = e { 124 | if let (Expression::Field(l, ln), Expression::Field(r, rn)) = (&**lhs, &**rhs) { 125 | let (l, ln, r, rn) = if l > r { (r, rn, l, ln) } else { (l, ln, r, rn) }; 126 | if let Some(lvals) = push_left.iter().find_map(|e| e.as_lookup(*l)) { 127 | push_right.push(Expression::from_lookup(*r, rn.clone(), lvals)); 128 | } else if let Some(rvals) = push_right.iter().find_map(|e| e.as_lookup(*r)) { 129 | push_left.push(Expression::from_lookup(*l, ln.clone(), rvals)); 130 | } 131 | } 132 | } 133 | } 134 | 135 | // Push predicates down into the sources. 136 | if let Some(push_left) = Expression::from_cnf_vec(push_left) { 137 | if let Some(remainder) = self.pushdown(push_left, left) { 138 | cnf.push(remainder) 139 | } 140 | } 141 | if let Some(mut push_right) = Expression::from_cnf_vec(push_right) { 142 | // All field references to the right must be shifted left. 143 | push_right = push_right 144 | .transform( 145 | &|e| match e { 146 | Expression::Field(i, label) => Ok(Expression::Field(i - boundary, label)), 147 | e => Ok(e), 148 | }, 149 | &Ok, 150 | ) 151 | .unwrap(); 152 | if let Some(remainder) = self.pushdown(push_right, right) { 153 | cnf.push(remainder) 154 | } 155 | } 156 | Expression::from_cnf_vec(cnf) 157 | } 158 | } 159 | 160 | /// An index lookup optimizer, which converts table scans to index lookups. 161 | pub struct IndexLookup<'a, C: Catalog> { 162 | catalog: &'a mut C, 163 | } 164 | 165 | impl<'a, C: Catalog> IndexLookup<'a, C> { 166 | pub fn new(catalog: &'a mut C) -> Self { 167 | Self { catalog } 168 | } 169 | 170 | // Wraps a node in a filter for the given CNF vector, if any, otherwise returns the bare node. 
171 | fn wrap_cnf(&self, node: Node, cnf: Vec) -> Node { 172 | if let Some(predicate) = Expression::from_cnf_vec(cnf) { 173 | Node::Filter { source: Box::new(node), predicate } 174 | } else { 175 | node 176 | } 177 | } 178 | } 179 | 180 | impl<'a, C: Catalog> Optimizer for IndexLookup<'a, C> { 181 | fn optimize(&self, node: Node) -> Result { 182 | node.transform(&Ok, &|n| match n { 183 | Node::Scan { table, alias, filter: Some(filter) } => { 184 | let columns = self.catalog.must_read_table(&table)?.columns; 185 | let pk = columns.iter().position(|c| c.primary_key).unwrap(); 186 | 187 | // Convert the filter into conjunctive normal form, and try to convert each 188 | // sub-expression into a lookup. If a lookup is found, return a lookup node and then 189 | // apply the remaining conjunctions as a filter node, if any. 190 | let mut cnf = filter.clone().into_cnf_vec(); 191 | for i in 0..cnf.len() { 192 | if let Some(keys) = cnf[i].as_lookup(pk) { 193 | cnf.remove(i); 194 | return Ok(self.wrap_cnf(Node::KeyLookup { table, alias, keys }, cnf)); 195 | } 196 | for (ci, column) in columns.iter().enumerate().filter(|(_, c)| c.index) { 197 | if let Some(values) = cnf[i].as_lookup(ci) { 198 | cnf.remove(i); 199 | return Ok(self.wrap_cnf( 200 | Node::IndexLookup { 201 | table, 202 | alias, 203 | column: column.name.clone(), 204 | values, 205 | }, 206 | cnf, 207 | )); 208 | } 209 | } 210 | } 211 | Ok(Node::Scan { table, alias, filter: Some(filter) }) 212 | } 213 | n => Ok(n), 214 | }) 215 | } 216 | } 217 | 218 | /// Cleans up noops, e.g. filters with constant true/false predicates. 219 | /// FIXME This should perhaps replace nodes that can never return anything with a Nothing node, 220 | /// but that requires propagating the column names. 221 | pub struct NoopCleaner; 222 | 223 | impl Optimizer for NoopCleaner { 224 | fn optimize(&self, node: Node) -> Result { 225 | use Expression::*; 226 | node.transform( 227 | // While descending the node tree, clean up boolean expressions. 
228 | &|n| { 229 | n.transform_expressions(&Ok, &|e| match &e { 230 | And(lhs, rhs) => match (&**lhs, &**rhs) { 231 | (Constant(Value::Boolean(false)), _) 232 | | (Constant(Value::Null), _) 233 | | (_, Constant(Value::Boolean(false))) 234 | | (_, Constant(Value::Null)) => Ok(Constant(Value::Boolean(false))), 235 | (Constant(Value::Boolean(true)), e) 236 | | (e, Constant(Value::Boolean(true))) => Ok(e.clone()), 237 | _ => Ok(e), 238 | }, 239 | Or(lhs, rhs) => match (&**lhs, &**rhs) { 240 | (Constant(Value::Boolean(false)), e) 241 | | (Constant(Value::Null), e) 242 | | (e, Constant(Value::Boolean(false))) 243 | | (e, Constant(Value::Null)) => Ok(e.clone()), 244 | (Constant(Value::Boolean(true)), _) 245 | | (_, Constant(Value::Boolean(true))) => Ok(Constant(Value::Boolean(true))), 246 | _ => Ok(e), 247 | }, 248 | // No need to handle Not, constant folder should have evaluated it already. 249 | _ => Ok(e), 250 | }) 251 | }, 252 | // While ascending the node tree, remove any unnecessary filters or nodes. 253 | // FIXME This should replace scan and join predicates with None as well. 254 | &|n| match n { 255 | Node::Filter { source, predicate } => match predicate { 256 | Expression::Constant(Value::Boolean(true)) => Ok(*source), 257 | predicate => Ok(Node::Filter { source, predicate }), 258 | }, 259 | n => Ok(n), 260 | }, 261 | ) 262 | } 263 | } 264 | 265 | // Optimizes join types, currently by swapping nested-loop joins with hash joins where appropriate. 266 | pub struct JoinType; 267 | 268 | impl Optimizer for JoinType { 269 | fn optimize(&self, node: Node) -> Result { 270 | node.transform( 271 | &|n| match n { 272 | // Replace nested-loop equijoins with hash joins. 
273 | Node::NestedLoopJoin { 274 | left, 275 | left_size, 276 | right, 277 | predicate: Some(Expression::Equal(a, b)), 278 | outer, 279 | } => match (*a, *b) { 280 | (Expression::Field(a, a_label), Expression::Field(b, b_label)) => { 281 | let (left_field, right_field) = if a < left_size { 282 | ((a, a_label), (b - left_size, b_label)) 283 | } else { 284 | ((b, b_label), (a - left_size, a_label)) 285 | }; 286 | Ok(Node::HashJoin { left, left_field, right, right_field, outer }) 287 | } 288 | (a, b) => Ok(Node::NestedLoopJoin { 289 | left, 290 | left_size, 291 | right, 292 | predicate: Some(Expression::Equal(a.into(), b.into())), 293 | outer, 294 | }), 295 | }, 296 | n => Ok(n), 297 | }, 298 | &Ok, 299 | ) 300 | } 301 | } 302 | -------------------------------------------------------------------------------- /src/sql/schema.rs: -------------------------------------------------------------------------------- 1 | use super::engine::Transaction; 2 | use super::parser::format_ident; 3 | use super::types::{DataType, Value}; 4 | use crate::error::{Error, Result}; 5 | 6 | use serde_derive::{Deserialize, Serialize}; 7 | use std::fmt::{self, Display}; 8 | 9 | /// The catalog stores schema information 10 | pub trait Catalog { 11 | /// Creates a new table 12 | fn create_table(&mut self, table: Table) -> Result<()>; 13 | /// Deletes an existing table, or errors if it does not exist 14 | fn delete_table(&mut self, table: &str) -> Result<()>; 15 | /// Reads a table, if it exists 16 | fn read_table(&self, table: &str) -> Result>; 17 | /// Iterates over all tables 18 | fn scan_tables(&self) -> Result; 19 | 20 | /// Reads a table, and errors if it does not exist 21 | fn must_read_table(&self, table: &str) -> Result
{ 22 | self.read_table(table)? 23 | .ok_or_else(|| Error::Value(format!("Table {} does not exist", table))) 24 | } 25 | 26 | /// Returns all references to a table, as table,column pairs. 27 | fn table_references(&self, table: &str, with_self: bool) -> Result)>> { 28 | Ok(self 29 | .scan_tables()? 30 | .filter(|t| with_self || t.name != table) 31 | .map(|t| { 32 | ( 33 | t.name, 34 | t.columns 35 | .iter() 36 | .filter(|c| c.references.as_deref() == Some(table)) 37 | .map(|c| c.name.clone()) 38 | .collect::>(), 39 | ) 40 | }) 41 | .filter(|(_, cs)| !cs.is_empty()) 42 | .collect()) 43 | } 44 | } 45 | 46 | /// A table scan iterator 47 | pub type Tables = Box + Send>; 48 | 49 | /// A table schema 50 | #[derive(Clone, Debug, PartialEq, Deserialize, Serialize)] 51 | pub struct Table { 52 | pub name: String, 53 | pub columns: Vec, 54 | } 55 | 56 | impl Table { 57 | /// Creates a new table schema 58 | pub fn new(name: String, columns: Vec) -> Result { 59 | let table = Self { name, columns }; 60 | Ok(table) 61 | } 62 | 63 | /// Fetches a column by name 64 | pub fn get_column(&self, name: &str) -> Result<&Column> { 65 | self.columns.iter().find(|c| c.name == name).ok_or_else(|| { 66 | Error::Value(format!("Column {} not found in table {}", name, self.name)) 67 | }) 68 | } 69 | 70 | /// Fetches a column index by name 71 | pub fn get_column_index(&self, name: &str) -> Result { 72 | self.columns.iter().position(|c| c.name == name).ok_or_else(|| { 73 | Error::Value(format!("Column {} not found in table {}", name, self.name)) 74 | }) 75 | } 76 | 77 | /// Returns the primary key column of the table 78 | pub fn get_primary_key(&self) -> Result<&Column> { 79 | self.columns 80 | .iter() 81 | .find(|c| c.primary_key) 82 | .ok_or_else(|| Error::Value(format!("Primary key not found in table {}", self.name))) 83 | } 84 | 85 | /// Returns the primary key value of a row 86 | pub fn get_row_key(&self, row: &[Value]) -> Result { 87 | row.get( 88 | self.columns 89 | .iter() 90 | .position(|c| 
c.primary_key) 91 | .ok_or_else(|| Error::Value("Primary key not found".into()))?, 92 | ) 93 | .cloned() 94 | .ok_or_else(|| Error::Value("Primary key value not found for row".into())) 95 | } 96 | 97 | /// Validates the table schema 98 | pub fn validate(&self, txn: &mut dyn Transaction) -> Result<()> { 99 | if self.columns.is_empty() { 100 | return Err(Error::Value(format!("Table {} has no columns", self.name))); 101 | } 102 | match self.columns.iter().filter(|c| c.primary_key).count() { 103 | 1 => {} 104 | 0 => return Err(Error::Value(format!("No primary key in table {}", self.name))), 105 | _ => return Err(Error::Value(format!("Multiple primary keys in table {}", self.name))), 106 | }; 107 | for column in &self.columns { 108 | column.validate(self, txn)?; 109 | } 110 | Ok(()) 111 | } 112 | 113 | /// Validates a row 114 | pub fn validate_row(&self, row: &[Value], txn: &mut dyn Transaction) -> Result<()> { 115 | if row.len() != self.columns.len() { 116 | return Err(Error::Value(format!("Invalid row size for table {}", self.name))); 117 | } 118 | let pk = self.get_row_key(row)?; 119 | for (column, value) in self.columns.iter().zip(row.iter()) { 120 | column.validate_value(self, &pk, value, txn)?; 121 | } 122 | Ok(()) 123 | } 124 | } 125 | 126 | impl Display for Table { 127 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { 128 | write!( 129 | f, 130 | "CREATE TABLE {} (\n{}\n)", 131 | format_ident(&self.name), 132 | self.columns.iter().map(|c| format!(" {}", c)).collect::>().join(",\n") 133 | ) 134 | } 135 | } 136 | 137 | /// A table column schema 138 | #[derive(Clone, Debug, PartialEq, Deserialize, Serialize)] 139 | pub struct Column { 140 | /// Column name 141 | pub name: String, 142 | /// Column datatype 143 | pub datatype: DataType, 144 | /// Whether the column is a primary key 145 | pub primary_key: bool, 146 | /// Whether the column allows null values 147 | pub nullable: bool, 148 | /// The default value of the column 149 | pub default: Option, 150 | 
/// Whether the column should only take unique values 151 | pub unique: bool, 152 | /// The table which is referenced by this foreign key 153 | pub references: Option, 154 | /// Whether the column should be indexed 155 | pub index: bool, 156 | } 157 | 158 | impl Column { 159 | /// Validates the column schema 160 | pub fn validate(&self, table: &Table, txn: &mut dyn Transaction) -> Result<()> { 161 | // Validate primary key 162 | if self.primary_key && self.nullable { 163 | return Err(Error::Value(format!("Primary key {} cannot be nullable", self.name))); 164 | } 165 | if self.primary_key && !self.unique { 166 | return Err(Error::Value(format!("Primary key {} must be unique", self.name))); 167 | } 168 | 169 | // Validate default value 170 | if let Some(default) = &self.default { 171 | if let Some(datatype) = default.datatype() { 172 | if datatype != self.datatype { 173 | return Err(Error::Value(format!( 174 | "Default value for column {} has datatype {}, must be {}", 175 | self.name, datatype, self.datatype 176 | ))); 177 | } 178 | } else if !self.nullable { 179 | return Err(Error::Value(format!( 180 | "Can't use NULL as default value for non-nullable column {}", 181 | self.name 182 | ))); 183 | } 184 | } else if self.nullable { 185 | return Err(Error::Value(format!( 186 | "Nullable column {} must have a default value", 187 | self.name 188 | ))); 189 | } 190 | 191 | // Validate references 192 | if let Some(reference) = &self.references { 193 | let target = if reference == &table.name { 194 | table.clone() 195 | } else if let Some(table) = txn.read_table(reference)? 
{ 196 | table 197 | } else { 198 | return Err(Error::Value(format!( 199 | "Table {} referenced by column {} does not exist", 200 | reference, self.name 201 | ))); 202 | }; 203 | if self.datatype != target.get_primary_key()?.datatype { 204 | return Err(Error::Value(format!( 205 | "Can't reference {} primary key of table {} from {} column {}", 206 | target.get_primary_key()?.datatype, 207 | target.name, 208 | self.datatype, 209 | self.name 210 | ))); 211 | } 212 | } 213 | 214 | Ok(()) 215 | } 216 | 217 | /// Validates a column value 218 | pub fn validate_value( 219 | &self, 220 | table: &Table, 221 | pk: &Value, 222 | value: &Value, 223 | txn: &mut dyn Transaction, 224 | ) -> Result<()> { 225 | // Validate datatype 226 | match value.datatype() { 227 | None if self.nullable => Ok(()), 228 | None => Err(Error::Value(format!("NULL value not allowed for column {}", self.name))), 229 | Some(ref datatype) if datatype != &self.datatype => Err(Error::Value(format!( 230 | "Invalid datatype {} for {} column {}", 231 | datatype, self.datatype, self.name 232 | ))), 233 | _ => Ok(()), 234 | }?; 235 | 236 | // Validate value 237 | match value { 238 | Value::String(s) if s.len() > 1024 => { 239 | Err(Error::Value("Strings cannot be more than 1024 bytes".into())) 240 | } 241 | _ => Ok(()), 242 | }?; 243 | 244 | // Validate outgoing references 245 | if let Some(target) = &self.references { 246 | match value { 247 | Value::Null => Ok(()), 248 | Value::Float(f) if f.is_nan() => Ok(()), 249 | v if target == &table.name && v == pk => Ok(()), 250 | v if txn.read(target, v)?.is_none() => Err(Error::Value(format!( 251 | "Referenced primary key {} in table {} does not exist", 252 | v, target, 253 | ))), 254 | _ => Ok(()), 255 | }?; 256 | } 257 | 258 | // Validate uniqueness constraints 259 | if self.unique && !self.primary_key && value != &Value::Null { 260 | let index = table.get_column_index(&self.name)?; 261 | let mut scan = txn.scan(&table.name, None)?; 262 | while let Some(row) = 
scan.next().transpose()? { 263 | if row.get(index).unwrap_or(&Value::Null) == value 264 | && &table.get_row_key(&row)? != pk 265 | { 266 | return Err(Error::Value(format!( 267 | "Unique value {} already exists for column {}", 268 | value, self.name 269 | ))); 270 | } 271 | } 272 | } 273 | 274 | Ok(()) 275 | } 276 | } 277 | 278 | impl Display for Column { 279 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { 280 | let mut sql = format_ident(&self.name); 281 | sql += &format!(" {}", self.datatype); 282 | if self.primary_key { 283 | sql += " PRIMARY KEY"; 284 | } 285 | if !self.nullable && !self.primary_key { 286 | sql += " NOT NULL"; 287 | } 288 | if let Some(default) = &self.default { 289 | sql += &format!(" DEFAULT {}", default); 290 | } 291 | if self.unique && !self.primary_key { 292 | sql += " UNIQUE"; 293 | } 294 | if let Some(reference) = &self.references { 295 | sql += &format!(" REFERENCES {}", reference); 296 | } 297 | if self.index { 298 | sql += " INDEX"; 299 | } 300 | write!(f, "{}", sql) 301 | } 302 | } 303 | -------------------------------------------------------------------------------- /src/sql/types/mod.rs: -------------------------------------------------------------------------------- 1 | mod expression; 2 | pub use expression::Expression; 3 | 4 | use crate::error::{Error, Result}; 5 | 6 | use serde_derive::{Deserialize, Serialize}; 7 | use std::borrow::Cow; 8 | use std::cmp::Ordering; 9 | use std::hash::{Hash, Hasher}; 10 | 11 | /// A datatype 12 | #[derive(Clone, Debug, Hash, PartialEq, Serialize, Deserialize)] 13 | pub enum DataType { 14 | Boolean, 15 | Integer, 16 | Float, 17 | String, 18 | } 19 | 20 | impl std::fmt::Display for DataType { 21 | fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { 22 | f.write_str(match self { 23 | Self::Boolean => "BOOLEAN", 24 | Self::Integer => "INTEGER", 25 | Self::Float => "FLOAT", 26 | Self::String => "STRING", 27 | }) 28 | } 29 | } 30 | 31 | /// A specific value of a data type 32 | 
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)] 33 | pub enum Value { 34 | Null, 35 | Boolean(bool), 36 | Integer(i64), 37 | Float(f64), 38 | String(String), 39 | } 40 | 41 | impl std::cmp::Eq for Value {} 42 | 43 | #[allow(clippy::derived_hash_with_manual_eq)] 44 | impl Hash for Value { 45 | fn hash(&self, state: &mut H) { 46 | self.datatype().hash(state); 47 | match self { 48 | Value::Null => {}, /* BUG FIX: was `self.hash(state)`, which re-enters this same method and recurses infinitely (stack overflow) whenever a Null is hashed. Null carries no payload, and the datatype hash above already distinguishes it, so no further hashing is needed. */ 49 | Value::Boolean(v) => v.hash(state), 50 | Value::Integer(v) => v.hash(state), 51 | Value::Float(v) => v.to_be_bytes().hash(state), 52 | Value::String(v) => v.hash(state), 53 | } 54 | } 55 | } 56 | 57 | impl<'a> From for Cow<'a, Value> { 58 | fn from(v: Value) -> Self { 59 | Cow::Owned(v) 60 | } 61 | } 62 | 63 | impl<'a> From<&'a Value> for Cow<'a, Value> { 64 | fn from(v: &'a Value) -> Self { 65 | Cow::Borrowed(v) 66 | } 67 | } 68 | 69 | impl Value { 70 | /// Returns the value's datatype, or None for null values 71 | pub fn datatype(&self) -> Option { 72 | match self { 73 | Self::Null => None, 74 | Self::Boolean(_) => Some(DataType::Boolean), 75 | Self::Integer(_) => Some(DataType::Integer), 76 | Self::Float(_) => Some(DataType::Float), 77 | Self::String(_) => Some(DataType::String), 78 | } 79 | } 80 | 81 | /// Returns the inner boolean, or an error if not a boolean 82 | pub fn boolean(self) -> Result { 83 | match self { 84 | Self::Boolean(b) => Ok(b), 85 | v => Err(Error::Value(format!("Not a boolean: {:?}", v))), 86 | } 87 | } 88 | 89 | /// Returns the inner float, or an error if not a float 90 | pub fn float(self) -> Result { 91 | match self { 92 | Self::Float(f) => Ok(f), 93 | v => Err(Error::Value(format!("Not a float: {:?}", v))), 94 | } 95 | } 96 | 97 | /// Returns the inner integer, or an error if not an integer 98 | pub fn integer(self) -> Result { 99 | match self { 100 | Self::Integer(i) => Ok(i), 101 | v => Err(Error::Value(format!("Not an integer: {:?}", v))), 102 | } 103 | } 104 | 105 | /// Returns the inner string, or an error
if not a string 106 | pub fn string(self) -> Result { 107 | match self { 108 | Self::String(s) => Ok(s), 109 | v => Err(Error::Value(format!("Not a string: {:?}", v))), 110 | } 111 | } 112 | } 113 | 114 | impl std::fmt::Display for Value { 115 | fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { 116 | f.write_str( 117 | match self { 118 | Self::Null => "NULL".to_string(), 119 | Self::Boolean(b) if *b => "TRUE".to_string(), 120 | Self::Boolean(_) => "FALSE".to_string(), 121 | Self::Integer(i) => i.to_string(), 122 | Self::Float(f) => f.to_string(), 123 | Self::String(s) => s.clone(), 124 | } 125 | .as_ref(), 126 | ) 127 | } 128 | } 129 | 130 | impl PartialOrd for Value { 131 | fn partial_cmp(&self, other: &Self) -> Option { 132 | match (self, other) { 133 | (Self::Null, Self::Null) => Some(Ordering::Equal), 134 | (Self::Null, _) => Some(Ordering::Less), 135 | (_, Self::Null) => Some(Ordering::Greater), 136 | (Self::Boolean(a), Self::Boolean(b)) => a.partial_cmp(b), 137 | (Self::Float(a), Self::Float(b)) => a.partial_cmp(b), 138 | (Self::Float(a), Self::Integer(b)) => a.partial_cmp(&(*b as f64)), 139 | (Self::Integer(a), Self::Float(b)) => (*a as f64).partial_cmp(b), 140 | (Self::Integer(a), Self::Integer(b)) => a.partial_cmp(b), 141 | (Self::String(a), Self::String(b)) => a.partial_cmp(b), 142 | (_, _) => None, 143 | } 144 | } 145 | } 146 | 147 | impl From for Value { 148 | fn from(v: bool) -> Self { 149 | Value::Boolean(v) 150 | } 151 | } 152 | 153 | impl From for Value { 154 | fn from(v: f64) -> Self { 155 | Value::Float(v) 156 | } 157 | } 158 | 159 | impl From for Value { 160 | fn from(v: i64) -> Self { 161 | Value::Integer(v) 162 | } 163 | } 164 | 165 | impl From for Value { 166 | fn from(v: String) -> Self { 167 | Value::String(v) 168 | } 169 | } 170 | 171 | impl From<&str> for Value { 172 | fn from(v: &str) -> Self { 173 | Value::String(v.to_owned()) 174 | } 175 | } 176 | 177 | /// A row of values 178 | pub type Row = Vec; 179 | 180 | /// A row 
iterator 181 | pub type Rows = Box> + Send>; 182 | 183 | /// A column (in a result set, see schema::Column for table columns) 184 | #[derive(Clone, Debug, PartialEq, Serialize, Deserialize)] 185 | pub struct Column { 186 | pub name: Option, 187 | } 188 | 189 | /// A set of columns 190 | pub type Columns = Vec; 191 | -------------------------------------------------------------------------------- /src/storage/bincode.rs: -------------------------------------------------------------------------------- 1 | //! Bincode is used to encode values. For details, see: 2 | //! https://github.com/bincode-org/bincode 3 | //! 4 | //! By default, the bincode::(de)serialize functions use fixed-length integer 5 | //! encoding, despite DefaultOptions using variable-length encoding. This module 6 | //! provides simple wrappers for these functions that use variable-length 7 | //! encoding and the other defaults. 8 | 9 | use crate::error::Result; 10 | 11 | use bincode::Options; 12 | use lazy_static::lazy_static; 13 | 14 | lazy_static! { 15 | /// Create a static binding for the default Bincode options. 16 | static ref BINCODE: bincode::DefaultOptions = bincode::DefaultOptions::new(); 17 | } 18 | 19 | /// Deserializes a value using Bincode. 20 | pub fn deserialize<'de, T: serde::Deserialize<'de>>(bytes: &'de [u8]) -> Result { 21 | Ok(BINCODE.deserialize(bytes)?) 22 | } 23 | 24 | /// Serializes a value using Bincode. 25 | pub fn serialize(value: &T) -> Result> { 26 | Ok(BINCODE.serialize(value)?) 27 | } 28 | -------------------------------------------------------------------------------- /src/storage/debug.rs: -------------------------------------------------------------------------------- 1 | //! Storage debug helpers, primarily formatting of raw engine data. 
2 | 3 | use std::collections::HashSet; 4 | 5 | use super::bincode; 6 | use super::mvcc::{self, TransactionState}; 7 | use crate::error::Result; 8 | 9 | /// Formats a raw byte string, either as a UTF-8 string (if valid and 10 | /// printable), otherwise hex-encoded. 11 | pub fn format_raw(v: &[u8]) -> String { 12 | if v.is_empty() { 13 | return String::from("[]"); 14 | } 15 | if let Ok(s) = String::from_utf8(v.to_vec()) { 16 | if s.chars().all(|c| !c.is_control()) { 17 | return format!(r#""{}""#, s); 18 | } 19 | } 20 | format!("0x{}", hex::encode(v)) 21 | } 22 | 23 | /// Formats a transaction state. 24 | pub fn format_txn(state: &TransactionState) -> String { 25 | format!( 26 | "v{} {} active={}", 27 | state.version, 28 | if state.read_only { "read-only" } else { "read-write" }, 29 | format_hashset(&state.active) 30 | ) 31 | } 32 | 33 | /// Formats a HashSet with sorted elements. 34 | pub fn format_hashset(set: &HashSet) -> String { 35 | let mut elements: Vec = set.iter().copied().collect(); 36 | elements.sort(); 37 | let elements: Vec = elements.into_iter().map(|v| v.to_string()).collect(); 38 | format!("{{{}}}", elements.join(",")) 39 | } 40 | 41 | /// Formats a raw engine key/value pair, or just the key if the value is None. 42 | /// Attempts to decode known MVCC key formats and values. 43 | pub fn format_key_value(key: &[u8], value: &Option>) -> (String, Option) { 44 | // Default to string/hex formatting of the raw key and value. 45 | let mut fkey = format_raw(key); 46 | let mut fvalue = value.as_ref().map(|v| format_raw(v.as_slice())); 47 | 48 | // Try to decode MVCC keys and values. 49 | if let Ok(key) = mvcc::Key::decode(key) { 50 | // Use the debug formatting of the key, unless we need more. 
51 | fkey = format!("{:?}", key); 52 | 53 | match key { 54 | mvcc::Key::NextVersion => { 55 | if let Some(ref v) = value { 56 | if let Ok(v) = bincode::deserialize::(v) { 57 | fvalue = Some(format!("{}", v)) 58 | } 59 | } 60 | } 61 | mvcc::Key::TxnActive(_) => {} 62 | mvcc::Key::TxnActiveSnapshot(_) => { 63 | if let Some(ref v) = value { 64 | if let Ok(active) = bincode::deserialize::>(v) { 65 | fvalue = Some(format_hashset(&active)); 66 | } 67 | } 68 | } 69 | mvcc::Key::TxnWrite(version, userkey) => { 70 | fkey = format!("TxnWrite({}, {})", version, format_raw(&userkey)) 71 | } 72 | mvcc::Key::Version(userkey, version) => { 73 | fkey = format!("Version({}, {})", format_raw(&userkey), version); 74 | if let Some(ref v) = value { 75 | match bincode::deserialize(v) { 76 | Ok(Some(v)) => fvalue = Some(format_raw(v)), 77 | Ok(None) => fvalue = Some(String::from("None")), 78 | Err(_) => {} 79 | } 80 | } 81 | } 82 | mvcc::Key::Unversioned(userkey) => { 83 | fkey = format!("Unversioned({})", format_raw(&userkey)); 84 | } 85 | } 86 | } 87 | 88 | (fkey, fvalue) 89 | } 90 | 91 | /// A debug storage engine, which wraps another engine and logs mutations. 92 | pub struct Engine { 93 | /// The wrapped engine. 94 | inner: E, 95 | /// Write log as key/value tuples. Value is None for deletes. 96 | write_log: Vec<(Vec, Option>)>, 97 | } 98 | 99 | impl std::fmt::Display for Engine { 100 | fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { 101 | write!(f, "debug:{}", self.inner) 102 | } 103 | } 104 | 105 | impl Engine { 106 | pub fn new(inner: E) -> Self { 107 | Self { inner, write_log: Vec::new() } 108 | } 109 | 110 | /// Returns and resets the write log. The next call only returns new writes. 
111 | pub fn take_write_log(&mut self) -> Vec<(Vec, Option>)> { 112 | let mut write_log = Vec::new(); 113 | std::mem::swap(&mut write_log, &mut self.write_log); 114 | write_log 115 | } 116 | } 117 | 118 | impl super::engine::Engine for Engine { 119 | type ScanIterator<'a> = E::ScanIterator<'a> where E: 'a; 120 | 121 | fn flush(&mut self) -> Result<()> { 122 | self.inner.flush() 123 | } 124 | 125 | fn delete(&mut self, key: &[u8]) -> Result<()> { 126 | self.inner.delete(key)?; 127 | self.write_log.push((key.to_vec(), None)); 128 | Ok(()) 129 | } 130 | 131 | fn get(&mut self, key: &[u8]) -> Result>> { 132 | self.inner.get(key) 133 | } 134 | 135 | fn scan>>(&mut self, range: R) -> Self::ScanIterator<'_> { 136 | self.inner.scan(range) 137 | } 138 | 139 | fn set(&mut self, key: &[u8], value: Vec) -> Result<()> { 140 | self.inner.set(key, value.clone())?; 141 | self.write_log.push((key.to_vec(), Some(value))); 142 | Ok(()) 143 | } 144 | 145 | fn status(&mut self) -> Result { 146 | self.inner.status() 147 | } 148 | } 149 | -------------------------------------------------------------------------------- /src/storage/engine/memory.rs: -------------------------------------------------------------------------------- 1 | use super::{Engine, Status}; 2 | use crate::error::Result; 3 | 4 | /// An in-memory key/value storage engine using the Rust standard library B-tree 5 | /// implementation. Data is not persisted. 6 | /// This engine is primarily used for testing and scenarios where persistence is not required. 7 | /// It provides a simple and fast key-value store with all data held in memory. 8 | pub struct Memory { 9 | data: std::collections::BTreeMap, Vec>, 10 | } 11 | 12 | impl Memory { 13 | /// Creates a new Memory key-value storage engine. 14 | /// Initializes an empty BTreeMap to hold the key-value data in memory. 
15 | pub fn new() -> Self { 16 | Self { data: std::collections::BTreeMap::new() } 17 | } 18 | } 19 | 20 | impl std::fmt::Display for Memory { 21 | fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { 22 | // Display the name of the engine when printed. 23 | write!(f, "memory") 24 | } 25 | } 26 | 27 | impl Engine for Memory { 28 | type ScanIterator<'a> = ScanIterator<'a>; 29 | 30 | fn flush(&mut self) -> Result<()> { 31 | // Flushing is a no-op for the in-memory engine, as there is no disk I/O. 32 | Ok(()) 33 | } 34 | 35 | fn delete(&mut self, key: &[u8]) -> Result<()> { 36 | // Remove the key from the BTreeMap, effectively deleting it from the in-memory store. 37 | self.data.remove(key); 38 | Ok(()) 39 | } 40 | 41 | fn get(&mut self, key: &[u8]) -> Result>> { 42 | // Retrieve the value associated with the key from the BTreeMap, if it exists. 43 | Ok(self.data.get(key).cloned()) 44 | } 45 | 46 | fn scan>>(&mut self, range: R) -> Self::ScanIterator<'_> { 47 | // Create an iterator that scans over the range of keys specified, using the BTreeMap's range function. 48 | ScanIterator { inner: self.data.range(range) } 49 | } 50 | 51 | fn set(&mut self, key: &[u8], value: Vec) -> Result<()> { 52 | // Insert the key-value pair into the BTreeMap, updating the value if the key already exists. 53 | self.data.insert(key.to_vec(), value); 54 | Ok(()) 55 | } 56 | 57 | fn status(&mut self) -> Result { 58 | // Generate a status report containing the engine name, number of keys, and size of all keys and values. 
59 | Ok(Status { 60 | name: self.to_string(), 61 | keys: self.data.len() as u64, 62 | size: self.data.iter().fold(0, |size, (k, v)| size + k.len() as u64 + v.len() as u64), 63 | total_disk_size: 0, 64 | live_disk_size: 0, 65 | garbage_disk_size: 0, 66 | }) 67 | } 68 | } 69 | 70 | pub struct ScanIterator<'a> { 71 | inner: std::collections::btree_map::Range<'a, Vec, Vec>, 72 | } 73 | 74 | impl<'a> ScanIterator<'a> { 75 | fn map(item: (&Vec, &Vec)) -> ::Item { 76 | // Map the key and value references to owned clones for the iterator to yield. 77 | let (key, value) = item; 78 | Ok((key.clone(), value.clone())) 79 | } 80 | } 81 | 82 | impl<'a> Iterator for ScanIterator<'a> { 83 | type Item = Result<(Vec, Vec)>; 84 | 85 | fn next(&mut self) -> Option { 86 | // Advance the iterator and return the next key-value pair, cloning the data from the BTreeMap. 87 | self.inner.next().map(Self::map) 88 | } 89 | } 90 | 91 | impl<'a> DoubleEndedIterator for ScanIterator<'a> { 92 | fn next_back(&mut self) -> Option { 93 | // Advance the iterator in reverse and return the previous key-value pair, cloning the data from the BTreeMap. 94 | self.inner.next_back().map(Self::map) 95 | } 96 | } 97 | 98 | #[cfg(test)] 99 | mod tests { 100 | // This module contains tests for the Memory storage engine. 101 | use super::*; 102 | 103 | // Run the generic engine tests using an instance of the Memory engine. 
104 | super::super::tests::test_engine!(Memory::new()); 105 | } 106 | -------------------------------------------------------------------------------- /src/storage/golden/bitcask/compact-after: -------------------------------------------------------------------------------- 1 | entry = 0, offset 0 2 | klen = 0 [0, 0, 0, 0] 3 | vlen = 0 [0, 0, 0, 0] 4 | key = "" [] 5 | value = "" [] 6 | 7 | entry = 1, offset 8 8 | klen = 1 [0, 0, 0, 1] 9 | vlen = 1 [0, 0, 0, 1] 10 | key = "a" [61] 11 | value = [1] 12 | 13 | entry = 2, offset 18 14 | klen = 1 [0, 0, 0, 1] 15 | vlen = 1 [0, 0, 0, 1] 16 | key = "b" [62] 17 | value = [2] 18 | 19 | entry = 3, offset 28 20 | klen = 1 [0, 0, 0, 1] 21 | vlen = 1 [0, 0, 0, 1] 22 | key = "c" [63] 23 | value = [3] 24 | 25 | entry = 4, offset 38 26 | klen = 1 [0, 0, 0, 1] 27 | vlen = 1 [0, 0, 0, 1] 28 | key = "d" [64] 29 | value = [4] 30 | 31 | -------------------------------------------------------------------------------- /src/storage/golden/bitcask/compact-before: -------------------------------------------------------------------------------- 1 | entry = 0, offset 0 2 | klen = 1 [0, 0, 0, 1] 3 | vlen = 1 [0, 0, 0, 1] 4 | key = "b" [62] 5 | value = [1] 6 | 7 | entry = 1, offset 10 8 | klen = 1 [0, 0, 0, 1] 9 | vlen = 1 [0, 0, 0, 1] 10 | key = "b" [62] 11 | value = [2] 12 | 13 | entry = 2, offset 20 14 | klen = 1 [0, 0, 0, 1] 15 | vlen = 1 [0, 0, 0, 1] 16 | key = "e" [65] 17 | value = [5] 18 | 19 | entry = 3, offset 30 20 | klen = 1 [0, 0, 0, 1] 21 | vlen = -1 [ff, ff, ff, ff] 22 | key = "e" [65] 23 | value = tombstone [] 24 | 25 | entry = 4, offset 39 26 | klen = 1 [0, 0, 0, 1] 27 | vlen = 1 [0, 0, 0, 1] 28 | key = "c" [63] 29 | value = [0] 30 | 31 | entry = 5, offset 49 32 | klen = 1 [0, 0, 0, 1] 33 | vlen = -1 [ff, ff, ff, ff] 34 | key = "c" [63] 35 | value = tombstone [] 36 | 37 | entry = 6, offset 58 38 | klen = 1 [0, 0, 0, 1] 39 | vlen = 1 [0, 0, 0, 1] 40 | key = "c" [63] 41 | value = [3] 42 | 43 | entry = 7, offset 68 44 | klen = 0 [0, 
0, 0, 0] 45 | vlen = 0 [0, 0, 0, 0] 46 | key = "" [] 47 | value = "" [] 48 | 49 | entry = 8, offset 76 50 | klen = 1 [0, 0, 0, 1] 51 | vlen = 1 [0, 0, 0, 1] 52 | key = "a" [61] 53 | value = [1] 54 | 55 | entry = 9, offset 86 56 | klen = 1 [0, 0, 0, 1] 57 | vlen = -1 [ff, ff, ff, ff] 58 | key = "f" [66] 59 | value = tombstone [] 60 | 61 | entry = 10, offset 95 62 | klen = 1 [0, 0, 0, 1] 63 | vlen = -1 [ff, ff, ff, ff] 64 | key = "d" [64] 65 | value = tombstone [] 66 | 67 | entry = 11, offset 104 68 | klen = 1 [0, 0, 0, 1] 69 | vlen = 1 [0, 0, 0, 1] 70 | key = "d" [64] 71 | value = [4] 72 | 73 | -------------------------------------------------------------------------------- /src/storage/golden/bitcask/log: -------------------------------------------------------------------------------- 1 | entry = 0, offset 0 2 | klen = 1 [0, 0, 0, 1] 3 | vlen = 1 [0, 0, 0, 1] 4 | key = "b" [62] 5 | value = [1] 6 | 7 | entry = 1, offset 10 8 | klen = 1 [0, 0, 0, 1] 9 | vlen = 1 [0, 0, 0, 1] 10 | key = "b" [62] 11 | value = [2] 12 | 13 | entry = 2, offset 20 14 | klen = 1 [0, 0, 0, 1] 15 | vlen = 1 [0, 0, 0, 1] 16 | key = "e" [65] 17 | value = [5] 18 | 19 | entry = 3, offset 30 20 | klen = 1 [0, 0, 0, 1] 21 | vlen = -1 [ff, ff, ff, ff] 22 | key = "e" [65] 23 | value = tombstone [] 24 | 25 | entry = 4, offset 39 26 | klen = 1 [0, 0, 0, 1] 27 | vlen = 1 [0, 0, 0, 1] 28 | key = "c" [63] 29 | value = [0] 30 | 31 | entry = 5, offset 49 32 | klen = 1 [0, 0, 0, 1] 33 | vlen = -1 [ff, ff, ff, ff] 34 | key = "c" [63] 35 | value = tombstone [] 36 | 37 | entry = 6, offset 58 38 | klen = 1 [0, 0, 0, 1] 39 | vlen = 1 [0, 0, 0, 1] 40 | key = "c" [63] 41 | value = [3] 42 | 43 | entry = 7, offset 68 44 | klen = 0 [0, 0, 0, 0] 45 | vlen = 0 [0, 0, 0, 0] 46 | key = "" [] 47 | value = "" [] 48 | 49 | entry = 8, offset 76 50 | klen = 1 [0, 0, 0, 1] 51 | vlen = 1 [0, 0, 0, 1] 52 | key = "a" [61] 53 | value = [1] 54 | 55 | entry = 9, offset 86 56 | klen = 1 [0, 0, 0, 1] 57 | vlen = -1 [ff, ff, ff, ff] 58 
| key = "f" [66] 59 | value = tombstone [] 60 | 61 | entry = 10, offset 95 62 | klen = 1 [0, 0, 0, 1] 63 | vlen = -1 [ff, ff, ff, ff] 64 | key = "d" [64] 65 | value = tombstone [] 66 | 67 | entry = 11, offset 104 68 | klen = 1 [0, 0, 0, 1] 69 | vlen = 1 [0, 0, 0, 1] 70 | key = "d" [64] 71 | value = [4] 72 | 73 | -------------------------------------------------------------------------------- /src/storage/golden/mvcc/anomaly_dirty_read: -------------------------------------------------------------------------------- 1 | T1: begin → v1 read-write active={} 2 | set NextVersion = 2 3 | set TxnActive(1) = [] 4 | 5 | T1: set "key" = 0x01 6 | set TxnWrite(1, "key") = [] 7 | set Version("key", 1) = 0x01 8 | 9 | T2: begin → v2 read-write active={1} 10 | set NextVersion = 3 11 | set TxnActiveSnapshot(2) = {1} 12 | set TxnActive(2) = [] 13 | 14 | T2: get "key" → None 15 | 16 | Engine state: 17 | NextVersion = 3 18 | TxnActive(1) = [] 19 | TxnActive(2) = [] 20 | TxnActiveSnapshot(2) = {1} 21 | TxnWrite(1, "key") = [] 22 | Version("key", 1) = 0x01 23 | -------------------------------------------------------------------------------- /src/storage/golden/mvcc/anomaly_dirty_write: -------------------------------------------------------------------------------- 1 | T1: begin → v1 read-write active={} 2 | set NextVersion = 2 3 | set TxnActive(1) = [] 4 | 5 | T1: set "key" = 0x01 6 | set TxnWrite(1, "key") = [] 7 | set Version("key", 1) = 0x01 8 | 9 | T2: begin → v2 read-write active={1} 10 | set NextVersion = 3 11 | set TxnActiveSnapshot(2) = {1} 12 | set TxnActive(2) = [] 13 | 14 | T2: set "key" = 0x02 → Error::Serialization 15 | 16 | Engine state: 17 | NextVersion = 3 18 | TxnActive(1) = [] 19 | TxnActive(2) = [] 20 | TxnActiveSnapshot(2) = {1} 21 | TxnWrite(1, "key") = [] 22 | Version("key", 1) = 0x01 23 | -------------------------------------------------------------------------------- /src/storage/golden/mvcc/anomaly_fuzzy_read: 
-------------------------------------------------------------------------------- 1 | Engine state: 2 | NextVersion = 2 3 | Version("key", 1) = 0x00 4 | 5 | T1: begin → v2 read-write active={} 6 | set NextVersion = 3 7 | set TxnActive(2) = [] 8 | 9 | T2: begin → v3 read-write active={2} 10 | set NextVersion = 4 11 | set TxnActiveSnapshot(3) = {2} 12 | set TxnActive(3) = [] 13 | 14 | T2: get "key" → 0x00 15 | 16 | T1: set "key" = "t1" 17 | set TxnWrite(2, "key") = [] 18 | set Version("key", 2) = "t1" 19 | 20 | T1: commit 21 | del TxnWrite(2, "key") 22 | del TxnActive(2) 23 | 24 | T2: get "key" → 0x00 25 | 26 | Engine state: 27 | NextVersion = 4 28 | TxnActive(3) = [] 29 | TxnActiveSnapshot(3) = {2} 30 | Version("key", 1) = 0x00 31 | Version("key", 2) = "t1" 32 | -------------------------------------------------------------------------------- /src/storage/golden/mvcc/anomaly_lost_update: -------------------------------------------------------------------------------- 1 | Engine state: 2 | NextVersion = 2 3 | Version("key", 1) = 0x00 4 | 5 | T1: begin → v2 read-write active={} 6 | set NextVersion = 3 7 | set TxnActive(2) = [] 8 | 9 | T2: begin → v3 read-write active={2} 10 | set NextVersion = 4 11 | set TxnActiveSnapshot(3) = {2} 12 | set TxnActive(3) = [] 13 | 14 | T1: get "key" → 0x00 15 | 16 | T2: get "key" → 0x00 17 | 18 | T1: set "key" = 0x01 19 | set TxnWrite(2, "key") = [] 20 | set Version("key", 2) = 0x01 21 | 22 | T2: set "key" = 0x02 → Error::Serialization 23 | 24 | T1: commit 25 | del TxnWrite(2, "key") 26 | del TxnActive(2) 27 | 28 | Engine state: 29 | NextVersion = 4 30 | TxnActive(3) = [] 31 | TxnActiveSnapshot(3) = {2} 32 | Version("key", 1) = 0x00 33 | Version("key", 2) = 0x01 34 | -------------------------------------------------------------------------------- /src/storage/golden/mvcc/anomaly_phantom_read: -------------------------------------------------------------------------------- 1 | Engine state: 2 | NextVersion = 2 3 | Version("a", 1) = 0x00 4 
| Version("ba", 1) = 0x00 5 | Version("bb", 1) = 0x00 6 | 7 | T1: begin → v2 read-write active={} 8 | set NextVersion = 3 9 | set TxnActive(2) = [] 10 | 11 | T2: begin → v3 read-write active={2} 12 | set NextVersion = 4 13 | set TxnActiveSnapshot(3) = {2} 14 | set TxnActive(3) = [] 15 | 16 | T1: scan prefix "b" 17 | "ba" = 0x00 18 | "bb" = 0x00 19 | 20 | T2: del "ba" 21 | set TxnWrite(3, "ba") = [] 22 | set Version("ba", 3) = None 23 | 24 | T2: set "bc" = 0x02 25 | set TxnWrite(3, "bc") = [] 26 | set Version("bc", 3) = 0x02 27 | 28 | T2: commit 29 | del TxnWrite(3, "ba") 30 | del TxnWrite(3, "bc") 31 | del TxnActive(3) 32 | 33 | T1: scan prefix "b" 34 | "ba" = 0x00 35 | "bb" = 0x00 36 | 37 | Engine state: 38 | NextVersion = 4 39 | TxnActive(2) = [] 40 | TxnActiveSnapshot(3) = {2} 41 | Version("a", 1) = 0x00 42 | Version("ba", 1) = 0x00 43 | Version("ba", 3) = None 44 | Version("bb", 1) = 0x00 45 | Version("bc", 3) = 0x02 46 | -------------------------------------------------------------------------------- /src/storage/golden/mvcc/anomaly_read_skew: -------------------------------------------------------------------------------- 1 | Engine state: 2 | NextVersion = 2 3 | Version("a", 1) = 0x00 4 | Version("b", 1) = 0x00 5 | 6 | T1: begin → v2 read-write active={} 7 | set NextVersion = 3 8 | set TxnActive(2) = [] 9 | 10 | T2: begin → v3 read-write active={2} 11 | set NextVersion = 4 12 | set TxnActiveSnapshot(3) = {2} 13 | set TxnActive(3) = [] 14 | 15 | T1: get "a" → 0x00 16 | 17 | T2: set "a" = 0x02 18 | set TxnWrite(3, "a") = [] 19 | set Version("a", 3) = 0x02 20 | 21 | T2: set "b" = 0x02 22 | set TxnWrite(3, "b") = [] 23 | set Version("b", 3) = 0x02 24 | 25 | T2: commit 26 | del TxnWrite(3, "a") 27 | del TxnWrite(3, "b") 28 | del TxnActive(3) 29 | 30 | T1: get "a" → 0x00 31 | 32 | Engine state: 33 | NextVersion = 4 34 | TxnActive(2) = [] 35 | TxnActiveSnapshot(3) = {2} 36 | Version("a", 1) = 0x00 37 | Version("a", 3) = 0x02 38 | Version("b", 1) = 0x00 39 | 
Version("b", 3) = 0x02 40 | -------------------------------------------------------------------------------- /src/storage/golden/mvcc/anomaly_write_skew: -------------------------------------------------------------------------------- 1 | Engine state: 2 | NextVersion = 2 3 | Version("a", 1) = 0x01 4 | Version("b", 1) = 0x02 5 | 6 | T1: begin → v2 read-write active={} 7 | set NextVersion = 3 8 | set TxnActive(2) = [] 9 | 10 | T2: begin → v3 read-write active={2} 11 | set NextVersion = 4 12 | set TxnActiveSnapshot(3) = {2} 13 | set TxnActive(3) = [] 14 | 15 | T1: get "a" → 0x01 16 | 17 | T2: get "b" → 0x02 18 | 19 | T1: set "b" = 0x01 20 | set TxnWrite(2, "b") = [] 21 | set Version("b", 2) = 0x01 22 | 23 | T2: set "a" = 0x02 24 | set TxnWrite(3, "a") = [] 25 | set Version("a", 3) = 0x02 26 | 27 | T1: commit 28 | del TxnWrite(2, "b") 29 | del TxnActive(2) 30 | 31 | T2: commit 32 | del TxnWrite(3, "a") 33 | del TxnActive(3) 34 | 35 | Engine state: 36 | NextVersion = 4 37 | TxnActiveSnapshot(3) = {2} 38 | Version("a", 1) = 0x01 39 | Version("a", 3) = 0x02 40 | Version("b", 1) = 0x02 41 | Version("b", 2) = 0x01 42 | -------------------------------------------------------------------------------- /src/storage/golden/mvcc/begin: -------------------------------------------------------------------------------- 1 | T1: begin → v1 read-write active={} 2 | set NextVersion = 2 3 | set TxnActive(1) = [] 4 | 5 | T2: begin → v2 read-write active={1} 6 | set NextVersion = 3 7 | set TxnActiveSnapshot(2) = {1} 8 | set TxnActive(2) = [] 9 | 10 | T3: begin → v3 read-write active={1,2} 11 | set NextVersion = 4 12 | set TxnActiveSnapshot(3) = {1,2} 13 | set TxnActive(3) = [] 14 | 15 | T2: commit 16 | del TxnActive(2) 17 | 18 | T4: begin → v4 read-write active={1,3} 19 | set NextVersion = 5 20 | set TxnActiveSnapshot(4) = {1,3} 21 | set TxnActive(4) = [] 22 | 23 | Engine state: 24 | NextVersion = 5 25 | TxnActive(1) = [] 26 | TxnActive(3) = [] 27 | TxnActive(4) = [] 28 | 
TxnActiveSnapshot(2) = {1} 29 | TxnActiveSnapshot(3) = {1,2} 30 | TxnActiveSnapshot(4) = {1,3} 31 | -------------------------------------------------------------------------------- /src/storage/golden/mvcc/begin_as_of: -------------------------------------------------------------------------------- 1 | T1: begin → v1 read-write active={} 2 | set NextVersion = 2 3 | set TxnActive(1) = [] 4 | 5 | T1: set "other" = 0x01 6 | set TxnWrite(1, "other") = [] 7 | set Version("other", 1) = 0x01 8 | 9 | T2: begin → v2 read-write active={1} 10 | set NextVersion = 3 11 | set TxnActiveSnapshot(2) = {1} 12 | set TxnActive(2) = [] 13 | 14 | T2: set "key" = 0x02 15 | set TxnWrite(2, "key") = [] 16 | set Version("key", 2) = 0x02 17 | 18 | T2: commit 19 | del TxnWrite(2, "key") 20 | del TxnActive(2) 21 | 22 | T3: begin → v3 read-write active={1} 23 | set NextVersion = 4 24 | set TxnActiveSnapshot(3) = {1} 25 | set TxnActive(3) = [] 26 | 27 | T3: set "key" = 0x03 28 | set TxnWrite(3, "key") = [] 29 | set Version("key", 3) = 0x03 30 | 31 | T4: begin as of 3 → v3 read-only active={1} 32 | 33 | T4: scan .. 34 | "key" = 0x02 35 | 36 | T4: set "foo" = 0x01 → Error::ReadOnly 37 | 38 | T4: del "foo" → Error::ReadOnly 39 | 40 | T1: commit 41 | del TxnWrite(1, "other") 42 | del TxnActive(1) 43 | 44 | T3: commit 45 | del TxnWrite(3, "key") 46 | del TxnActive(3) 47 | 48 | T4: scan .. 49 | "key" = 0x02 50 | 51 | T5: begin as of 3 → v3 read-only active={1} 52 | 53 | T5: scan .. 54 | "key" = 0x02 55 | 56 | T4: rollback 57 | 58 | T5: commit 59 | 60 | T6: begin → v4 read-write active={} 61 | set NextVersion = 5 62 | set TxnActive(4) = [] 63 | 64 | T6: set "key" = 0x04 65 | set TxnWrite(4, "key") = [] 66 | set Version("key", 4) = 0x04 67 | 68 | T6: commit 69 | del TxnWrite(4, "key") 70 | del TxnActive(4) 71 | 72 | T7: begin as of 4 → v4 read-only active={} 73 | 74 | T7: scan .. 
75 | "key" = 0x03 76 | "other" = 0x01 77 | 78 | T8: begin as of 5 → Error::Value("Version 5 does not exist") 79 | 80 | T9: begin as of 9 → Error::Value("Version 9 does not exist") 81 | 82 | Engine state: 83 | NextVersion = 5 84 | TxnActiveSnapshot(2) = {1} 85 | TxnActiveSnapshot(3) = {1} 86 | Version("key", 2) = 0x02 87 | Version("key", 3) = 0x03 88 | Version("key", 4) = 0x04 89 | Version("other", 1) = 0x01 90 | -------------------------------------------------------------------------------- /src/storage/golden/mvcc/begin_read_only: -------------------------------------------------------------------------------- 1 | T1: begin read-only → v1 read-only active={} 2 | 3 | T1: set "foo" = 0x01 → Error::ReadOnly 4 | 5 | T1: del "foo" → Error::ReadOnly 6 | 7 | T2: begin → v1 read-write active={} 8 | set NextVersion = 2 9 | set TxnActive(1) = [] 10 | 11 | T3: begin read-only → v2 read-only active={1} 12 | 13 | Engine state: 14 | NextVersion = 2 15 | TxnActive(1) = [] 16 | -------------------------------------------------------------------------------- /src/storage/golden/mvcc/delete: -------------------------------------------------------------------------------- 1 | Engine state: 2 | NextVersion = 2 3 | Version("key", 1) = 0x01 4 | Version("tombstone", 1) = None 5 | 6 | T1: begin → v2 read-write active={} 7 | set NextVersion = 3 8 | set TxnActive(2) = [] 9 | 10 | T1: set "key" = 0x02 11 | set TxnWrite(2, "key") = [] 12 | set Version("key", 2) = 0x02 13 | 14 | T1: del "key" 15 | set TxnWrite(2, "key") = [] 16 | set Version("key", 2) = None 17 | 18 | T1: del "key" 19 | set TxnWrite(2, "key") = [] 20 | set Version("key", 2) = None 21 | 22 | T1: del "tombstone" 23 | set TxnWrite(2, "tombstone") = [] 24 | set Version("tombstone", 2) = None 25 | 26 | T1: del "missing" 27 | set TxnWrite(2, "missing") = [] 28 | set Version("missing", 2) = None 29 | 30 | T1: commit 31 | del TxnWrite(2, "key") 32 | del TxnWrite(2, "missing") 33 | del TxnWrite(2, "tombstone") 34 | del TxnActive(2) 
35 | 36 | Engine state: 37 | NextVersion = 3 38 | Version("key", 1) = 0x01 39 | Version("key", 2) = None 40 | Version("missing", 2) = None 41 | Version("tombstone", 1) = None 42 | Version("tombstone", 2) = None 43 | -------------------------------------------------------------------------------- /src/storage/golden/mvcc/delete_conflict: -------------------------------------------------------------------------------- 1 | T1: begin → v1 read-write active={} 2 | set NextVersion = 2 3 | set TxnActive(1) = [] 4 | 5 | T2: begin → v2 read-write active={1} 6 | set NextVersion = 3 7 | set TxnActiveSnapshot(2) = {1} 8 | set TxnActive(2) = [] 9 | 10 | T3: begin → v3 read-write active={1,2} 11 | set NextVersion = 4 12 | set TxnActiveSnapshot(3) = {1,2} 13 | set TxnActive(3) = [] 14 | 15 | T4: begin → v4 read-write active={1,2,3} 16 | set NextVersion = 5 17 | set TxnActiveSnapshot(4) = {1,2,3} 18 | set TxnActive(4) = [] 19 | 20 | T1: set "a" = 0x01 21 | set TxnWrite(1, "a") = [] 22 | set Version("a", 1) = 0x01 23 | 24 | T3: set "c" = 0x03 25 | set TxnWrite(3, "c") = [] 26 | set Version("c", 3) = 0x03 27 | 28 | T4: set "d" = 0x04 29 | set TxnWrite(4, "d") = [] 30 | set Version("d", 4) = 0x04 31 | 32 | T4: commit 33 | del TxnWrite(4, "d") 34 | del TxnActive(4) 35 | 36 | T2: del "a" → Error::Serialization 37 | 38 | T2: del "c" → Error::Serialization 39 | 40 | T2: del "d" → Error::Serialization 41 | 42 | Engine state: 43 | NextVersion = 5 44 | TxnActive(1) = [] 45 | TxnActive(2) = [] 46 | TxnActive(3) = [] 47 | TxnActiveSnapshot(2) = {1} 48 | TxnActiveSnapshot(3) = {1,2} 49 | TxnActiveSnapshot(4) = {1,2,3} 50 | TxnWrite(1, "a") = [] 51 | TxnWrite(3, "c") = [] 52 | Version("a", 1) = 0x01 53 | Version("c", 3) = 0x03 54 | Version("d", 4) = 0x04 55 | -------------------------------------------------------------------------------- /src/storage/golden/mvcc/get: -------------------------------------------------------------------------------- 1 | Engine state: 2 | NextVersion = 3 3 | 
Version("deleted", 1) = 0x01 4 | Version("deleted", 2) = None 5 | Version("key", 1) = 0x01 6 | Version("tombstone", 1) = None 7 | Version("updated", 1) = 0x01 8 | Version("updated", 2) = 0x02 9 | 10 | T1: begin read-only → v3 read-only active={} 11 | 12 | T1: get "key" → 0x01 13 | 14 | T1: get "updated" → 0x02 15 | 16 | T1: get "deleted" → None 17 | 18 | T1: get "tombstone" → None 19 | 20 | Engine state: 21 | NextVersion = 3 22 | Version("deleted", 1) = 0x01 23 | Version("deleted", 2) = None 24 | Version("key", 1) = 0x01 25 | Version("tombstone", 1) = None 26 | Version("updated", 1) = 0x01 27 | Version("updated", 2) = 0x02 28 | -------------------------------------------------------------------------------- /src/storage/golden/mvcc/get_isolation: -------------------------------------------------------------------------------- 1 | T1: begin → v1 read-write active={} 2 | set NextVersion = 2 3 | set TxnActive(1) = [] 4 | 5 | T1: set "a" = 0x01 6 | set TxnWrite(1, "a") = [] 7 | set Version("a", 1) = 0x01 8 | 9 | T1: set "b" = 0x01 10 | set TxnWrite(1, "b") = [] 11 | set Version("b", 1) = 0x01 12 | 13 | T1: set "d" = 0x01 14 | set TxnWrite(1, "d") = [] 15 | set Version("d", 1) = 0x01 16 | 17 | T1: set "e" = 0x01 18 | set TxnWrite(1, "e") = [] 19 | set Version("e", 1) = 0x01 20 | 21 | T1: commit 22 | del TxnWrite(1, "a") 23 | del TxnWrite(1, "b") 24 | del TxnWrite(1, "d") 25 | del TxnWrite(1, "e") 26 | del TxnActive(1) 27 | 28 | T2: begin → v2 read-write active={} 29 | set NextVersion = 3 30 | set TxnActive(2) = [] 31 | 32 | T2: set "a" = 0x02 33 | set TxnWrite(2, "a") = [] 34 | set Version("a", 2) = 0x02 35 | 36 | T2: del "b" 37 | set TxnWrite(2, "b") = [] 38 | set Version("b", 2) = None 39 | 40 | T2: set "c" = 0x02 41 | set TxnWrite(2, "c") = [] 42 | set Version("c", 2) = 0x02 43 | 44 | T3: begin read-only → v3 read-only active={2} 45 | 46 | T4: begin → v3 read-write active={2} 47 | set NextVersion = 4 48 | set TxnActiveSnapshot(3) = {2} 49 | set TxnActive(3) = [] 50 | 
51 | T4: set "d" = 0x03 52 | set TxnWrite(3, "d") = [] 53 | set Version("d", 3) = 0x03 54 | 55 | T4: del "e" 56 | set TxnWrite(3, "e") = [] 57 | set Version("e", 3) = None 58 | 59 | T4: set "f" = 0x03 60 | set TxnWrite(3, "f") = [] 61 | set Version("f", 3) = 0x03 62 | 63 | T4: commit 64 | del TxnWrite(3, "d") 65 | del TxnWrite(3, "e") 66 | del TxnWrite(3, "f") 67 | del TxnActive(3) 68 | 69 | T3: get "a" → 0x01 70 | 71 | T3: get "b" → 0x01 72 | 73 | T3: get "c" → None 74 | 75 | T3: get "d" → 0x01 76 | 77 | T3: get "e" → 0x01 78 | 79 | T3: get "f" → None 80 | 81 | Engine state: 82 | NextVersion = 4 83 | TxnActive(2) = [] 84 | TxnActiveSnapshot(3) = {2} 85 | TxnWrite(2, "a") = [] 86 | TxnWrite(2, "b") = [] 87 | TxnWrite(2, "c") = [] 88 | Version("a", 1) = 0x01 89 | Version("a", 2) = 0x02 90 | Version("b", 1) = 0x01 91 | Version("b", 2) = None 92 | Version("c", 2) = 0x02 93 | Version("d", 1) = 0x01 94 | Version("d", 3) = 0x03 95 | Version("e", 1) = 0x01 96 | Version("e", 3) = None 97 | Version("f", 3) = 0x03 98 | -------------------------------------------------------------------------------- /src/storage/golden/mvcc/resume: -------------------------------------------------------------------------------- 1 | T1: begin → v1 read-write active={} 2 | set NextVersion = 2 3 | set TxnActive(1) = [] 4 | 5 | T1: set "a" = 0x01 6 | set TxnWrite(1, "a") = [] 7 | set Version("a", 1) = 0x01 8 | 9 | T1: set "b" = 0x01 10 | set TxnWrite(1, "b") = [] 11 | set Version("b", 1) = 0x01 12 | 13 | T1: commit 14 | del TxnWrite(1, "a") 15 | del TxnWrite(1, "b") 16 | del TxnActive(1) 17 | 18 | T2: begin → v2 read-write active={} 19 | set NextVersion = 3 20 | set TxnActive(2) = [] 21 | 22 | T3: begin → v3 read-write active={2} 23 | set NextVersion = 4 24 | set TxnActiveSnapshot(3) = {2} 25 | set TxnActive(3) = [] 26 | 27 | T4: begin → v4 read-write active={2,3} 28 | set NextVersion = 5 29 | set TxnActiveSnapshot(4) = {2,3} 30 | set TxnActive(4) = [] 31 | 32 | T2: set "a" = 0x02 33 | set 
TxnWrite(2, "a") = [] 34 | set Version("a", 2) = 0x02 35 | 36 | T3: set "b" = 0x03 37 | set TxnWrite(3, "b") = [] 38 | set Version("b", 3) = 0x03 39 | 40 | T4: set "c" = 0x04 41 | set TxnWrite(4, "c") = [] 42 | set Version("c", 4) = 0x04 43 | 44 | T2: commit 45 | del TxnWrite(2, "a") 46 | del TxnActive(2) 47 | 48 | T4: commit 49 | del TxnWrite(4, "c") 50 | del TxnActive(4) 51 | 52 | T5: resume → v3 read-write active={2} 53 | 54 | T5: scan .. 55 | "a" = 0x01 56 | "b" = 0x03 57 | 58 | T6: begin → v5 read-write active={3} 59 | set NextVersion = 6 60 | set TxnActiveSnapshot(5) = {3} 61 | set TxnActive(5) = [] 62 | 63 | T6: scan .. 64 | "a" = 0x02 65 | "b" = 0x01 66 | "c" = 0x04 67 | 68 | T6: rollback 69 | del TxnActive(5) 70 | 71 | T5: commit 72 | del TxnWrite(3, "b") 73 | del TxnActive(3) 74 | 75 | T7: begin → v6 read-write active={} 76 | set NextVersion = 7 77 | set TxnActive(6) = [] 78 | 79 | T7: scan .. 80 | "a" = 0x02 81 | "b" = 0x03 82 | "c" = 0x04 83 | 84 | T7: rollback 85 | del TxnActive(6) 86 | 87 | T8: resume → Error::Internal("No active transaction at version 3") 88 | 89 | T9: begin as of 3 → v3 read-only active={2} 90 | 91 | T9: scan .. 92 | "a" = 0x01 93 | "b" = 0x01 94 | 95 | T10: resume → v3 read-only active={2} 96 | 97 | T10: scan .. 
98 | "a" = 0x01 99 | "b" = 0x01 100 | 101 | Engine state: 102 | NextVersion = 7 103 | TxnActiveSnapshot(3) = {2} 104 | TxnActiveSnapshot(4) = {2,3} 105 | TxnActiveSnapshot(5) = {3} 106 | Version("a", 1) = 0x01 107 | Version("a", 2) = 0x02 108 | Version("b", 1) = 0x01 109 | Version("b", 3) = 0x03 110 | Version("c", 4) = 0x04 111 | -------------------------------------------------------------------------------- /src/storage/golden/mvcc/rollback: -------------------------------------------------------------------------------- 1 | Engine state: 2 | NextVersion = 2 3 | Version("a", 1) = 0x00 4 | Version("b", 1) = 0x00 5 | Version("c", 1) = 0x00 6 | Version("d", 1) = 0x00 7 | 8 | T1: begin → v2 read-write active={} 9 | set NextVersion = 3 10 | set TxnActive(2) = [] 11 | 12 | T2: begin → v3 read-write active={2} 13 | set NextVersion = 4 14 | set TxnActiveSnapshot(3) = {2} 15 | set TxnActive(3) = [] 16 | 17 | T3: begin → v4 read-write active={2,3} 18 | set NextVersion = 5 19 | set TxnActiveSnapshot(4) = {2,3} 20 | set TxnActive(4) = [] 21 | 22 | T1: set "a" = 0x01 23 | set TxnWrite(2, "a") = [] 24 | set Version("a", 2) = 0x01 25 | 26 | T2: set "b" = 0x02 27 | set TxnWrite(3, "b") = [] 28 | set Version("b", 3) = 0x02 29 | 30 | T2: del "c" 31 | set TxnWrite(3, "c") = [] 32 | set Version("c", 3) = None 33 | 34 | T3: set "d" = 0x03 35 | set TxnWrite(4, "d") = [] 36 | set Version("d", 4) = 0x03 37 | 38 | T1: set "b" = 0x01 → Error::Serialization 39 | 40 | T3: set "c" = 0x03 → Error::Serialization 41 | 42 | T2: rollback 43 | del Version("b", 3) 44 | del TxnWrite(3, "b") 45 | del Version("c", 3) 46 | del TxnWrite(3, "c") 47 | del TxnActive(3) 48 | 49 | T4: begin read-only → v5 read-only active={2,4} 50 | 51 | T4: scan .. 
52 | "a" = 0x00 53 | "b" = 0x00 54 | "c" = 0x00 55 | "d" = 0x00 56 | 57 | T1: set "b" = 0x01 58 | set TxnWrite(2, "b") = [] 59 | set Version("b", 2) = 0x01 60 | 61 | T3: set "c" = 0x03 62 | set TxnWrite(4, "c") = [] 63 | set Version("c", 4) = 0x03 64 | 65 | T1: commit 66 | del TxnWrite(2, "a") 67 | del TxnWrite(2, "b") 68 | del TxnActive(2) 69 | 70 | T3: commit 71 | del TxnWrite(4, "c") 72 | del TxnWrite(4, "d") 73 | del TxnActive(4) 74 | 75 | T5: begin read-only → v5 read-only active={} 76 | 77 | T5: scan .. 78 | "a" = 0x01 79 | "b" = 0x01 80 | "c" = 0x03 81 | "d" = 0x03 82 | 83 | Engine state: 84 | NextVersion = 5 85 | TxnActiveSnapshot(3) = {2} 86 | TxnActiveSnapshot(4) = {2,3} 87 | Version("a", 1) = 0x00 88 | Version("a", 2) = 0x01 89 | Version("b", 1) = 0x00 90 | Version("b", 2) = 0x01 91 | Version("c", 1) = 0x00 92 | Version("c", 4) = 0x03 93 | Version("d", 1) = 0x00 94 | Version("d", 4) = 0x03 95 | -------------------------------------------------------------------------------- /src/storage/golden/mvcc/scan: -------------------------------------------------------------------------------- 1 | Engine state: 2 | NextVersion = 5 3 | Version("B", 1) = 0x0001 4 | Version("B", 3) = None 5 | Version("a", 1) = 0x0a01 6 | Version("a", 2) = None 7 | Version("a", 3) = 0x0a03 8 | Version("b", 1) = None 9 | Version("b", 3) = 0x0b03 10 | Version("b", 4) = None 11 | Version("ba", 2) = 0xba02 12 | Version("ba", 4) = 0xba04 13 | Version("bb", 2) = 0xbb02 14 | Version("bb", 3) = None 15 | Version("bc", 2) = 0xbc02 16 | Version("c", 1) = 0x0c01 17 | 18 | T1: begin as of 1 → v1 read-only active={} 19 | 20 | T1: scan .. 21 | 22 | T2: begin as of 2 → v2 read-only active={} 23 | 24 | T2: scan .. 25 | "B" = 0x0001 26 | "a" = 0x0a01 27 | "c" = 0x0c01 28 | 29 | T3: begin as of 3 → v3 read-only active={} 30 | 31 | T3: scan .. 
32 | "B" = 0x0001 33 | "ba" = 0xba02 34 | "bb" = 0xbb02 35 | "bc" = 0xbc02 36 | "c" = 0x0c01 37 | 38 | T4: begin as of 4 → v4 read-only active={} 39 | 40 | T4: scan .. 41 | "a" = 0x0a03 42 | "b" = 0x0b03 43 | "ba" = 0xba02 44 | "bc" = 0xbc02 45 | "c" = 0x0c01 46 | 47 | T5: begin as of 3 → v3 read-only active={} 48 | 49 | T5: scan .. 50 | "B" = 0x0001 51 | "ba" = 0xba02 52 | "bb" = 0xbb02 53 | "bc" = 0xbc02 54 | "c" = 0x0c01 55 | 56 | T5: scan .."bc"] 57 | "B" = 0x0001 58 | "ba" = 0xba02 59 | "bb" = 0xbb02 60 | "bc" = 0xbc02 61 | 62 | T5: scan .."bc") 63 | "B" = 0x0001 64 | "ba" = 0xba02 65 | "bb" = 0xbb02 66 | 67 | T5: scan ["ba".. 68 | "ba" = 0xba02 69 | "bb" = 0xbb02 70 | "bc" = 0xbc02 71 | "c" = 0x0c01 72 | 73 | T5: scan ["ba".."bc"] 74 | "ba" = 0xba02 75 | "bb" = 0xbb02 76 | "bc" = 0xbc02 77 | 78 | T5: scan ["ba".."bc") 79 | "ba" = 0xba02 80 | "bb" = 0xbb02 81 | 82 | T5: scan ("ba".. 83 | "bb" = 0xbb02 84 | "bc" = 0xbc02 85 | "c" = 0x0c01 86 | 87 | T5: scan ("ba".."bc"] 88 | "bb" = 0xbb02 89 | "bc" = 0xbc02 90 | 91 | T5: scan ("ba".."bc") 92 | "bb" = 0xbb02 93 | 94 | Engine state: 95 | NextVersion = 5 96 | Version("B", 1) = 0x0001 97 | Version("B", 3) = None 98 | Version("a", 1) = 0x0a01 99 | Version("a", 2) = None 100 | Version("a", 3) = 0x0a03 101 | Version("b", 1) = None 102 | Version("b", 3) = 0x0b03 103 | Version("b", 4) = None 104 | Version("ba", 2) = 0xba02 105 | Version("ba", 4) = 0xba04 106 | Version("bb", 2) = 0xbb02 107 | Version("bb", 3) = None 108 | Version("bc", 2) = 0xbc02 109 | Version("c", 1) = 0x0c01 110 | -------------------------------------------------------------------------------- /src/storage/golden/mvcc/scan_isolation: -------------------------------------------------------------------------------- 1 | T1: begin → v1 read-write active={} 2 | set NextVersion = 2 3 | set TxnActive(1) = [] 4 | 5 | T1: set "a" = 0x01 6 | set TxnWrite(1, "a") = [] 7 | set Version("a", 1) = 0x01 8 | 9 | T1: set "b" = 0x01 10 | set TxnWrite(1, "b") = [] 11 | 
set Version("b", 1) = 0x01 12 | 13 | T1: set "d" = 0x01 14 | set TxnWrite(1, "d") = [] 15 | set Version("d", 1) = 0x01 16 | 17 | T1: set "e" = 0x01 18 | set TxnWrite(1, "e") = [] 19 | set Version("e", 1) = 0x01 20 | 21 | T1: commit 22 | del TxnWrite(1, "a") 23 | del TxnWrite(1, "b") 24 | del TxnWrite(1, "d") 25 | del TxnWrite(1, "e") 26 | del TxnActive(1) 27 | 28 | T2: begin → v2 read-write active={} 29 | set NextVersion = 3 30 | set TxnActive(2) = [] 31 | 32 | T2: set "a" = 0x02 33 | set TxnWrite(2, "a") = [] 34 | set Version("a", 2) = 0x02 35 | 36 | T2: del "b" 37 | set TxnWrite(2, "b") = [] 38 | set Version("b", 2) = None 39 | 40 | T2: set "c" = 0x02 41 | set TxnWrite(2, "c") = [] 42 | set Version("c", 2) = 0x02 43 | 44 | T3: begin read-only → v3 read-only active={2} 45 | 46 | T4: begin → v3 read-write active={2} 47 | set NextVersion = 4 48 | set TxnActiveSnapshot(3) = {2} 49 | set TxnActive(3) = [] 50 | 51 | T4: set "d" = 0x03 52 | set TxnWrite(3, "d") = [] 53 | set Version("d", 3) = 0x03 54 | 55 | T4: del "e" 56 | set TxnWrite(3, "e") = [] 57 | set Version("e", 3) = None 58 | 59 | T4: set "f" = 0x03 60 | set TxnWrite(3, "f") = [] 61 | set Version("f", 3) = 0x03 62 | 63 | T4: commit 64 | del TxnWrite(3, "d") 65 | del TxnWrite(3, "e") 66 | del TxnWrite(3, "f") 67 | del TxnActive(3) 68 | 69 | T3: scan .. 
70 | "a" = 0x01 71 | "b" = 0x01 72 | "d" = 0x01 73 | "e" = 0x01 74 | 75 | Engine state: 76 | NextVersion = 4 77 | TxnActive(2) = [] 78 | TxnActiveSnapshot(3) = {2} 79 | TxnWrite(2, "a") = [] 80 | TxnWrite(2, "b") = [] 81 | TxnWrite(2, "c") = [] 82 | Version("a", 1) = 0x01 83 | Version("a", 2) = 0x02 84 | Version("b", 1) = 0x01 85 | Version("b", 2) = None 86 | Version("c", 2) = 0x02 87 | Version("d", 1) = 0x01 88 | Version("d", 3) = 0x03 89 | Version("e", 1) = 0x01 90 | Version("e", 3) = None 91 | Version("f", 3) = 0x03 92 | -------------------------------------------------------------------------------- /src/storage/golden/mvcc/scan_key_version_encoding: -------------------------------------------------------------------------------- 1 | T1: begin → v1 read-write active={} 2 | set NextVersion = 2 3 | set TxnActive(1) = [] 4 | 5 | T1: set 0x00 = 0x01 6 | set TxnWrite(1, 0x00) = [] 7 | set Version(0x00, 1) = 0x01 8 | 9 | T1: commit 10 | del TxnWrite(1, 0x00) 11 | del TxnActive(1) 12 | 13 | T2: begin → v2 read-write active={} 14 | set NextVersion = 3 15 | set TxnActive(2) = [] 16 | 17 | T2: set 0x00 = 0x02 18 | set TxnWrite(2, 0x00) = [] 19 | set Version(0x00, 2) = 0x02 20 | 21 | T2: set 0x000000000000000002 = 0x02 22 | set TxnWrite(2, 0x000000000000000002) = [] 23 | set Version(0x000000000000000002, 2) = 0x02 24 | 25 | T2: commit 26 | del TxnWrite(2, 0x00) 27 | del TxnWrite(2, 0x000000000000000002) 28 | del TxnActive(2) 29 | 30 | T3: begin → v3 read-write active={} 31 | set NextVersion = 4 32 | set TxnActive(3) = [] 33 | 34 | T3: set 0x00 = 0x03 35 | set TxnWrite(3, 0x00) = [] 36 | set Version(0x00, 3) = 0x03 37 | 38 | T3: commit 39 | del TxnWrite(3, 0x00) 40 | del TxnActive(3) 41 | 42 | T4: begin read-only → v4 read-only active={} 43 | 44 | T4: scan .. 
45 | 0x00 = 0x03 46 | 0x000000000000000002 = 0x02 47 | 48 | Engine state: 49 | NextVersion = 4 50 | Version(0x00, 1) = 0x01 51 | Version(0x00, 2) = 0x02 52 | Version(0x00, 3) = 0x03 53 | Version(0x000000000000000002, 2) = 0x02 54 | -------------------------------------------------------------------------------- /src/storage/golden/mvcc/scan_prefix: -------------------------------------------------------------------------------- 1 | Engine state: 2 | NextVersion = 5 3 | Version("B", 1) = 0x0001 4 | Version("B", 3) = None 5 | Version("a", 1) = 0x0a01 6 | Version("a", 2) = None 7 | Version("a", 3) = 0x0a03 8 | Version("b", 1) = None 9 | Version("b", 3) = 0x0b03 10 | Version("b", 4) = None 11 | Version("ba", 2) = 0xba02 12 | Version("ba", 4) = 0xba04 13 | Version("bb", 2) = 0xbb02 14 | Version("bb", 3) = None 15 | Version("bc", 2) = 0xbc02 16 | Version("c", 1) = 0x0c01 17 | 18 | T1: begin as of 1 → v1 read-only active={} 19 | 20 | T1: scan prefix [] 21 | 22 | T2: begin as of 2 → v2 read-only active={} 23 | 24 | T2: scan prefix [] 25 | "B" = 0x0001 26 | "a" = 0x0a01 27 | "c" = 0x0c01 28 | 29 | T3: begin as of 3 → v3 read-only active={} 30 | 31 | T3: scan prefix [] 32 | "B" = 0x0001 33 | "ba" = 0xba02 34 | "bb" = 0xbb02 35 | "bc" = 0xbc02 36 | "c" = 0x0c01 37 | 38 | T4: begin as of 4 → v4 read-only active={} 39 | 40 | T4: scan prefix [] 41 | "a" = 0x0a03 42 | "b" = 0x0b03 43 | "ba" = 0xba02 44 | "bc" = 0xbc02 45 | "c" = 0x0c01 46 | 47 | T5: begin as of 3 → v3 read-only active={} 48 | 49 | T5: scan prefix "B" 50 | "B" = 0x0001 51 | 52 | T5: scan prefix "a" 53 | 54 | T5: scan prefix "b" 55 | "ba" = 0xba02 56 | "bb" = 0xbb02 57 | "bc" = 0xbc02 58 | 59 | T5: scan prefix "ba" 60 | "ba" = 0xba02 61 | 62 | T5: scan prefix "bb" 63 | "bb" = 0xbb02 64 | 65 | T5: scan prefix "bbb" 66 | 67 | T5: scan prefix "bc" 68 | "bc" = 0xbc02 69 | 70 | T5: scan prefix "c" 71 | "c" = 0x0c01 72 | 73 | T5: scan prefix "d" 74 | 75 | T6: begin as of 4 → v4 read-only active={} 76 | 77 | T6: scan 
prefix "B" 78 | 79 | T6: scan prefix "a" 80 | "a" = 0x0a03 81 | 82 | T6: scan prefix "b" 83 | "b" = 0x0b03 84 | "ba" = 0xba02 85 | "bc" = 0xbc02 86 | 87 | T6: scan prefix "ba" 88 | "ba" = 0xba02 89 | 90 | T6: scan prefix "bb" 91 | 92 | T6: scan prefix "bbb" 93 | 94 | T6: scan prefix "bc" 95 | "bc" = 0xbc02 96 | 97 | T6: scan prefix "c" 98 | "c" = 0x0c01 99 | 100 | T6: scan prefix "d" 101 | 102 | Engine state: 103 | NextVersion = 5 104 | Version("B", 1) = 0x0001 105 | Version("B", 3) = None 106 | Version("a", 1) = 0x0a01 107 | Version("a", 2) = None 108 | Version("a", 3) = 0x0a03 109 | Version("b", 1) = None 110 | Version("b", 3) = 0x0b03 111 | Version("b", 4) = None 112 | Version("ba", 2) = 0xba02 113 | Version("ba", 4) = 0xba04 114 | Version("bb", 2) = 0xbb02 115 | Version("bb", 3) = None 116 | Version("bc", 2) = 0xbc02 117 | Version("c", 1) = 0x0c01 118 | -------------------------------------------------------------------------------- /src/storage/golden/mvcc/set: -------------------------------------------------------------------------------- 1 | Engine state: 2 | NextVersion = 2 3 | Version("key", 1) = 0x01 4 | Version("tombstone", 1) = None 5 | 6 | T1: begin → v2 read-write active={} 7 | set NextVersion = 3 8 | set TxnActive(2) = [] 9 | 10 | T1: set "key" = 0x02 11 | set TxnWrite(2, "key") = [] 12 | set Version("key", 2) = 0x02 13 | 14 | T1: set "tombstone" = 0x02 15 | set TxnWrite(2, "tombstone") = [] 16 | set Version("tombstone", 2) = 0x02 17 | 18 | T1: set "new" = 0x01 19 | set TxnWrite(2, "new") = [] 20 | set Version("new", 2) = 0x01 21 | 22 | T1: set "new" = 0x01 23 | set TxnWrite(2, "new") = [] 24 | set Version("new", 2) = 0x01 25 | 26 | T1: set "new" = 0x02 27 | set TxnWrite(2, "new") = [] 28 | set Version("new", 2) = 0x02 29 | 30 | T1: commit 31 | del TxnWrite(2, "key") 32 | del TxnWrite(2, "new") 33 | del TxnWrite(2, "tombstone") 34 | del TxnActive(2) 35 | 36 | Engine state: 37 | NextVersion = 3 38 | Version("key", 1) = 0x01 39 | Version("key", 2) = 
0x02 40 | Version("new", 2) = 0x02 41 | Version("tombstone", 1) = None 42 | Version("tombstone", 2) = 0x02 43 | -------------------------------------------------------------------------------- /src/storage/golden/mvcc/set_conflict: -------------------------------------------------------------------------------- 1 | T1: begin → v1 read-write active={} 2 | set NextVersion = 2 3 | set TxnActive(1) = [] 4 | 5 | T2: begin → v2 read-write active={1} 6 | set NextVersion = 3 7 | set TxnActiveSnapshot(2) = {1} 8 | set TxnActive(2) = [] 9 | 10 | T3: begin → v3 read-write active={1,2} 11 | set NextVersion = 4 12 | set TxnActiveSnapshot(3) = {1,2} 13 | set TxnActive(3) = [] 14 | 15 | T4: begin → v4 read-write active={1,2,3} 16 | set NextVersion = 5 17 | set TxnActiveSnapshot(4) = {1,2,3} 18 | set TxnActive(4) = [] 19 | 20 | T1: set "a" = 0x01 21 | set TxnWrite(1, "a") = [] 22 | set Version("a", 1) = 0x01 23 | 24 | T3: set "c" = 0x03 25 | set TxnWrite(3, "c") = [] 26 | set Version("c", 3) = 0x03 27 | 28 | T4: set "d" = 0x04 29 | set TxnWrite(4, "d") = [] 30 | set Version("d", 4) = 0x04 31 | 32 | T4: commit 33 | del TxnWrite(4, "d") 34 | del TxnActive(4) 35 | 36 | T2: set "a" = 0x02 → Error::Serialization 37 | 38 | T2: set "c" = 0x02 → Error::Serialization 39 | 40 | T2: set "d" = 0x02 → Error::Serialization 41 | 42 | Engine state: 43 | NextVersion = 5 44 | TxnActive(1) = [] 45 | TxnActive(2) = [] 46 | TxnActive(3) = [] 47 | TxnActiveSnapshot(2) = {1} 48 | TxnActiveSnapshot(3) = {1,2} 49 | TxnActiveSnapshot(4) = {1,2,3} 50 | TxnWrite(1, "a") = [] 51 | TxnWrite(3, "c") = [] 52 | Version("a", 1) = 0x01 53 | Version("c", 3) = 0x03 54 | Version("d", 4) = 0x04 55 | -------------------------------------------------------------------------------- /src/storage/golden/mvcc/unversioned: -------------------------------------------------------------------------------- 1 | T_: set unversioned "a" = 0x00 2 | set Unversioned("a") = 0x00 3 | 4 | T1: begin → v1 read-write active={} 5 | set 
NextVersion = 2 6 | set TxnActive(1) = [] 7 | 8 | T1: set "a" = 0x01 9 | set TxnWrite(1, "a") = [] 10 | set Version("a", 1) = 0x01 11 | 12 | T1: set "b" = 0x01 13 | set TxnWrite(1, "b") = [] 14 | set Version("b", 1) = 0x01 15 | 16 | T1: set "c" = 0x01 17 | set TxnWrite(1, "c") = [] 18 | set Version("c", 1) = 0x01 19 | 20 | T1: commit 21 | del TxnWrite(1, "a") 22 | del TxnWrite(1, "b") 23 | del TxnWrite(1, "c") 24 | del TxnActive(1) 25 | 26 | T_: set unversioned "b" = 0x00 27 | set Unversioned("b") = 0x00 28 | 29 | T_: set unversioned "d" = 0x00 30 | set Unversioned("d") = 0x00 31 | 32 | T2: begin read-only → v2 read-only active={} 33 | 34 | T2: scan .. 35 | "a" = 0x01 36 | "b" = 0x01 37 | "c" = 0x01 38 | 39 | T_: get unversioned "a" → 0x00 40 | 41 | T_: get unversioned "b" → 0x00 42 | 43 | T_: get unversioned "c" → None 44 | 45 | T_: get unversioned "d" → 0x00 46 | 47 | T_: set unversioned "a" = 0x01 48 | set Unversioned("a") = 0x01 49 | 50 | T_: get unversioned "a" → 0x01 51 | 52 | Engine state: 53 | NextVersion = 2 54 | Version("a", 1) = 0x01 55 | Version("b", 1) = 0x01 56 | Version("c", 1) = 0x01 57 | Unversioned("a") = 0x01 58 | Unversioned("b") = 0x00 59 | Unversioned("d") = 0x00 60 | -------------------------------------------------------------------------------- /src/storage/mod.rs: -------------------------------------------------------------------------------- 1 | pub mod bincode; 2 | pub mod debug; 3 | pub mod engine; 4 | pub mod keycode; 5 | pub mod mvcc; 6 | --------------------------------------------------------------------------------