├── .gitignore
├── Cargo.lock
├── Cargo.toml
├── README.md
├── config
    └── entangledb.yaml
├── husky
    └── cloud
    │   ├── build.sh
    │   ├── entangledb1
    │       ├── data
    │       │   └── .gitkeep
    │       └── entangledb.yaml
    │   ├── entangledb2
    │       ├── data
    │       │   └── .gitkeep
    │       └── entangledb.yaml
    │   ├── entangledb3
    │       ├── data
    │       │   └── .gitkeep
    │       └── entangledb.yaml
    │   ├── entangledb4
    │       ├── data
    │       │   └── .gitkeep
    │       └── entangledb.yaml
    │   └── entangledb5
    │       ├── data
    │           └── .gitkeep
    │       └── entangledb.yaml
├── learning_resources.md
└── src
    ├── bin
        ├── entangledb.rs
        └── entanglesql.rs
    ├── client.rs
    ├── error.rs
    ├── lib.rs
    ├── raft
        ├── log.rs
        ├── message.rs
        ├── mod.rs
        ├── node
        │   ├── candidate.rs
        │   ├── follower.rs
        │   ├── leader.rs
        │   └── mod.rs
        ├── server.rs
        └── state.rs
    ├── server.rs
    ├── sql
        ├── engine
        │   ├── kv.rs
        │   ├── mod.rs
        │   └── raft.rs
        ├── execution
        │   ├── aggregation.rs
        │   ├── join.rs
        │   ├── mod.rs
        │   ├── mutation.rs
        │   ├── query.rs
        │   ├── schema.rs
        │   └── source.rs
        ├── mod.rs
        ├── parser
        │   ├── ast.rs
        │   ├── lexer.rs
        │   └── mod.rs
        ├── plan
        │   ├── mod.rs
        │   ├── optimizer.rs
        │   └── planner.rs
        ├── schema.rs
        └── types
        │   ├── expression.rs
        │   └── mod.rs
    └── storage
        ├── bincode.rs
        ├── debug.rs
        ├── engine
            ├── bitcask.rs
            ├── memory.rs
            └── mod.rs
        ├── golden
            ├── bitcask
            │   ├── compact-after
            │   ├── compact-before
            │   └── log
            └── mvcc
            │   ├── anomaly_dirty_read
            │   ├── anomaly_dirty_write
            │   ├── anomaly_fuzzy_read
            │   ├── anomaly_lost_update
            │   ├── anomaly_phantom_read
            │   ├── anomaly_read_skew
            │   ├── anomaly_write_skew
            │   ├── begin
            │   ├── begin_as_of
            │   ├── begin_read_only
            │   ├── delete
            │   ├── delete_conflict
            │   ├── get
            │   ├── get_isolation
            │   ├── resume
            │   ├── rollback
            │   ├── scan
            │   ├── scan_isolation
            │   ├── scan_key_version_encoding
            │   ├── scan_prefix
            │   ├── set
            │   ├── set_conflict
            │   └── unversioned
        ├── keycode.rs
        ├── mod.rs
        └── mvcc.rs


/.gitignore:
--------------------------------------------------------------------------------
1 | /clusters/*/entangledb*/data
2 | /data
3 | /target
4 | .vscode/
5 | **/*.rs.bk
6 | .aider*
7 | 


--------------------------------------------------------------------------------
/Cargo.toml:
--------------------------------------------------------------------------------
 1 | [package]
 2 | name = "entangledb"
 3 | description = "A distributed SQL database"
 4 | version = "0.1.0"
 5 | edition = "2021"
 6 | default-run = "entangledb"
 7 | 
 8 | [lib]
 9 | doctest = false
10 | 
11 | [dependencies]
12 | bincode = "~1.3.3"
13 | clap = { version = "~4.4.2", features = ["cargo"] }
14 | config = "~0.13.3"
15 | derivative = "~2.2.0"
16 | fs4 = "~0.6.6"
17 | futures = "~0.3.15"
18 | futures-util = "~0.3.15"
19 | hex = "~0.4.3"
20 | lazy_static = "~1.4.0"
21 | log = "~0.4.14"
22 | names = "~0.14.0"
23 | rand = "~0.8.3"
24 | regex = "1.5.4"
25 | rustyline = "~12.0.0"
26 | rustyline-derive = "0.9.0"
27 | serde = "~1.0.126"
28 | serde_bytes = "~0.11.12"
29 | serde_derive = "~1.0.126"
30 | simplelog = "~0.12.1"
31 | tokio = { version = "~1.32.0", features = [
32 |     "macros",
33 |     "rt",
34 |     "rt-multi-thread",
35 |     "net",
36 |     "io-util",
37 |     "time",
38 |     "sync",
39 | ] }
40 | tokio-serde = { version = "~0.8", features = ["bincode"] }
41 | tokio-stream = { version = "~0.1.6", features = ["net"] }
42 | tokio-util = { version = "~0.7.8", features = ["codec"] }
43 | uuid = { version = "~1.4.1", features = ["v4"] }
44 | 
45 | [dev-dependencies]
46 | goldenfile = "~1.5.2"
47 | paste = "~1.0.14"
48 | pretty_assertions = "~1.4.0"
49 | serial_test = "~2.0.0"
50 | tempdir = "~0.3.7"
51 | tempfile = "~3.8.0"
52 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # Table of Contents
  2 | - [Overview](#overview)
  3 | - [Usage](#usage)
  4 | - [TODO](#todo)
  5 | - [MVCC in entangleDB](#mvcc-in-entangledb)
  6 | - [SQL Query Execution in entangleDB](#sql-query-execution-in-entangledb)
  7 | - [entangleDB Raft Consensus Engine](#entangledb-raft-consensus-engine)
  8 | - [What I am trying to build](#what-i-am-trying-to-build)
  9 |   - [Distributed Consensus Engine](#1-distributed-consensus-engine)
 10 |   - [Transaction Engine](#2-transaction-engine)
 11 |   - [Storage Engine](#3-storage-engine)
 12 |   - [Query Engine](#4-query-engine)
 13 |   - [SQL Interface and PostgreSQL Compatibility](#5-sql-interface-and-postgresql-compatibility)
 14 | - [Proposed Architecture](#proposed-architecture)
 15 | - [SQL Engine](#sql-engine)
 16 | - [Raft Engine](#raft-engine)
 17 | - [Storage Engine](#storage-engine)
 18 | - [entangleDB Peers](#entangledb-peers)
 19 | - [Example SQL Queries that you will be able to execute in entangleDB](#example-sql-queries-that-you-will-be-able-to-execute-in-entangledb)
 20 | - [Learning Resources I've been using for building the database](#learning-resources-ive-been-using-for-building-the-database)
 21 | 
 22 | ## Overview
 23 | 
 24 | I'm working on creating entangleDB, a project that's all about really getting to know how databases work from the inside out. My aim is to deeply understand everything about databases, from the big picture down to the small details. It's a way for me to build a strong foundation in database.
 25 | 
 26 | The name "entangleDB" is special because it's in honor of a friend who loves databases just as much as I do. 
 27 | 
 28 | The plan is to write the database in Rust. My main goal is to create something that's not only useful for me to learn from but also helpful for others who are interested in diving deep into how databases work. I'm hoping to make it postgresSQL compatible.
 29 | 
 30 | ## Usage
 31 | Pre-requisite is to have the Rust compiler; follow this doc to install the [Rust compiler](https://www.rust-lang.org/tools/install) 
 32 | 
 33 | entangledb cluster can be started on `localhost` ports `3201` to `3205`:
 34 | 
 35 | ```
 36 | (cd husky/cloud && ./build.sh)
 37 | ```
 38 | 
 39 | Client can be used to connect with the node on `localhost` port `3205`:
 40 | 
 41 | ```
 42 | cargo run --release --bin entanglesql
 43 | 
 44 | Connected to EntangleDB node "5". Enter !help for instructions.
 45 | entangledb> SELECT * FROM dishes;
 46 | poha
 47 | breads
 48 | korma
 49 | ```
 50 | 
 51 | ## TODO
 52 | 1. Make the isolation level configurable; currently, it is set to repeatable read (snapshot).
 53 | 2. Implement partitions, both hash and range types.
 54 | 3. Utilize generics throughout in Rust, thereby eliminating the need for std::fmt::Display + Send + Sync.
 55 | 4. Consider the use of runtime assertions instead of employing Error::Internal ubiquitously.
 56 | 5. Revisit the implementation of time-travel queries
 57 | 
 58 | ## MVCC in entangleDB
 59 | 
 60 | ![image](https://github.com/TypicalDefender/entangleDB/assets/106574498/0a923e2d-75fc-469e-9ce7-504af45c73c7)
 61 | 
 62 | ## SQL Query Execution in entangleDB
 63 | ![image](https://github.com/TypicalDefender/entangleDB/assets/106574498/a90fc90c-91e7-4ee8-a06f-887629a82401)
 64 | 
 65 | ## entangleDB Raft Consensus Engine
 66 | ![image](https://github.com/TypicalDefender/entangleDB/assets/106574498/a56f02b9-d172-4ab3-8883-230d7b1326b4)
 67 | 
 68 | ## What I am trying to build
 69 | 
 70 | ### 1. Distributed Consensus Engine
 71 | 
 72 | The design for entangleDB centers around a custom-built consensus engine, intended for high availability in distributed settings. This engine will be crucial in maintaining consistent and reliable state management across various nodes.
 73 | 
 74 | A key focus will be on linearizable state machine replication, an essential feature for ensuring data consistency across all nodes, especially for applications that require strong consistency.
 75 | 
 76 | ### 2. Transaction Engine
 77 | 
 78 |  The proposed transaction engine for entangleDB is committed to adhering to ACID properties, ensuring reliability and integrity in every transaction.
 79 | 
 80 | The plan includes the implementation of Snapshot Isolation and Serializable Isolation, with the aim of optimizing transaction handling for enhanced concurrency and data integrity.
 81 | 
 82 | ### 3. Storage Engine
 83 | 
 84 |  The planned storage engine for entangleDB will explore a variety of storage formats to find and utilize the most efficient methods for data storage and retrieval.
 85 | 
 86 | The storage layer is being designed for flexibility, to support a range of backend technologies and meet diverse storage requirements.
 87 | 
 88 | ### 4. Query Engine
 89 | 
 90 | The development of the query engine will focus on rapid and effective query processing, utilizing advanced optimization algorithms.
 91 | 
 92 | A distinctive feature of entangleDB will be its ability to handle time-travel queries, allowing users to access and analyze data from different historical states.
 93 | 
 94 | ### 5. SQL Interface and PostgreSQL Compatibility
 95 | 
 96 | The SQL interface for entangleDB is intended to support a wide array of SQL functionalities, including complex queries, joins, aggregates, and window functions.
 97 | 
 98 | Compatibility with PostgreSQL’s wire protocol is a goal, to facilitate smooth integration with existing PostgreSQL setups and offer a solid alternative for database system upgrades or migrations.
 99 | 
100 | ## Proposed Architecture
101 | <img width="890" alt="Screenshot 2023-12-02 at 1 26 15 PM" src="https://github.com/TypicalDefender/entangleDB/assets/37482550/f8d262b9-618c-435d-925b-4f992076581f">
102 | 
103 | ## SQL Engine
104 | 
105 | The SQL Engine is responsible for the intake and processing of SQL queries. It consists of:
106 | 
107 | - **SQL Session**: The processing pipeline within a session includes:
108 |   - `Parser`: Interprets SQL queries and converts them into a machine-understandable format.
109 |   - `Planner`: Devises an execution plan based on the parsed input.
110 |   - `Executor`: Carries out the plan, accessing and modifying the database.
111 | 
112 | Adjacent to the session is the:
113 | 
114 | - **SQL Storage Raft Backend**: This component integrates with the Raft consensus protocol to ensure distributed transactions are consistent and resilient.
115 | 
116 | ## Raft Engine
117 | 
118 | The Raft Engine is crucial for maintaining a consistent state across the distributed system:
119 | 
120 | - **Raft Node**: This consensus node confirms that all database transactions are in sync across the network.
121 | - **Raft Log**: A record of all transactions agreed upon by the Raft consensus algorithm, which is crucial for data integrity and fault tolerance.
122 | 
123 | ## Storage Engine
124 | 
125 | The Storage Engine is where the actual data is stored and managed:
126 | 
127 | - **State Machine Driver**: Comprising of:
128 |   - `State Machine Interface`: An intermediary that conveys state changes from the Raft log to the storage layer.
129 |   - `Key Value Backend`: The primary storage layer, consisting of:
130 |     - `Bitcask Engine`: A simple, fast on-disk storage system for key-value data.
131 |     - `MVCC Storage`: Handles multiple versions of data for read-write concurrency control.
132 | 
133 | ## entangleDB Peers
134 | 
135 | - interaction between multiple database instances or "peers".
136 | 
137 | ## Example SQL Queries that you will be able to execute in entangleDB
138 | 
139 | ```sql
140 | -- Transaction example with a table creation, data insertion, and selection
141 | BEGIN;
142 | 
143 | CREATE TABLE employees (id INT PRIMARY KEY, name VARCHAR, department VARCHAR);
144 | INSERT INTO employees VALUES (1, 'Alice', 'Engineering'), (2, 'Bob', 'HR');
145 | SELECT * FROM employees;
146 | 
147 | COMMIT;
148 | 
149 | -- Aggregation query with JOIN
150 | SELECT department, AVG(salary) FROM employees JOIN salaries ON employees.id = salaries.emp_id GROUP BY department;
151 | 
152 | -- Time-travel query
153 | SELECT * FROM employees AS OF SYSTEM TIME '-5m';
154 | ```
155 | 
156 | ## Learning Resources I've been using for building the database
157 | 
158 | For a comprehensive list of resources that have been learning what to build in a distributed database, check out the [Learning Resources](https://github.com/TypicalDefender/entangleDB/blob/main/learning_resources.md) page.
159 | 
160 | 
161 | 
162 | 
163 | 


--------------------------------------------------------------------------------
/config/entangledb.yaml:
--------------------------------------------------------------------------------
 1 | # The node ID, peer ID/address map (empty for single node), and log level.
 2 | id: 1
 3 | peers: {}
 4 | log_level: INFO
 5 | 
 6 | # Network addresses to bind the SQL and Raft servers to.
 7 | listen_sql: 0.0.0.0:3205
 8 | listen_raft: 0.0.0.0:3305
 9 | 
10 | 
11 | data_dir: data
12 | compact_threshold: 0.2
13 | sync: true
14 | 
15 | storage_raft: bitcask
16 | 
17 | storage_sql: bitcask
18 | 


--------------------------------------------------------------------------------
/husky/cloud/build.sh:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | 
 3 | set -euo pipefail
 4 | 
 5 | cargo build --release --bin entangledb
 6 | 
 7 | for ID in 1 2 3 4 5; do
 8 |     (cargo run -q --release -- -c entangledb$ID/entangledb.yaml 2>&1 | sed -e "s/\\(.*\\)/entangledb$ID \\1/g") &
 9 | done
10 | 
11 | trap 'kill $(jobs -p)' EXIT
12 | wait < <(jobs -p)


--------------------------------------------------------------------------------
/husky/cloud/entangledb1/data/.gitkeep:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TypicalDefender/entangleDB/beaf75098d2c936bf841c34ccb9241a144058380/husky/cloud/entangledb1/data/.gitkeep


--------------------------------------------------------------------------------
/husky/cloud/entangledb1/entangledb.yaml:
--------------------------------------------------------------------------------
 1 | id: 1
 2 | data_dir: entangledb1/data
 3 | sync: false
 4 | listen_sql: 0.0.0.0:3201
 5 | listen_raft: 0.0.0.0:3301
 6 | peers:
 7 |   '2': 127.0.0.1:3302
 8 |   '3': 127.0.0.1:3303
 9 |   '4': 127.0.0.1:3304
10 |   '5': 127.0.0.1:3305


--------------------------------------------------------------------------------
/husky/cloud/entangledb2/data/.gitkeep:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TypicalDefender/entangleDB/beaf75098d2c936bf841c34ccb9241a144058380/husky/cloud/entangledb2/data/.gitkeep


--------------------------------------------------------------------------------
/husky/cloud/entangledb2/entangledb.yaml:
--------------------------------------------------------------------------------
 1 | id: 2
 2 | data_dir: entangledb2/data
 3 | sync: false
 4 | listen_sql: 0.0.0.0:3202
 5 | listen_raft: 0.0.0.0:3302
 6 | peers:
 7 |   '1': 127.0.0.1:3301
 8 |   '3': 127.0.0.1:3303
 9 |   '4': 127.0.0.1:3304
10 |   '5': 127.0.0.1:3305


--------------------------------------------------------------------------------
/husky/cloud/entangledb3/data/.gitkeep:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TypicalDefender/entangleDB/beaf75098d2c936bf841c34ccb9241a144058380/husky/cloud/entangledb3/data/.gitkeep


--------------------------------------------------------------------------------
/husky/cloud/entangledb3/entangledb.yaml:
--------------------------------------------------------------------------------
 1 | id: 3
 2 | data_dir: entangledb3/data
 3 | sync: false
 4 | listen_sql: 0.0.0.0:3203
 5 | listen_raft: 0.0.0.0:3303
 6 | peers:
 7 |   '1': 127.0.0.1:3301
 8 |   '2': 127.0.0.1:3302
 9 |   '4': 127.0.0.1:3304
10 |   '5': 127.0.0.1:3305


--------------------------------------------------------------------------------
/husky/cloud/entangledb4/data/.gitkeep:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TypicalDefender/entangleDB/beaf75098d2c936bf841c34ccb9241a144058380/husky/cloud/entangledb4/data/.gitkeep


--------------------------------------------------------------------------------
/husky/cloud/entangledb4/entangledb.yaml:
--------------------------------------------------------------------------------
 1 | id: 4
 2 | data_dir: entangledb4/data
 3 | sync: false
 4 | listen_sql: 0.0.0.0:3204
 5 | listen_raft: 0.0.0.0:3304
 6 | peers:
 7 |   '1': 127.0.0.1:3301
 8 |   '2': 127.0.0.1:3302
 9 |   '3': 127.0.0.1:3303
10 |   '5': 127.0.0.1:3305


--------------------------------------------------------------------------------
/husky/cloud/entangledb5/data/.gitkeep:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TypicalDefender/entangleDB/beaf75098d2c936bf841c34ccb9241a144058380/husky/cloud/entangledb5/data/.gitkeep


--------------------------------------------------------------------------------
/husky/cloud/entangledb5/entangledb.yaml:
--------------------------------------------------------------------------------
 1 | id: 5
 2 | data_dir: entangledb5/data
 3 | sync: false
 4 | listen_sql: 0.0.0.0:3205
 5 | listen_raft: 0.0.0.0:3305
 6 | peers:
 7 |   '1': 127.0.0.1:3301
 8 |   '2': 127.0.0.1:3302
 9 |   '3': 127.0.0.1:3303
10 |   '4': 127.0.0.1:3304


--------------------------------------------------------------------------------
/learning_resources.md:
--------------------------------------------------------------------------------
 1 | # Learning Resources I've been using for building the database
 2 | 
 3 | ### Introductory Materials
 4 | 
 5 | **1. Lectures by Andy Pavlo**
 6 |    - **CMU 15-445 Intro to Database Systems**: [YouTube Playlist](https://www.youtube.com/playlist?list=PLSE8ODhjZXjbohkNBWQs_otTrBTrjyohi) (A Pavlo 2019)
 7 |    - **CMU 15-721 Advanced Database Systems**: [YouTube Playlist](https://www.youtube.com/playlist?list=PLSE8ODhjZXjasmrEd2_Yi1deeE360zv5O) (A Pavlo 2020)
 8 | 
 9 | **2. Books by Martin Kleppman and Alex Petrov**
10 |    - **Designing Data-Intensive Applications**: [Link to Book](https://dataintensive.net/) (M Kleppmann 2017)
11 |    - **Database Internals**: [Link to Book](https://www.databass.dev) (A Petrov 2019)
12 | 
13 | ### Raft Algorithm
14 | 
15 | **1. Original Paper and Talks**
16 |    - **In Search of an Understandable Consensus Algorithm**: [Raft Paper](https://raft.github.io/raft.pdf) (D Ongaro, J Ousterhout 2014)
17 |    - **Designing for Understandability: The Raft Consensus Algorithm**: [YouTube Video](https://www.youtube.com/watch?v=vYp4LYbnnW8) (J Ousterhout 2016)
18 | 
19 | **2. Student Guide**
20 |    - **Students' Guide to Raft**: [Blog Post](https://thesquareplanet.com/blog/students-guide-to-raft/) (J Gjengset 2016)
21 | 
22 | ### Parsing Techniques
23 | 
24 | **1. Books by Thorsten Ball**
25 |    - **Writing An Interpreter In Go**: [Link to Book](https://interpreterbook.com) (T Ball 2016)
26 |    - **Writing A Compiler In Go**: [Link to Book](https://compilerbook.com) (T Ball 2018)
27 | 
28 | **2. Blog Post**
29 |    - **Parsing Expressions by Precedence Climbing**: [Blog Post](https://eli.thegreenplace.net/2012/08/02/parsing-expressions-by-precedence-climbing) (E Bendersky 2012)
30 | 
31 | ### Transactions and Consistency
32 | 
33 | **1. Overviews and Classic Papers**
34 |    - **Consistency Models**: [Jepsen Article](https://jepsen.io/consistency) (Jepsen 2016)
35 |    - **A Critique of ANSI SQL Isolation Levels**: [Research Paper](https://www.microsoft.com/en-us/research/wp-content/uploads/2016/02/tr-95-51.pdf) (H Berenson et al 1995)
36 |    - **Generalized Isolation Level Definitions**: [Research Paper](http://pmg.csail.mit.edu/papers/icde00.pdf) (A Adya, B Liskov, P O'Neil 2000)
37 | 
38 | **2. Blog Posts on MVCC Implementation**
39 |    - **Implementing Your Own Transactions with MVCC**: [Blog Post](https://levelup.gitconnected.com/implementing-your-own-transactions-with-mvcc-bba11cab8e70) (E Chance 2015)
40 |    - **How Postgres Makes Transactions Atomic**: [Blog Post](https://brandur.org/postgres-atomicity) (B Leach 2017)
41 | 


--------------------------------------------------------------------------------
/src/bin/entangledb.rs:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * entangledb is the entangledb server. It takes configuration via a configuration file, command-line
  3 |  * parameters, and environment variables, then starts up a entangledb TCP server that communicates with
  4 |  * SQL clients (port 3205) and Raft peers (port 3305).
  5 |  */
  6 | 
  7 | #![warn(clippy::all)]
  8 | 
  9 | use serde_derive::Deserialize;
 10 | use std::collections::HashMap;
 11 | use entangledb::error::{Error, Result};
 12 | use entangledb::raft;
 13 | use entangledb::sql;
 14 | use entangledb::storage;
 15 | use entangledb::Server;
 16 | 
 17 | #[tokio::main]
 18 | async fn main() -> Result<()> {
 19 |     let args = clap::command!()
 20 |         .arg(
 21 |             clap::Arg::new("config")
 22 |                 .short('c')
 23 |                 .long("config")
 24 |                 .help("Configuration file path")
 25 |                 .default_value("config/entangledb.yaml"),
 26 |         )
 27 |         .get_matches();
 28 |     let cfg = Config::new(args.get_one::<String>("config").unwrap().as_ref())?;
 29 | 
 30 |     let loglevel = cfg.log_level.parse::<simplelog::LevelFilter>()?;
 31 |     let mut logconfig = simplelog::ConfigBuilder::new();
 32 |     if loglevel != simplelog::LevelFilter::Debug {
 33 |         logconfig.add_filter_allow_str("entangledb");
 34 |     }
 35 |     simplelog::SimpleLogger::init(loglevel, logconfig.build())?;
 36 | 
 37 |     let path = std::path::Path::new(&cfg.data_dir);
 38 |     let raft_log = match cfg.storage_raft.as_str() {
 39 |         "bitcask" | "" => raft::Log::new(
 40 |             Box::new(storage::engine::BitCask::new_compact(
 41 |                 path.join("log"),
 42 |                 cfg.compact_threshold,
 43 |             )?),
 44 |             cfg.sync,
 45 |         )?,
 46 |         "memory" => raft::Log::new(Box::new(storage::engine::Memory::new()), false)?,
 47 |         name => return Err(Error::Config(format!("Unknown Raft storage engine {}", name))),
 48 |     };
 49 |     let raft_state: Box<dyn raft::State> = match cfg.storage_sql.as_str() {
 50 |         "bitcask" | "" => {
 51 |             let engine =
 52 |                 storage::engine::BitCask::new_compact(path.join("state"), cfg.compact_threshold)?;
 53 |             Box::new(sql::engine::Raft::new_state(engine)?)
 54 |         }
 55 |         "memory" => {
 56 |             let engine = storage::engine::Memory::new();
 57 |             Box::new(sql::engine::Raft::new_state(engine)?)
 58 |         }
 59 |         name => return Err(Error::Config(format!("Unknown SQL storage engine {}", name))),
 60 |     };
 61 | 
 62 |     Server::new(cfg.id, cfg.peers, raft_log, raft_state)
 63 |         .await?
 64 |         .listen(&cfg.listen_sql, &cfg.listen_raft)
 65 |         .await?
 66 |         .serve()
 67 |         .await
 68 | }
 69 | 
 70 | #[derive(Debug, Deserialize)]
 71 | struct Config {
 72 |     id: raft::NodeID,
 73 |     peers: HashMap<raft::NodeID, String>,
 74 |     listen_sql: String,
 75 |     listen_raft: String,
 76 |     log_level: String,
 77 |     data_dir: String,
 78 |     compact_threshold: f64,
 79 |     sync: bool,
 80 |     storage_raft: String,
 81 |     storage_sql: String,
 82 | }
 83 | 
 84 | impl Config {
 85 |     fn new(file: &str) -> Result<Self> {
 86 |         Ok(config::Config::builder()
 87 |             .set_default("id", "entangledb")?
 88 |             .set_default("listen_sql", "0.0.0.0:3205")?
 89 |             .set_default("listen_raft", "0.0.0.0:3305")?
 90 |             .set_default("log_level", "info")?
 91 |             .set_default("data_dir", "data")?
 92 |             .set_default("compact_threshold", 0.2)?
 93 |             .set_default("sync", true)?
 94 |             .set_default("storage_raft", "bitcask")?
 95 |             .set_default("storage_sql", "bitcask")?
 96 |             .add_source(config::File::with_name(file))
 97 |             .add_source(config::Environment::with_prefix("entangledb"))
 98 |             .build()?
 99 |             .try_deserialize()?)
100 |     }
101 | }
102 | 


--------------------------------------------------------------------------------
/src/bin/entanglesql.rs:
--------------------------------------------------------------------------------
  1 | #![warn(clippy::all)]
  2 | 
  3 | use rustyline::history::DefaultHistory;
  4 | use rustyline::validate::{ValidationContext, ValidationResult, Validator};
  5 | use rustyline::{error::ReadlineError, Editor, Modifiers};
  6 | use rustyline_derive::{Completer, Helper, Highlighter, Hinter};
  7 | use entangledb::error::{Error, Result};
  8 | use entangledb::sql::execution::ResultSet;
  9 | use entangledb::sql::parser::{Lexer, Token};
 10 | use entangledb::Client;
 11 | 
 12 | #[tokio::main]
 13 | async fn main() -> Result<()> {
 14 |     let opts = clap::command!()
 15 |         .name("entanglesql")
 16 |         .about("An EntangleDB client.")
 17 |         .args([
 18 |             clap::Arg::new("command"),
 19 |             clap::Arg::new("host")
 20 |                 .short('H')
 21 |                 .long("host")
 22 |                 .help("Host to connect to")
 23 |                 .default_value("127.0.0.1"),
 24 |             clap::Arg::new("port")
 25 |                 .short('p')
 26 |                 .long("port")
 27 |                 .help("Port number to connect to")
 28 |                 .value_parser(clap::value_parser!(u16))
 29 |                 .default_value("3205"),
 30 |         ])
 31 |         .get_matches();
 32 | 
 33 |     let mut entanglesql =
 34 |         EntangleSQL::new(opts.get_one::<String>("host").unwrap(), *opts.get_one("port").unwrap())
 35 |             .await?;
 36 | 
 37 |     if let Some(command) = opts.get_one::<String>("command") {
 38 |         entanglesql.execute(command).await
 39 |     } else {
 40 |         entanglesql.run().await
 41 |     }
 42 | }
 43 | 
 44 | /// The EntangleSQL REPL
 45 | struct EntangleSQL {
 46 |     client: Client,
 47 |     editor: Editor<EntangleInputValidator, DefaultHistory>,
 48 |     history_path: Option<std::path::PathBuf>,
 49 |     show_headers: bool,
 50 | }
 51 | 
 52 | impl EntangleSQL {
 53 |     async fn new(host: &str, port: u16) -> Result<Self> {
 54 |         Ok(Self {
 55 |             client: Client::new((host, port)).await?,
 56 |             editor: Editor::new()?,
 57 |             history_path: std::env::var_os("HOME")
 58 |                 .map(|home| std::path::Path::new(&home).join(".entanglesql.history")),
 59 |             show_headers: false,
 60 |         })
 61 |     }
 62 | 
 63 |     /// Executes a line of input
 64 |     async fn execute(&mut self, input: &str) -> Result<()> {
 65 |         if input.starts_with('!') {
 66 |             self.execute_command(input).await
 67 |         } else if !input.is_empty() {
 68 |             self.execute_query(input).await
 69 |         } else {
 70 |             Ok(())
 71 |         }
 72 |     }
 73 | 
 74 |     /// Handles a REPL command (prefixed by !, e.g. !help)
 75 |     async fn execute_command(&mut self, input: &str) -> Result<()> {
 76 |         let mut input = input.split_ascii_whitespace();
 77 |         let command = input.next().ok_or_else(|| Error::Parse("Expected command.".to_string()))?;
 78 | 
 79 |         let getargs = |n| {
 80 |             let args: Vec<&str> = input.collect();
 81 |             if args.len() != n {
 82 |                 Err(Error::Parse(format!("{}: expected {} args, got {}", command, n, args.len())))
 83 |             } else {
 84 |                 Ok(args)
 85 |             }
 86 |         };
 87 | 
 88 |         match command {
 89 |             "!headers" => match getargs(1)?[0] {
 90 |                 "on" => {
 91 |                     self.show_headers = true;
 92 |                     println!("Headers enabled");
 93 |                 }
 94 |                 "off" => {
 95 |                     self.show_headers = false;
 96 |                     println!("Headers disabled");
 97 |                 }
 98 |                 v => return Err(Error::Parse(format!("Invalid value {}, expected on or off", v))),
 99 |             },
100 |             "!help" => println!(
101 |                 r#"
102 | Enter a SQL statement terminated by a semicolon (;) to execute it and display the result.
103 | The following commands are also available:
104 | 
105 |     !headers <on|off>  Enable or disable column headers
106 |     !help              This help message
107 |     !status            Display server status
108 |     !table [table]     Display table schema, if it exists
109 |     !tables            List tables
110 | "#
111 |             ),
112 |             "!status" => {
113 |                 let status = self.client.status().await?;
114 |                 let mut node_logs = status
115 |                     .raft
116 |                     .node_last_index
117 |                     .iter()
118 |                     .map(|(id, index)| format!("{}:{}", id, index))
119 |                     .collect::<Vec<_>>();
120 |                 node_logs.sort();
121 |                 println!(
122 |                     r#"
123 | Server:    {server} (leader {leader} in term {term} with {nodes} nodes)
124 | Raft log:  {committed} committed, {applied} applied, {raft_size} MB ({raft_storage} storage)
125 | Node logs: {logs}
126 | MVCC:      {active_txns} active txns, {versions} versions
127 | Storage:   {keys} keys, {logical_size} MB logical, {nodes}x {disk_size} MB disk, {garbage_percent}% garbage ({sql_storage} engine)
128 | "#,
129 |                     server = status.raft.server,
130 |                     leader = status.raft.leader,
131 |                     term = status.raft.term,
132 |                     nodes = status.raft.node_last_index.len(),
133 |                     committed = status.raft.commit_index,
134 |                     applied = status.raft.apply_index,
135 |                     raft_storage = status.raft.storage,
136 |                     raft_size =
137 |                         format_args!("{:.3}", status.raft.storage_size as f64 / 1000.0 / 1000.0),
138 |                     logs = node_logs.join(" "),
139 |                     versions = status.mvcc.versions,
140 |                     active_txns = status.mvcc.active_txns,
141 |                     keys = status.mvcc.storage.keys,
142 |                     logical_size =
143 |                         format_args!("{:.3}", status.mvcc.storage.size as f64 / 1000.0 / 1000.0),
144 |                     garbage_percent = format_args!(
145 |                         "{:.0}",
146 |                         if status.mvcc.storage.total_disk_size > 0 {
147 |                             status.mvcc.storage.garbage_disk_size as f64
148 |                                 / status.mvcc.storage.total_disk_size as f64
149 |                                 * 100.0
150 |                         } else {
151 |                             0.0
152 |                         }
153 |                     ),
154 |                     disk_size = format_args!(
155 |                         "{:.3}",
156 |                         status.mvcc.storage.total_disk_size as f64 / 1000.0 / 1000.0
157 |                     ),
158 |                     sql_storage = status.mvcc.storage.name,
159 |                 )
160 |             }
161 |             "!table" => {
162 |                 let args = getargs(1)?;
163 |                 println!("{}", self.client.get_table(args[0]).await?);
164 |             }
165 |             "!tables" => {
166 |                 getargs(0)?;
167 |                 for table in self.client.list_tables().await? {
168 |                     println!("{}", table)
169 |                 }
170 |             }
171 |             c => return Err(Error::Parse(format!("Unknown command {}", c))),
172 |         }
173 |         Ok(())
174 |     }
175 | 
176 |     /// Runs a query and displays the results
177 |     async fn execute_query(&mut self, query: &str) -> Result<()> {
178 |         match self.client.execute(query).await? {
179 |             ResultSet::Begin { version, read_only } => match read_only {
180 |                 false => println!("Began transaction at new version {}", version),
181 |                 true => println!("Began read-only transaction at version {}", version),
182 |             },
183 |             ResultSet::Commit { version: id } => println!("Committed transaction {}", id),
184 |             ResultSet::Rollback { version: id } => println!("Rolled back transaction {}", id),
185 |             ResultSet::Create { count } => println!("Created {} rows", count),
186 |             ResultSet::Delete { count } => println!("Deleted {} rows", count),
187 |             ResultSet::Update { count } => println!("Updated {} rows", count),
188 |             ResultSet::CreateTable { name } => println!("Created table {}", name),
189 |             ResultSet::DropTable { name } => println!("Dropped table {}", name),
190 |             ResultSet::Explain(plan) => println!("{}", plan),
191 |             ResultSet::Query { columns, mut rows } => {
192 |                 if self.show_headers {
193 |                     println!(
194 |                         "{}",
195 |                         columns
196 |                             .iter()
197 |                             .map(|c| c.name.as_deref().unwrap_or("?"))
198 |                             .collect::<Vec<_>>()
199 |                             .join("|")
200 |                     );
201 |                 }
202 |                 while let Some(row) = rows.next().transpose()? {
203 |                     println!(
204 |                         "{}",
205 |                         row.into_iter().map(|v| format!("{}", v)).collect::<Vec<_>>().join("|")
206 |                     );
207 |                 }
208 |             }
209 |         }
210 |         Ok(())
211 |     }
212 | 
213 |     /// Prompts the user for input
214 |     fn prompt(&mut self) -> Result<Option<String>> {
215 |         let prompt = match self.client.txn() {
216 |             Some((version, false)) => format!("entangledb:{}> ", version),
217 |             Some((version, true)) => format!("entangledb@{}> ", version),
218 |             None => "entangledb> ".into(),
219 |         };
220 |         match self.editor.readline(&prompt) {
221 |             Ok(input) => {
222 |                 self.editor.add_history_entry(&input)?;
223 |                 Ok(Some(input.trim().to_string()))
224 |             }
225 |             Err(ReadlineError::Eof) | Err(ReadlineError::Interrupted) => Ok(None),
226 |             Err(err) => Err(err.into()),
227 |         }
228 |     }
229 | 
230 |     /// Runs the EntangleSQL REPL
231 |     async fn run(&mut self) -> Result<()> {
232 |         if let Some(path) = &self.history_path {
233 |             match self.editor.load_history(path) {
234 |                 Ok(_) => {}
235 |                 Err(ReadlineError::Io(ref err)) if err.kind() == std::io::ErrorKind::NotFound => {}
236 |                 Err(err) => return Err(err.into()),
237 |             };
238 |         }
239 |         // self.editor.set_helper(Some(InputValidator));
240 |         self.editor.set_helper(Some(EntangleInputValidator));
241 |         // Make sure multiline pastes are interpreted as normal inputs.
242 |         self.editor.bind_sequence(
243 |             rustyline::KeyEvent(rustyline::KeyCode::BracketedPasteStart, Modifiers::NONE),
244 |             rustyline::Cmd::Noop,
245 |         );
246 | 
247 |         let status = self.client.status().await?;
248 |         println!(
249 |             "Connected to EntangleDB node \"{}\". Enter !help for instructions.",
250 |             status.raft.server
251 |         );
252 | 
253 |         while let Some(input) = self.prompt()? {
254 |             match self.execute(&input).await {
255 |                 Ok(()) => {}
256 |                 error @ Err(Error::Internal(_)) => return error,
257 |                 Err(error) => println!("Error: {}", error),
258 |             }
259 |         }
260 | 
261 |         if let Some(path) = &self.history_path {
262 |             self.editor.save_history(path)?;
263 |         }
264 |         Ok(())
265 |     }
266 | }
267 | 
268 | /// A Rustyline helper for multiline editing. It parses input lines and determines if they make up a complete command or not.
269 | #[derive(Completer, Helper, Highlighter, Hinter)]
270 | struct EntangleInputValidator;
271 | 
272 | impl Validator for EntangleInputValidator {
273 |     fn validate(&self, ctx: &mut ValidationContext) -> rustyline::Result<ValidationResult> {
274 |         let input = ctx.input();
275 | 
276 |         // Empty lines and ! commands are fine.
277 |         if input.is_empty() || input.starts_with('!') || input == ";" {
278 |             return Ok(ValidationResult::Valid(None));
279 |         }
280 | 
281 |         // For SQL statements, just look for any semicolon or lexer error and if found accept the
282 |         // input and rely on the server to do further validation and error handling. Otherwise,
283 |         // wait for more input.
284 |         for result in Lexer::new(ctx.input()) {
285 |             match result {
286 |                 Ok(Token::Semicolon) => return Ok(ValidationResult::Valid(None)),
287 |                 Err(_) => return Ok(ValidationResult::Valid(None)),
288 |                 _ => {}
289 |             }
290 |         }
291 |         Ok(ValidationResult::Incomplete)
292 |     }
293 | 
294 |     fn validate_while_typing(&self) -> bool {
295 |         false
296 |     }
297 | }
298 | 


--------------------------------------------------------------------------------
/src/client.rs:
--------------------------------------------------------------------------------
  1 | use crate::error::{Error, Result};
  2 | use crate::server::{Request, Response};
  3 | use crate::sql::engine::Status;
  4 | use crate::sql::execution::ResultSet;
  5 | use crate::sql::schema::Table;
  6 | 
  7 | use futures::future::FutureExt as _;
  8 | use futures::sink::SinkExt as _;
  9 | use futures::stream::TryStreamExt as _;
 10 | use rand::Rng as _;
 11 | use std::cell::Cell;
 12 | use std::future::Future;
 13 | use std::ops::{Deref, Drop};
 14 | use std::sync::Arc;
 15 | use tokio::net::{TcpStream, ToSocketAddrs};
 16 | use tokio::sync::{Mutex, MutexGuard};
 17 | use tokio_util::codec::{Framed, LengthDelimitedCodec};
 18 | 
 19 | type Connection = tokio_serde::Framed<
 20 |     Framed<TcpStream, LengthDelimitedCodec>,
 21 |     Result<Response>,
 22 |     Request,
 23 |     tokio_serde::formats::Bincode<Result<Response>, Request>,
 24 | >;
 25 | 
 26 | /// Number of serialization retries in with_txn()
 27 | const WITH_TXN_RETRIES: u8 = 8;
 28 | 
 29 | /// A entangledb client
 30 | #[derive(Clone)]
 31 | pub struct Client {
 32 |     conn: Arc<Mutex<Connection>>,
 33 |     txn: Cell<Option<(u64, bool)>>,
 34 | }
 35 | 
 36 | impl Client {
 37 |     /// Creates a new client
 38 |     pub async fn new<A: ToSocketAddrs>(addr: A) -> Result<Self> {
 39 |         Ok(Self {
 40 |             conn: Arc::new(Mutex::new(tokio_serde::Framed::new(
 41 |                 Framed::new(TcpStream::connect(addr).await?, LengthDelimitedCodec::new()),
 42 |                 tokio_serde::formats::Bincode::default(),
 43 |             ))),
 44 |             txn: Cell::new(None),
 45 |         })
 46 |     }
 47 | 
 48 |     /// Call a server method
 49 |     async fn call(&self, request: Request) -> Result<Response> {
 50 |         let mut conn = self.conn.lock().await;
 51 |         self.call_locked(&mut conn, request).await
 52 |     }
 53 | 
 54 |     /// Call a server method while holding the mutex lock
 55 |     async fn call_locked(
 56 |         &self,
 57 |         conn: &mut MutexGuard<'_, Connection>,
 58 |         request: Request,
 59 |     ) -> Result<Response> {
 60 |         conn.send(request).await?;
 61 |         match conn.try_next().await? {
 62 |             Some(result) => result,
 63 |             None => Err(Error::Internal("Server disconnected".into())),
 64 |         }
 65 |     }
 66 | 
 67 |     /// Executes a query
 68 |     pub async fn execute(&self, query: &str) -> Result<ResultSet> {
 69 |         let mut conn = self.conn.lock().await;
 70 |         let mut resultset =
 71 |             match self.call_locked(&mut conn, Request::Execute(query.into())).await? {
 72 |                 Response::Execute(rs) => rs,
 73 |                 resp => return Err(Error::Internal(format!("Unexpected response {:?}", resp))),
 74 |             };
 75 |         if let ResultSet::Query { columns, .. } = resultset {
 76 |             // FIXME We buffer rows for now to avoid lifetime hassles
 77 |             let mut rows = Vec::new();
 78 |             while let Some(result) = conn.try_next().await? {
 79 |                 match result? {
 80 |                     Response::Row(Some(row)) => rows.push(row),
 81 |                     Response::Row(None) => break,
 82 |                     response => {
 83 |                         return Err(Error::Internal(format!("Unexpected response {:?}", response)))
 84 |                     }
 85 |                 }
 86 |             }
 87 |             resultset = ResultSet::Query { columns, rows: Box::new(rows.into_iter().map(Ok)) }
 88 |         };
 89 |         match &resultset {
 90 |             ResultSet::Begin { version, read_only } => self.txn.set(Some((*version, *read_only))),
 91 |             ResultSet::Commit { .. } => self.txn.set(None),
 92 |             ResultSet::Rollback { .. } => self.txn.set(None),
 93 |             _ => {}
 94 |         }
 95 |         Ok(resultset)
 96 |     }
 97 | 
 98 |     /// Fetches the table schema as SQL
 99 |     pub async fn get_table(&self, table: &str) -> Result<Table> {
100 |         match self.call(Request::GetTable(table.into())).await? {
101 |             Response::GetTable(t) => Ok(t),
102 |             resp => Err(Error::Value(format!("Unexpected response: {:?}", resp))),
103 |         }
104 |     }
105 | 
106 |     /// Lists database tables
107 |     pub async fn list_tables(&self) -> Result<Vec<String>> {
108 |         match self.call(Request::ListTables).await? {
109 |             Response::ListTables(t) => Ok(t),
110 |             resp => Err(Error::Value(format!("Unexpected response: {:?}", resp))),
111 |         }
112 |     }
113 | 
114 |     /// Checks server status
115 |     pub async fn status(&self) -> Result<Status> {
116 |         match self.call(Request::Status).await? {
117 |             Response::Status(s) => Ok(s),
118 |             resp => Err(Error::Value(format!("Unexpected response: {:?}", resp))),
119 |         }
120 |     }
121 | 
122 |     /// Returns the version and read-only state of the txn
123 |     pub fn txn(&self) -> Option<(u64, bool)> {
124 |         self.txn.get()
125 |     }
126 | 
127 |     /// Runs a query in a transaction, automatically retrying serialization failures with
128 |     /// exponential backoff.
129 |     pub async fn with_txn<W, F, R>(&self, mut with: W) -> Result<R>
130 |     where
131 |         W: FnMut(Client) -> F,
132 |         F: Future<Output = Result<R>>,
133 |     {
134 |         for i in 0..WITH_TXN_RETRIES {
135 |             if i > 0 {
136 |                 tokio::time::sleep(std::time::Duration::from_millis(
137 |                     2_u64.pow(i as u32 - 1) * rand::thread_rng().gen_range(25..=75),
138 |                 ))
139 |                 .await;
140 |             }
141 |             let result = async {
142 |                 self.execute("BEGIN").await?;
143 |                 let result = with(self.clone()).await?;
144 |                 self.execute("COMMIT").await?;
145 |                 Ok(result)
146 |             }
147 |             .await;
148 |             if result.is_err() {
149 |                 self.execute("ROLLBACK").await.ok();
150 |                 if matches!(result, Err(Error::Serialization) | Err(Error::Abort)) {
151 |                     continue;
152 |                 }
153 |             }
154 |             return result;
155 |         }
156 |         Err(Error::Serialization)
157 |     }
158 | }
159 | 
160 | /// A entangledb client pool
161 | pub struct Pool {
162 |     clients: Vec<Mutex<Client>>,
163 | }
164 | 
165 | impl Pool {
166 |     /// Creates a new connection pool for the given servers, eagerly connecting clients.
167 |     pub async fn new<A: ToSocketAddrs + Clone>(addrs: Vec<A>, size: u64) -> Result<Self> {
168 |         let mut addrs = addrs.into_iter().cycle();
169 |         let clients = futures::future::try_join_all(
170 |             std::iter::from_fn(|| {
171 |                 Some(Client::new(addrs.next().unwrap()).map(|r| r.map(Mutex::new)))
172 |             })
173 |             .take(size as usize),
174 |         )
175 |         .await?;
176 |         Ok(Self { clients })
177 |     }
178 | 
179 |     /// Fetches a client from the pool. It is reset (i.e. any open txns are rolled back) and
180 |     /// returned when it goes out of scope.
181 |     pub async fn get(&self) -> PoolClient<'_> {
182 |         let (client, index, _) =
183 |             futures::future::select_all(self.clients.iter().map(|m| m.lock().boxed())).await;
184 |         PoolClient::new(index, client)
185 |     }
186 | 
187 |     /// Returns the size of the pool
188 |     pub fn size(&self) -> usize {
189 |         self.clients.len()
190 |     }
191 | }
192 | 
193 | /// A client returned from the pool
194 | pub struct PoolClient<'a> {
195 |     id: usize,
196 |     client: MutexGuard<'a, Client>,
197 | }
198 | 
199 | impl<'a> PoolClient<'a> {
200 |     /// Creates a new PoolClient
201 |     fn new(id: usize, client: MutexGuard<'a, Client>) -> Self {
202 |         Self { id, client }
203 |     }
204 | 
205 |     /// Returns the ID of the client in the pool
206 |     pub fn id(&self) -> usize {
207 |         self.id
208 |     }
209 | }
210 | 
211 | impl<'a> Deref for PoolClient<'a> {
212 |     type Target = MutexGuard<'a, Client>;
213 | 
214 |     fn deref(&self) -> &Self::Target {
215 |         &self.client
216 |     }
217 | }
218 | 
219 | impl<'a> Drop for PoolClient<'a> {
220 |     fn drop(&mut self) {
221 |         if self.txn().is_some() {
222 |             // FIXME This should disconnect or destroy the client if it errors.
223 |             futures::executor::block_on(self.client.execute("ROLLBACK")).ok();
224 |         }
225 |     }
226 | }
227 | 


--------------------------------------------------------------------------------
/src/error.rs:
--------------------------------------------------------------------------------
  1 | use serde_derive::{Deserialize, Serialize};
  2 | use std::fmt::{self, Display};
  3 | 
  4 | /// Result returning Error
  5 | pub type Result<T> = std::result::Result<T, Error>;
  6 | 
  7 | /// entangledb errors. All except Internal are considered user-facing.
  8 | #[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
  9 | pub enum Error {
 10 |     Abort,
 11 |     Config(String),
 12 |     Internal(String),
 13 |     Parse(String),
 14 |     ReadOnly,
 15 |     Serialization,
 16 |     Value(String),
 17 | }
 18 | 
 19 | impl std::error::Error for Error {}
 20 | 
 21 | impl Display for Error {
 22 |     fn fmt(&self, f: &mut std::fmt::Formatter) -> fmt::Result {
 23 |         match self {
 24 |             Error::Config(s) | Error::Internal(s) | Error::Parse(s) | Error::Value(s) => {
 25 |                 write!(f, "{}", s)
 26 |             }
 27 |             Error::Abort => write!(f, "Operation aborted"),
 28 |             Error::Serialization => write!(f, "Serialization failure, retry transaction"),
 29 |             Error::ReadOnly => write!(f, "Read-only transaction"),
 30 |         }
 31 |     }
 32 | }
 33 | 
 34 | impl serde::ser::Error for Error {
 35 |     fn custom<T: Display>(msg: T) -> Self {
 36 |         Error::Internal(msg.to_string())
 37 |     }
 38 | }
 39 | 
 40 | impl serde::de::Error for Error {
 41 |     fn custom<T: Display>(msg: T) -> Self {
 42 |         Error::Internal(msg.to_string())
 43 |     }
 44 | }
 45 | 
 46 | impl From<Box<bincode::ErrorKind>> for Error {
 47 |     fn from(err: Box<bincode::ErrorKind>) -> Self {
 48 |         Error::Internal(err.to_string())
 49 |     }
 50 | }
 51 | 
 52 | impl From<config::ConfigError> for Error {
 53 |     fn from(err: config::ConfigError) -> Self {
 54 |         Error::Config(err.to_string())
 55 |     }
 56 | }
 57 | 
 58 | impl From<hex::FromHexError> for Error {
 59 |     fn from(err: hex::FromHexError) -> Self {
 60 |         Error::Internal(err.to_string())
 61 |     }
 62 | }
 63 | 
 64 | impl From<log::ParseLevelError> for Error {
 65 |     fn from(err: log::ParseLevelError) -> Self {
 66 |         Error::Config(err.to_string())
 67 |     }
 68 | }
 69 | 
 70 | impl From<log::SetLoggerError> for Error {
 71 |     fn from(err: log::SetLoggerError) -> Self {
 72 |         Error::Config(err.to_string())
 73 |     }
 74 | }
 75 | 
 76 | impl From<regex::Error> for Error {
 77 |     fn from(err: regex::Error) -> Self {
 78 |         Error::Value(err.to_string())
 79 |     }
 80 | }
 81 | 
 82 | impl From<rustyline::error::ReadlineError> for Error {
 83 |     fn from(err: rustyline::error::ReadlineError) -> Self {
 84 |         Error::Internal(err.to_string())
 85 |     }
 86 | }
 87 | 
 88 | impl From<std::array::TryFromSliceError> for Error {
 89 |     fn from(err: std::array::TryFromSliceError) -> Self {
 90 |         Error::Internal(err.to_string())
 91 |     }
 92 | }
 93 | 
 94 | impl From<std::num::TryFromIntError> for Error {
 95 |     fn from(err: std::num::TryFromIntError) -> Self {
 96 |         Error::Value(err.to_string())
 97 |     }
 98 | }
 99 | 
100 | impl From<std::io::Error> for Error {
101 |     fn from(err: std::io::Error) -> Self {
102 |         Error::Internal(err.to_string())
103 |     }
104 | }
105 | 
106 | impl From<std::net::AddrParseError> for Error {
107 |     fn from(err: std::net::AddrParseError) -> Self {
108 |         Error::Internal(err.to_string())
109 |     }
110 | }
111 | 
112 | impl From<std::num::ParseFloatError> for Error {
113 |     fn from(err: std::num::ParseFloatError) -> Self {
114 |         Error::Parse(err.to_string())
115 |     }
116 | }
117 | 
118 | impl From<std::num::ParseIntError> for Error {
119 |     fn from(err: std::num::ParseIntError) -> Self {
120 |         Error::Parse(err.to_string())
121 |     }
122 | }
123 | 
124 | impl From<std::string::FromUtf8Error> for Error {
125 |     fn from(err: std::string::FromUtf8Error) -> Self {
126 |         Error::Internal(err.to_string())
127 |     }
128 | }
129 | 
130 | impl<T> From<std::sync::PoisonError<T>> for Error {
131 |     fn from(err: std::sync::PoisonError<T>) -> Self {
132 |         Error::Internal(err.to_string())
133 |     }
134 | }
135 | 
136 | impl From<tokio::task::JoinError> for Error {
137 |     fn from(err: tokio::task::JoinError) -> Self {
138 |         Error::Internal(err.to_string())
139 |     }
140 | }
141 | 
142 | impl From<tokio::sync::mpsc::error::TryRecvError> for Error {
143 |     fn from(err: tokio::sync::mpsc::error::TryRecvError) -> Self {
144 |         Error::Internal(err.to_string())
145 |     }
146 | }
147 | 
148 | impl<T> From<tokio::sync::mpsc::error::SendError<T>> for Error {
149 |     fn from(err: tokio::sync::mpsc::error::SendError<T>) -> Self {
150 |         Error::Internal(err.to_string())
151 |     }
152 | }
153 | 
154 | impl<T> From<tokio::sync::mpsc::error::TrySendError<T>> for Error {
155 |     fn from(err: tokio::sync::mpsc::error::TrySendError<T>) -> Self {
156 |         Error::Internal(err.to_string())
157 |     }
158 | }
159 | 
160 | impl From<tokio::sync::oneshot::error::RecvError> for Error {
161 |     fn from(err: tokio::sync::oneshot::error::RecvError) -> Self {
162 |         Error::Internal(err.to_string())
163 |     }
164 | }
165 | 


--------------------------------------------------------------------------------
/src/lib.rs:
--------------------------------------------------------------------------------
 1 | #![warn(clippy::all)]
 2 | #![allow(clippy::new_without_default)]
 3 | #![allow(clippy::unneeded_field_pattern)]
 4 | 
 5 | pub mod client;
 6 | pub mod error;
 7 | pub mod raft;
 8 | pub mod server;
 9 | pub mod sql;
10 | pub mod storage;
11 | 
12 | pub use client::Client;
13 | pub use server::Server;
14 | 


--------------------------------------------------------------------------------
/src/raft/message.rs:
--------------------------------------------------------------------------------
  1 | use super::{Entry, Index, NodeID, Status, Term};
  2 | use crate::error::Result;
  3 | 
  4 | use serde_derive::{Deserialize, Serialize};
  5 | 
  6 | /// A message address.
  7 | #[derive(Clone, Debug, Eq, Hash, PartialEq, Serialize, Deserialize)]
  8 | pub enum Address {
  9 |     /// Broadcast to all peers. Only valid as an outbound recipient (to).
 10 |     Broadcast,
 11 |     /// A node with the specified node ID (local or remote). Valid both as
 12 |     /// sender and recipient.
 13 |     Node(NodeID),
 14 |     /// A local client. Can only send ClientRequest messages, and receive
 15 |     /// ClientResponse messages.
 16 |     Client,
 17 | }
 18 | 
 19 | impl Address {
 20 |     /// Unwraps the node ID, or panics if address is not of kind Node.
 21 |     pub fn unwrap(&self) -> NodeID {
 22 |         match self {
 23 |             Self::Node(id) => *id,
 24 |             _ => panic!("unwrap called on non-Node address {:?}", self),
 25 |         }
 26 |     }
 27 | }
 28 | 
 29 | /// A message passed between Raft nodes.
 30 | #[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
 31 | pub struct Message {
 32 |     /// The current term of the sender. Must be set, unless the sender is
 33 |     /// Address::Client, in which case it must be 0.
 34 |     pub term: Term,
 35 |     /// The sender address.
 36 |     pub from: Address,
 37 |     /// The recipient address.
 38 |     pub to: Address,
 39 |     /// The message payload.
 40 |     pub event: Event,
 41 | }
 42 | 
 43 | /// An event contained within messages.
 44 | #[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
 45 | pub enum Event {
 46 |     /// Leaders send periodic heartbeats to its followers.
 47 |     Heartbeat {
 48 |         /// The index of the leader's last committed log entry.
 49 |         commit_index: Index,
 50 |         /// The term of the leader's last committed log entry.
 51 |         commit_term: Term,
 52 |     },
 53 |     /// Followers confirm loyalty to leader after heartbeats.
 54 |     ConfirmLeader {
 55 |         /// The commit_index of the original leader heartbeat, to confirm
 56 |         /// read requests.
 57 |         commit_index: Index,
 58 |         /// If false, the follower does not have the entry at commit_index
 59 |         /// and would like the leader to replicate it.
 60 |         has_committed: bool,
 61 |     },
 62 | 
 63 |     /// Candidates solicit votes from all peers when campaigning for leadership.
 64 |     SolicitVote {
 65 |         // The index of the candidate's last stored log entry
 66 |         last_index: Index,
 67 |         // The term of the candidate's last stored log entry
 68 |         last_term: Term,
 69 |     },
 70 | 
 71 |     /// Followers may grant a single vote to a candidate per term, on a
 72 |     /// first-come basis. Candidates implicitly vote for themselves.
 73 |     GrantVote,
 74 | 
 75 |     /// Leaders replicate log entries to followers by appending it to their log.
 76 |     AppendEntries {
 77 |         /// The index of the log entry immediately preceding the submitted commands.
 78 |         base_index: Index,
 79 |         /// The term of the log entry immediately preceding the submitted commands.
 80 |         base_term: Term,
 81 |         /// Commands to replicate.
 82 |         entries: Vec<Entry>,
 83 |     },
 84 |     /// Followers may accept a set of log entries from a leader.
 85 |     AcceptEntries {
 86 |         /// The index of the last log entry.
 87 |         last_index: Index,
 88 |     },
 89 |     /// Followers may also reject a set of log entries from a leader.
 90 |     RejectEntries,
 91 | 
 92 |     /// A client request. This can be submitted to the leader, or to a follower
 93 |     /// which will forward it to its leader. If there is no leader, or the
 94 |     /// leader or term changes, the request is aborted with an Error::Abort
 95 |     /// ClientResponse and the client must retry.
 96 |     ClientRequest {
 97 |         /// The request ID. This is arbitrary, but must be globally unique for
 98 |         /// the duration of the request.
 99 |         id: RequestID,
100 |         /// The request.
101 |         request: Request,
102 |     },
103 | 
104 |     /// A client response.
105 |     ClientResponse {
106 |         /// The response ID. This matches the ID of the ClientRequest.
107 |         id: RequestID,
108 |         /// The response, or an error.
109 |         response: Result<Response>,
110 |     },
111 | }
112 | 
113 | /// A client request ID.
114 | pub type RequestID = Vec<u8>;
115 | 
116 | /// A client request.
117 | #[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
118 | pub enum Request {
119 |     Query(Vec<u8>),
120 |     Mutate(Vec<u8>),
121 |     Status,
122 | }
123 | 
124 | /// A client response.
125 | #[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
126 | pub enum Response {
127 |     Query(Vec<u8>),
128 |     Mutate(Vec<u8>),
129 |     Status(Status),
130 | }
131 | 


--------------------------------------------------------------------------------
/src/raft/mod.rs:
--------------------------------------------------------------------------------
 1 | mod log;
 2 | mod message;
 3 | mod node;
 4 | mod server;
 5 | mod state;
 6 | 
 7 | pub use self::log::{Engine, Entry, Index, Log};
 8 | pub use message::{Address, Event, Message, Request, RequestID, Response};
 9 | pub use node::{Node, NodeID, Status, Term};
10 | pub use server::Server;
11 | pub use state::{Driver, Instruction, State};
12 | 


--------------------------------------------------------------------------------
/src/raft/server.rs:
--------------------------------------------------------------------------------
  1 | use super::{Address, Event, Log, Message, Node, NodeID, Request, Response, State};
  2 | use crate::error::{Error, Result};
  3 | 
  4 | use ::log::{debug, error};
  5 | use futures::{sink::SinkExt as _, FutureExt as _};
  6 | use std::collections::HashMap;
  7 | use std::time::Duration;
  8 | use tokio::net::{TcpListener, TcpStream};
  9 | use tokio::sync::{mpsc, oneshot};
 10 | use tokio_stream::wrappers::{ReceiverStream, TcpListenerStream, UnboundedReceiverStream};
 11 | use tokio_stream::StreamExt as _;
 12 | use tokio_util::codec::{Framed, LengthDelimitedCodec};
 13 | use uuid::Uuid;
 14 | 
 15 | /// The interval between Raft ticks, the unit of time for e.g. heartbeats and
 16 | /// elections.
 17 | const TICK_INTERVAL: Duration = Duration::from_millis(100);
 18 | 
 19 | /// A Raft server.
 20 | pub struct Server {
 21 |     node: Node,
 22 |     peers: HashMap<NodeID, String>,
 23 |     node_rx: mpsc::UnboundedReceiver<Message>,
 24 | }
 25 | 
 26 | impl Server {
 27 |     /// Creates a new Raft cluster
 28 |     pub async fn new(
 29 |         id: NodeID,
 30 |         peers: HashMap<NodeID, String>,
 31 |         log: Log,
 32 |         state: Box<dyn State>,
 33 |     ) -> Result<Self> {
 34 |         let (node_tx, node_rx) = mpsc::unbounded_channel();
 35 |         Ok(Self {
 36 |             node: Node::new(id, peers.keys().copied().collect(), log, state, node_tx).await?,
 37 |             peers,
 38 |             node_rx,
 39 |         })
 40 |     }
 41 | 
 42 |     /// Connects to peers and serves requests.
 43 |     pub async fn serve(
 44 |         self,
 45 |         listener: TcpListener,
 46 |         client_rx: mpsc::UnboundedReceiver<(Request, oneshot::Sender<Result<Response>>)>,
 47 |     ) -> Result<()> {
 48 |         let (tcp_in_tx, tcp_in_rx) = mpsc::unbounded_channel::<Message>();
 49 |         let (tcp_out_tx, tcp_out_rx) = mpsc::unbounded_channel::<Message>();
 50 |         let (task, tcp_receiver) = Self::tcp_receive(listener, tcp_in_tx).remote_handle();
 51 |         tokio::spawn(task);
 52 |         let (task, tcp_sender) = Self::tcp_send(self.peers, tcp_out_rx).remote_handle();
 53 |         tokio::spawn(task);
 54 |         let (task, eventloop) =
 55 |             Self::eventloop(self.node, self.node_rx, client_rx, tcp_in_rx, tcp_out_tx)
 56 |                 .remote_handle();
 57 |         tokio::spawn(task);
 58 | 
 59 |         tokio::try_join!(tcp_receiver, tcp_sender, eventloop)?;
 60 |         Ok(())
 61 |     }
 62 | 
 63 |     /// Runs the event loop.
 64 |     async fn eventloop(
 65 |         mut node: Node,
 66 |         node_rx: mpsc::UnboundedReceiver<Message>,
 67 |         client_rx: mpsc::UnboundedReceiver<(Request, oneshot::Sender<Result<Response>>)>,
 68 |         tcp_rx: mpsc::UnboundedReceiver<Message>,
 69 |         tcp_tx: mpsc::UnboundedSender<Message>,
 70 |     ) -> Result<()> {
 71 |         let mut node_rx = UnboundedReceiverStream::new(node_rx);
 72 |         let mut tcp_rx = UnboundedReceiverStream::new(tcp_rx);
 73 |         let mut client_rx = UnboundedReceiverStream::new(client_rx);
 74 | 
 75 |         let mut ticker = tokio::time::interval(TICK_INTERVAL);
 76 |         let mut requests = HashMap::<Vec<u8>, oneshot::Sender<Result<Response>>>::new();
 77 |         loop {
 78 |             tokio::select! {
 79 |                 _ = ticker.tick() => node = node.tick()?,
 80 | 
 81 |                 Some(msg) = tcp_rx.next() => node = node.step(msg)?,
 82 | 
 83 |                 Some(msg) = node_rx.next() => {
 84 |                     match msg {
 85 |                         Message{to: Address::Node(_), ..} => tcp_tx.send(msg)?,
 86 |                         Message{to: Address::Broadcast, ..} => tcp_tx.send(msg)?,
 87 |                         Message{to: Address::Client, event: Event::ClientResponse{ id, response }, ..} => {
 88 |                             if let Some(response_tx) = requests.remove(&id) {
 89 |                                 response_tx
 90 |                                     .send(response)
 91 |                                     .map_err(|e| Error::Internal(format!("Failed to send response {:?}", e)))?;
 92 |                             }
 93 |                         }
 94 |                         _ => return Err(Error::Internal(format!("Unexpected message {:?}", msg))),
 95 |                     }
 96 |                 }
 97 | 
 98 |                 Some((request, response_tx)) = client_rx.next() => {
 99 |                     let id = Uuid::new_v4().as_bytes().to_vec();
100 |                     let msg = Message{
101 |                         from: Address::Client,
102 |                         to: Address::Node(node.id()),
103 |                         term: 0,
104 |                         event: Event::ClientRequest{id: id.clone(), request},
105 |                     };
106 |                     node = node.step(msg)?;
107 |                     requests.insert(id, response_tx);
108 |                 }
109 |             }
110 |         }
111 |     }
112 | 
113 |     /// Receives inbound messages from peers via TCP.
114 |     async fn tcp_receive(
115 |         listener: TcpListener,
116 |         in_tx: mpsc::UnboundedSender<Message>,
117 |     ) -> Result<()> {
118 |         let mut listener = TcpListenerStream::new(listener);
119 |         while let Some(socket) = listener.try_next().await? {
120 |             let peer = socket.peer_addr()?;
121 |             let peer_in_tx = in_tx.clone();
122 |             tokio::spawn(async move {
123 |                 debug!("Raft peer {} connected", peer);
124 |                 match Self::tcp_receive_peer(socket, peer_in_tx).await {
125 |                     Ok(()) => debug!("Raft peer {} disconnected", peer),
126 |                     Err(err) => error!("Raft peer {} error: {}", peer, err.to_string()),
127 |                 };
128 |             });
129 |         }
130 |         Ok(())
131 |     }
132 | 
133 |     /// Receives inbound messages from a peer via TCP.
134 |     async fn tcp_receive_peer(
135 |         socket: TcpStream,
136 |         in_tx: mpsc::UnboundedSender<Message>,
137 |     ) -> Result<()> {
138 |         let mut stream = tokio_serde::SymmetricallyFramed::<_, Message, _>::new(
139 |             Framed::new(socket, LengthDelimitedCodec::new()),
140 |             tokio_serde::formats::SymmetricalBincode::<Message>::default(),
141 |         );
142 |         while let Some(message) = stream.try_next().await? {
143 |             in_tx.send(message)?;
144 |         }
145 |         Ok(())
146 |     }
147 | 
148 |     /// Sends outbound messages to peers via TCP.
149 |     async fn tcp_send(
150 |         peers: HashMap<NodeID, String>,
151 |         out_rx: mpsc::UnboundedReceiver<Message>,
152 |     ) -> Result<()> {
153 |         let mut out_rx = UnboundedReceiverStream::new(out_rx);
154 |         let mut peer_txs: HashMap<NodeID, mpsc::Sender<Message>> = HashMap::new();
155 | 
156 |         for (id, addr) in peers.into_iter() {
157 |             let (tx, rx) = mpsc::channel::<Message>(1000);
158 |             peer_txs.insert(id, tx);
159 |             tokio::spawn(Self::tcp_send_peer(addr, rx));
160 |         }
161 | 
162 |         while let Some(message) = out_rx.next().await {
163 |             let to = match message.to {
164 |                 Address::Broadcast => peer_txs.keys().copied().collect(),
165 |                 Address::Node(peer) => vec![peer],
166 |                 addr => {
167 |                     error!("Received outbound message for non-TCP address {:?}", addr);
168 |                     continue;
169 |                 }
170 |             };
171 |             for id in to {
172 |                 match peer_txs.get_mut(&id) {
173 |                     Some(tx) => match tx.try_send(message.clone()) {
174 |                         Ok(()) => {}
175 |                         Err(mpsc::error::TrySendError::Full(_)) => {
176 |                             debug!("Full send buffer for peer {}, discarding message", id)
177 |                         }
178 |                         Err(error) => return Err(error.into()),
179 |                     },
180 |                     None => error!("Received outbound message for unknown peer {}", id),
181 |                 }
182 |             }
183 |         }
184 |         Ok(())
185 |     }
186 | 
187 |     /// Sends outbound messages to a peer, continuously reconnecting.
188 |     async fn tcp_send_peer(addr: String, out_rx: mpsc::Receiver<Message>) {
189 |         let mut out_rx = ReceiverStream::new(out_rx);
190 |         loop {
191 |             match TcpStream::connect(&addr).await {
192 |                 Ok(socket) => {
193 |                     debug!("Connected to Raft peer {}", addr);
194 |                     match Self::tcp_send_peer_session(socket, &mut out_rx).await {
195 |                         Ok(()) => break,
196 |                         Err(err) => error!("Failed sending to Raft peer {}: {}", addr, err),
197 |                     }
198 |                 }
199 |                 Err(err) => error!("Failed connecting to Raft peer {}: {}", addr, err),
200 |             }
201 |             tokio::time::sleep(Duration::from_millis(1000)).await;
202 |         }
203 |         debug!("Disconnected from Raft peer {}", addr);
204 |     }
205 | 
206 |     /// Sends outbound messages to a peer via a TCP session.
207 |     async fn tcp_send_peer_session(
208 |         socket: TcpStream,
209 |         out_rx: &mut ReceiverStream<Message>,
210 |     ) -> Result<()> {
211 |         let mut stream = tokio_serde::SymmetricallyFramed::<_, Message, _>::new(
212 |             Framed::new(socket, LengthDelimitedCodec::new()),
213 |             tokio_serde::formats::SymmetricalBincode::<Message>::default(),
214 |         );
215 |         while let Some(message) = out_rx.next().await {
216 |             stream.send(message).await?;
217 |         }
218 |         Ok(())
219 |     }
220 | }
221 | 


--------------------------------------------------------------------------------
/src/server.rs:
--------------------------------------------------------------------------------
  1 | use crate::error::{Error, Result};
  2 | use crate::raft;
  3 | use crate::sql;
  4 | use crate::sql::engine::Engine as _;
  5 | use crate::sql::execution::ResultSet;
  6 | use crate::sql::schema::{Catalog as _, Table};
  7 | use crate::sql::types::Row;
  8 | 
  9 | use ::log::{debug, error, info};
 10 | use futures::sink::SinkExt as _;
 11 | use serde_derive::{Deserialize, Serialize};
 12 | use std::collections::HashMap;
 13 | use tokio::net::{TcpListener, TcpStream};
 14 | use tokio::sync::mpsc;
 15 | use tokio_stream::wrappers::TcpListenerStream;
 16 | use tokio_stream::StreamExt as _;
 17 | use tokio_util::codec::{Framed, LengthDelimitedCodec};
 18 | 
 19 | /// A entangledb server.
 20 | /// It encapsulates the Raft consensus server and SQL server functionalities.
 21 | /// The server manages both Raft and SQL client connections, processing incoming
 22 | /// requests and dispatching them to the appropriate internal components.
 23 | pub struct Server {
 24 |     raft: raft::Server,
 25 |     raft_listener: Option<TcpListener>,
 26 |     sql_listener: Option<TcpListener>,
 27 | }
 28 | 
 29 | impl Server {
 30 |     /// Creates a new entangledb server.
 31 |     /// Initializes a new server instance with the provided Raft configuration.
 32 |     /// 
 33 |     /// # Arguments
 34 |     /// * `id` - The unique identifier for the Raft node.
 35 |     /// * `peers` - A map of peer node IDs to their associated network addresses.
 36 |     /// * `raft_log` - The Raft log implementation.
 37 |     /// * `raft_state` - The persistent state storage for the Raft consensus algorithm.
 38 |     ///
 39 |     /// # Returns
 40 |     /// A result containing the new server instance or an error if initialization fails.
 41 |     pub async fn new(
 42 |         id: raft::NodeID,
 43 |         peers: HashMap<raft::NodeID, String>,
 44 |         raft_log: raft::Log,
 45 |         raft_state: Box<dyn raft::State>,
 46 |     ) -> Result<Self> {
 47 |         Ok(Server {
 48 |             raft: raft::Server::new(id, peers, raft_log, raft_state).await?,
 49 |             raft_listener: None,
 50 |             sql_listener: None,
 51 |         })
 52 |     }
 53 | 
 54 |     /// Starts listening on the given ports. Must be called before serve.
 55 |     /// Sets up the TCP listeners for both SQL and Raft communication.
 56 |     ///
 57 |     /// # Arguments
 58 |     /// * `sql_addr` - The address to listen for SQL client connections.
 59 |     /// * `raft_addr` - The address to listen for Raft peer connections.
 60 |     ///
 61 |     /// # Returns
 62 |     /// A result containing the server instance with listeners configured or an error if listening fails.
 63 |     pub async fn listen(mut self, sql_addr: &str, raft_addr: &str) -> Result<Self> {
 64 |         let (sql, raft) =
 65 |             tokio::try_join!(TcpListener::bind(sql_addr), TcpListener::bind(raft_addr),)?;
 66 |         info!("Listening on {} (SQL) and {} (Raft)", sql.local_addr()?, raft.local_addr()?);
 67 |         self.sql_listener = Some(sql);
 68 |         self.raft_listener = Some(raft);
 69 |         Ok(self)
 70 |     }
 71 | 
 72 |     /// Serves Raft and SQL requests until the returned future is dropped. Consumes the server.
 73 |     /// Starts the event loop for handling incoming Raft and SQL connections.
 74 |     /// This function will run indefinitely until the server is shut down.
 75 |     ///
 76 |     /// # Returns
 77 |     /// A result indicating the success or failure of the server event loop.
 78 |     pub async fn serve(self) -> Result<()> {
 79 |         let sql_listener = self
 80 |             .sql_listener
 81 |             .ok_or_else(|| Error::Internal("Must listen before serving".into()))?;
 82 |         let raft_listener = self
 83 |             .raft_listener
 84 |             .ok_or_else(|| Error::Internal("Must listen before serving".into()))?;
 85 |         let (raft_tx, raft_rx) = mpsc::unbounded_channel();
 86 |         let sql_engine = sql::engine::Raft::new(raft_tx);
 87 | 
 88 |         tokio::try_join!(
 89 |             self.raft.serve(raft_listener, raft_rx),
 90 |             Self::serve_sql(sql_listener, sql_engine),
 91 |         )?;
 92 |         Ok(())
 93 |     }
 94 | 
 95 |     /// Serves SQL clients.
 96 |     /// Accepts incoming SQL client connections and handles their requests in separate tasks.
 97 |     ///
 98 |     /// # Arguments
 99 |     /// * `listener` - The TCP listener for SQL client connections.
100 |     /// * `engine` - The SQL engine instance used for executing SQL commands.
101 |     ///
102 |     /// # Returns
103 |     /// A result indicating the success or failure of serving SQL clients.
104 |     async fn serve_sql(listener: TcpListener, engine: sql::engine::Raft) -> Result<()> {
105 |         let mut listener = TcpListenerStream::new(listener);
106 |         while let Some(socket) = listener.try_next().await? {
107 |             let peer = socket.peer_addr()?;
108 |             let session = Session::new(engine.clone())?;
109 |             tokio::spawn(async move {
110 |                 info!("Client {} connected", peer);
111 |                 match session.handle(socket).await {
112 |                     Ok(()) => info!("Client {} disconnected", peer),
113 |                     Err(err) => error!("Client {} error: {}", peer, err),
114 |                 }
115 |             });
116 |         }
117 |         Ok(())
118 |     }
119 | }
120 | 
121 | /// A client request.
122 | /// Enumerates the different types of requests that a client can send to the server.
123 | #[derive(Debug, Serialize, Deserialize)]
124 | pub enum Request {
125 |     Execute(String),
126 |     GetTable(String),
127 |     ListTables,
128 |     Status,
129 | }
130 | 
131 | /// A server response.
132 | /// Enumerates the different types of responses that the server can send back to the client.
133 | #[derive(Debug, Serialize, Deserialize)]
134 | pub enum Response {
135 |     Execute(ResultSet),
136 |     Row(Option<Row>),
137 |     GetTable(Table),
138 |     ListTables(Vec<String>),
139 |     Status(sql::engine::Status),
140 | }
141 | 
142 | /// A client session coupled to a SQL session.
143 | /// Manages the state and communication for a single client's connection to the SQL server.
144 | pub struct Session {
145 |     engine: sql::engine::Raft,
146 |     sql: sql::engine::Session<sql::engine::Raft>,
147 | }
148 | 
149 | impl Session {
150 |     /// Creates a new client session.
151 |     /// Initializes a new session for a client connected to the SQL server.
152 |     ///
153 |     /// # Arguments
154 |     /// * `engine` - The SQL engine instance used for executing SQL commands.
155 |     ///
156 |     /// # Returns
157 |     /// A result containing the new session instance or an error if initialization fails.
158 |     fn new(engine: sql::engine::Raft) -> Result<Self> {
159 |         Ok(Self { sql: engine.session()?, engine })
160 |     }
161 | 
162 |     /// Handles a client connection.
163 |     /// Processes incoming requests from the client and sends appropriate responses.
164 |     ///
165 |     /// # Arguments
166 |     /// * `socket` - The TCP stream representing the client's connection.
167 |     ///
168 |     /// # Returns
169 |     /// A result indicating the success or failure of handling the client connection.
170 |     async fn handle(mut self, socket: TcpStream) -> Result<()> {
171 |         let mut stream = tokio_serde::Framed::new(
172 |             Framed::new(socket, LengthDelimitedCodec::new()),
173 |             tokio_serde::formats::Bincode::default(),
174 |         );
175 |         while let Some(request) = stream.try_next().await? {
176 |             let mut response = tokio::task::block_in_place(|| self.request(request));
177 |             let mut rows: Box<dyn Iterator<Item = Result<Response>> + Send> =
178 |                 Box::new(std::iter::empty());
179 |             if let Ok(Response::Execute(ResultSet::Query { rows: ref mut resultrows, .. })) =
180 |                 &mut response
181 |             {
182 |                 rows = Box::new(
183 |                     std::mem::replace(resultrows, Box::new(std::iter::empty()))
184 |                         .map(|result| result.map(|row| Response::Row(Some(row))))
185 |                         .chain(std::iter::once(Ok(Response::Row(None))))
186 |                         .scan(false, |err_sent, response| match (&err_sent, &response) {
187 |                             (true, _) => None,
188 |                             (_, Err(error)) => {
189 |                                 *err_sent = true;
190 |                                 Some(Err(error.clone()))
191 |                             }
192 |                             _ => Some(response),
193 |                         })
194 |                         .fuse(),
195 |                 );
196 |             }
197 |             stream.send(response).await?;
198 |             stream.send_all(&mut tokio_stream::iter(rows.map(Ok))).await?;
199 |         }
200 |         Ok(())
201 |     }
202 | 
203 |     /// Executes a request.
204 |     /// Processes a single request from the client and generates the corresponding response.
205 |     ///
206 |     /// # Arguments
207 |     /// * `request` - The client request to be processed.
208 |     ///
209 |     /// # Returns
210 |     /// A result containing the server response to the request or an error if processing fails.
211 |     pub fn request(&mut self, request: Request) -> Result<Response> {
212 |         debug!("Processing request {:?}", request);
213 |         let response = match request {
214 |             Request::Execute(query) => Response::Execute(self.sql.execute(&query)?),
215 |             Request::GetTable(table) => {
216 |                 Response::GetTable(self.sql.read_with_txn(|txn| txn.must_read_table(&table))?)
217 |             }
218 |             Request::ListTables => Response::ListTables(
219 |                 self.sql.read_with_txn(|txn| Ok(txn.scan_tables()?.map(|t| t.name).collect()))?,
220 |             ),
221 |             Request::Status => Response::Status(self.engine.status()?),
222 |         };
223 |         debug!("Returning response {:?}", response);
224 |         Ok(response)
225 |     }
226 | }
227 | 
228 | impl Drop for Session {
229 |     fn drop(&mut self) {
230 |         /// Automatically rolls back any active transaction when the session is dropped.
231 |         tokio::task::block_in_place(|| self.sql.execute("ROLLBACK").ok());
232 |     }
233 | }
234 | 


--------------------------------------------------------------------------------
/src/sql/engine/mod.rs:
--------------------------------------------------------------------------------
  1 | //! The SQL engine provides fundamental CRUD storage operations.
  2 | mod kv;
  3 | pub mod raft;
  4 | pub use kv::KV;
  5 | pub use raft::{Raft, Status};
  6 | 
  7 | use super::execution::ResultSet;
  8 | use super::parser::{ast, Parser};
  9 | use super::plan::Plan;
 10 | use super::schema::Catalog;
 11 | use super::types::{Expression, Row, Value};
 12 | use crate::error::{Error, Result};
 13 | 
 14 | use std::collections::HashSet;
 15 | 
 16 | /// The SQL engine interface
 17 | pub trait Engine: Clone {
 18 |     /// The transaction type
 19 |     type Transaction: Transaction;
 20 | 
 21 |     /// Begins a read-write transaction.
 22 |     fn begin(&self) -> Result<Self::Transaction>;
 23 | 
 24 |     /// Begins a read-only transaction.
 25 |     fn begin_read_only(&self) -> Result<Self::Transaction>;
 26 | 
 27 |     /// Begins a read-only transaction as of a historical version.
 28 |     fn begin_as_of(&self, version: u64) -> Result<Self::Transaction>;
 29 | 
 30 |     /// Begins a session for executing individual statements
 31 |     fn session(&self) -> Result<Session<Self>> {
 32 |         Ok(Session { engine: self.clone(), txn: None })
 33 |     }
 34 | }
 35 | 
 36 | /// An SQL transaction
 37 | pub trait Transaction: Catalog {
 38 |     /// The transaction's version
 39 |     fn version(&self) -> u64;
 40 |     /// Whether the transaction is read-only
 41 |     fn read_only(&self) -> bool;
 42 | 
 43 |     /// Commits the transaction
 44 |     fn commit(self) -> Result<()>;
 45 |     /// Rolls back the transaction
 46 |     fn rollback(self) -> Result<()>;
 47 | 
 48 |     /// Creates a new table row
 49 |     fn create(&mut self, table: &str, row: Row) -> Result<()>;
 50 |     /// Deletes a table row
 51 |     fn delete(&mut self, table: &str, id: &Value) -> Result<()>;
 52 |     /// Reads a table row, if it exists
 53 |     fn read(&self, table: &str, id: &Value) -> Result<Option<Row>>;
 54 |     /// Reads an index entry, if it exists
 55 |     fn read_index(&self, table: &str, column: &str, value: &Value) -> Result<HashSet<Value>>;
 56 |     /// Scans a table's rows
 57 |     fn scan(&self, table: &str, filter: Option<Expression>) -> Result<Scan>;
 58 |     /// Scans a column's index entries
 59 |     fn scan_index(&self, table: &str, column: &str) -> Result<IndexScan>;
 60 |     /// Updates a table row
 61 |     fn update(&mut self, table: &str, id: &Value, row: Row) -> Result<()>;
 62 | }
 63 | 
 64 | /// An SQL session, which handles transaction control and simplified query execution
 65 | pub struct Session<E: Engine> {
 66 |     /// The underlying engine
 67 |     engine: E,
 68 |     /// The current session transaction, if any
 69 |     txn: Option<E::Transaction>,
 70 | }
 71 | 
 72 | impl<E: Engine + 'static> Session<E> {
 73 |     /// Executes a query, managing transaction status for the session
 74 |     pub fn execute(&mut self, query: &str) -> Result<ResultSet> {
 75 |         // FIXME We should match on self.txn as well, but get this error:
 76 |         // error[E0009]: cannot bind by-move and by-ref in the same pattern
 77 |         // ...which seems like an arbitrary compiler limitation
 78 |         match Parser::new(query).parse()? {
 79 |             ast::Statement::Begin { .. } if self.txn.is_some() => {
 80 |                 Err(Error::Value("Already in a transaction".into()))
 81 |             }
 82 |             ast::Statement::Begin { read_only: true, as_of: None } => {
 83 |                 let txn = self.engine.begin_read_only()?;
 84 |                 let result = ResultSet::Begin { version: txn.version(), read_only: true };
 85 |                 self.txn = Some(txn);
 86 |                 Ok(result)
 87 |             }
 88 |             ast::Statement::Begin { read_only: true, as_of: Some(version) } => {
 89 |                 let txn = self.engine.begin_as_of(version)?;
 90 |                 let result = ResultSet::Begin { version, read_only: true };
 91 |                 self.txn = Some(txn);
 92 |                 Ok(result)
 93 |             }
 94 |             ast::Statement::Begin { read_only: false, as_of: Some(_) } => {
 95 |                 Err(Error::Value("Can't start read-write transaction in a given version".into()))
 96 |             }
 97 |             ast::Statement::Begin { read_only: false, as_of: None } => {
 98 |                 let txn = self.engine.begin()?;
 99 |                 let result = ResultSet::Begin { version: txn.version(), read_only: false };
100 |                 self.txn = Some(txn);
101 |                 Ok(result)
102 |             }
103 |             ast::Statement::Commit | ast::Statement::Rollback if self.txn.is_none() => {
104 |                 Err(Error::Value("Not in a transaction".into()))
105 |             }
106 |             ast::Statement::Commit => {
107 |                 let txn = self.txn.take().unwrap();
108 |                 let version = txn.version();
109 |                 txn.commit()?;
110 |                 Ok(ResultSet::Commit { version })
111 |             }
112 |             ast::Statement::Rollback => {
113 |                 let txn = self.txn.take().unwrap();
114 |                 let version = txn.version();
115 |                 txn.rollback()?;
116 |                 Ok(ResultSet::Rollback { version })
117 |             }
118 |             ast::Statement::Explain(statement) => self.read_with_txn(|txn| {
119 |                 Ok(ResultSet::Explain(Plan::build(*statement, txn)?.optimize(txn)?.0))
120 |             }),
121 |             statement if self.txn.is_some() => Plan::build(statement, self.txn.as_mut().unwrap())?
122 |                 .optimize(self.txn.as_mut().unwrap())?
123 |                 .execute(self.txn.as_mut().unwrap()),
124 |             statement @ ast::Statement::Select { .. } => {
125 |                 let mut txn = self.engine.begin_read_only()?;
126 |                 let result =
127 |                     Plan::build(statement, &mut txn)?.optimize(&mut txn)?.execute(&mut txn);
128 |                 txn.rollback()?;
129 |                 result
130 |             }
131 |             statement => {
132 |                 let mut txn = self.engine.begin()?;
133 |                 match Plan::build(statement, &mut txn)?.optimize(&mut txn)?.execute(&mut txn) {
134 |                     Ok(result) => {
135 |                         txn.commit()?;
136 |                         Ok(result)
137 |                     }
138 |                     Err(error) => {
139 |                         txn.rollback()?;
140 |                         Err(error)
141 |                     }
142 |                 }
143 |             }
144 |         }
145 |     }
146 | 
147 |     /// Runs a read-only closure in the session's transaction, or a new
148 |     /// transaction if none is active.
149 |     ///
150 |     /// TODO: reconsider this
151 |     pub fn read_with_txn<R, F>(&mut self, f: F) -> Result<R>
152 |     where
153 |         F: FnOnce(&mut E::Transaction) -> Result<R>,
154 |     {
155 |         if let Some(ref mut txn) = self.txn {
156 |             return f(txn);
157 |         }
158 |         let mut txn = self.engine.begin_read_only()?;
159 |         let result = f(&mut txn);
160 |         txn.rollback()?;
161 |         result
162 |     }
163 | }
164 | 
165 | /// A row scan iterator
166 | pub type Scan = Box<dyn DoubleEndedIterator<Item = Result<Row>> + Send>;
167 | 
168 | /// An index scan iterator
169 | pub type IndexScan = Box<dyn DoubleEndedIterator<Item = Result<(Value, HashSet<Value>)>> + Send>;
170 | 


--------------------------------------------------------------------------------
/src/sql/engine/raft.rs:
--------------------------------------------------------------------------------
  1 | use super::super::schema::{Catalog, Table, Tables};
  2 | use super::super::types::{Expression, Row, Value};
  3 | use super::{Engine as _, IndexScan, Scan, Transaction as _};
  4 | use crate::error::{Error, Result};
  5 | use crate::raft::{self, Entry};
  6 | use crate::storage::{self, bincode, mvcc::TransactionState};
  7 | 
  8 | use serde::{de::DeserializeOwned, Deserialize, Serialize};
  9 | use std::collections::HashSet;
 10 | use tokio::sync::{mpsc, oneshot};
 11 | 
 12 | /// A Raft state machine mutation.
 13 | ///
 14 | /// TODO: use Cows for these.
 15 | #[derive(Clone, Serialize, Deserialize)]
 16 | enum Mutation {
 17 |     /// Begins a transaction
 18 |     Begin { read_only: bool, as_of: Option<u64> },
 19 |     /// Commits the given transaction
 20 |     Commit(TransactionState),
 21 |     /// Rolls back the given transaction
 22 |     Rollback(TransactionState),
 23 | 
 24 |     /// Creates a new row
 25 |     Create { txn: TransactionState, table: String, row: Row },
 26 |     /// Deletes a row
 27 |     Delete { txn: TransactionState, table: String, id: Value },
 28 |     /// Updates a row
 29 |     Update { txn: TransactionState, table: String, id: Value, row: Row },
 30 | 
 31 |     /// Creates a table
 32 |     CreateTable { txn: TransactionState, schema: Table },
 33 |     /// Deletes a table
 34 |     DeleteTable { txn: TransactionState, table: String },
 35 | }
 36 | 
 37 | /// A Raft state machine query.
 38 | ///
 39 | /// TODO: use Cows for these.
 40 | #[derive(Clone, Serialize, Deserialize)]
 41 | enum Query {
 42 |     /// Fetches engine status
 43 |     Status,
 44 | 
 45 |     /// Reads a row
 46 |     Read { txn: TransactionState, table: String, id: Value },
 47 |     /// Reads an index entry
 48 |     ReadIndex { txn: TransactionState, table: String, column: String, value: Value },
 49 |     /// Scans a table's rows
 50 |     Scan { txn: TransactionState, table: String, filter: Option<Expression> },
 51 |     /// Scans an index
 52 |     ScanIndex { txn: TransactionState, table: String, column: String },
 53 | 
 54 |     /// Scans the tables
 55 |     ScanTables { txn: TransactionState },
 56 |     /// Reads a table
 57 |     ReadTable { txn: TransactionState, table: String },
 58 | }
 59 | 
 60 | /// Status for the Raft SQL engine.
 61 | #[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
 62 | pub struct Status {
 63 |     pub raft: raft::Status,
 64 |     pub mvcc: storage::mvcc::Status,
 65 | }
 66 | 
 67 | /// A client for the local Raft node.
 68 | #[derive(Clone)]
 69 | struct Client {
 70 |     tx: mpsc::UnboundedSender<(raft::Request, oneshot::Sender<Result<raft::Response>>)>,
 71 | }
 72 | 
 73 | impl Client {
 74 |     /// Creates a new Raft client.
 75 |     fn new(
 76 |         tx: mpsc::UnboundedSender<(raft::Request, oneshot::Sender<Result<raft::Response>>)>,
 77 |     ) -> Self {
 78 |         Self { tx }
 79 |     }
 80 | 
 81 |     /// Executes a request against the Raft cluster.
 82 |     fn execute(&self, request: raft::Request) -> Result<raft::Response> {
 83 |         let (response_tx, response_rx) = oneshot::channel();
 84 |         self.tx.send((request, response_tx))?;
 85 |         futures::executor::block_on(response_rx)?
 86 |     }
 87 | 
 88 |     /// Mutates the Raft state machine, deserializing the response into the
 89 |     /// return type.
 90 |     fn mutate<V: DeserializeOwned>(&self, mutation: Mutation) -> Result<V> {
 91 |         match self.execute(raft::Request::Mutate(bincode::serialize(&mutation)?))? {
 92 |             raft::Response::Mutate(response) => Ok(bincode::deserialize(&response)?),
 93 |             resp => Err(Error::Internal(format!("Unexpected Raft mutation response {:?}", resp))),
 94 |         }
 95 |     }
 96 | 
 97 |     /// Queries the Raft state machine, deserializing the response into the
 98 |     /// return type.
 99 |     fn query<V: DeserializeOwned>(&self, query: Query) -> Result<V> {
100 |         match self.execute(raft::Request::Query(bincode::serialize(&query)?))? {
101 |             raft::Response::Query(response) => Ok(bincode::deserialize(&response)?),
102 |             resp => Err(Error::Internal(format!("Unexpected Raft query response {:?}", resp))),
103 |         }
104 |     }
105 | 
106 |     /// Fetches Raft node status.
107 |     fn status(&self) -> Result<raft::Status> {
108 |         match self.execute(raft::Request::Status)? {
109 |             raft::Response::Status(status) => Ok(status),
110 |             resp => Err(Error::Internal(format!("Unexpected Raft status response {:?}", resp))),
111 |         }
112 |     }
113 | }
114 | 
115 | /// A SQL engine using a Raft state machine.
116 | #[derive(Clone)]
117 | pub struct Raft {
118 |     client: Client,
119 | }
120 | 
121 | impl Raft {
122 |     /// Creates a new Raft-based SQL engine.
123 |     pub fn new(
124 |         tx: mpsc::UnboundedSender<(raft::Request, oneshot::Sender<Result<raft::Response>>)>,
125 |     ) -> Self {
126 |         Self { client: Client::new(tx) }
127 |     }
128 | 
129 |     /// Creates an underlying state machine for a Raft engine.
130 |     pub fn new_state<E: storage::engine::Engine>(engine: E) -> Result<State<E>> {
131 |         State::new(engine)
132 |     }
133 | 
134 |     /// Returns Raft SQL engine status.
135 |     pub fn status(&self) -> Result<Status> {
136 |         Ok(Status { raft: self.client.status()?, mvcc: self.client.query(Query::Status)? })
137 |     }
138 | }
139 | 
140 | impl super::Engine for Raft {
141 |     type Transaction = Transaction;
142 | 
143 |     fn begin(&self) -> Result<Self::Transaction> {
144 |         Transaction::begin(self.client.clone(), false, None)
145 |     }
146 | 
147 |     fn begin_read_only(&self) -> Result<Self::Transaction> {
148 |         Transaction::begin(self.client.clone(), true, None)
149 |     }
150 | 
151 |     fn begin_as_of(&self, version: u64) -> Result<Self::Transaction> {
152 |         Transaction::begin(self.client.clone(), true, Some(version))
153 |     }
154 | }
155 | 
156 | /// A Raft-based SQL transaction.
157 | #[derive(Clone)]
158 | pub struct Transaction {
159 |     client: Client,
160 |     state: TransactionState,
161 | }
162 | 
163 | impl Transaction {
164 |     /// Starts a transaction in the given mode.
165 |     fn begin(client: Client, read_only: bool, as_of: Option<u64>) -> Result<Self> {
166 |         let state = client.mutate(Mutation::Begin { read_only, as_of })?;
167 |         Ok(Self { client, state })
168 |     }
169 | }
170 | 
171 | impl super::Transaction for Transaction {
172 |     fn version(&self) -> u64 {
173 |         self.state.version
174 |     }
175 | 
176 |     fn read_only(&self) -> bool {
177 |         self.state.read_only
178 |     }
179 | 
180 |     fn commit(self) -> Result<()> {
181 |         self.client.mutate(Mutation::Commit(self.state.clone()))
182 |     }
183 | 
184 |     fn rollback(self) -> Result<()> {
185 |         self.client.mutate(Mutation::Rollback(self.state.clone()))
186 |     }
187 | 
188 |     fn create(&mut self, table: &str, row: Row) -> Result<()> {
189 |         self.client.mutate(Mutation::Create {
190 |             txn: self.state.clone(),
191 |             table: table.to_string(),
192 |             row,
193 |         })
194 |     }
195 | 
196 |     fn delete(&mut self, table: &str, id: &Value) -> Result<()> {
197 |         self.client.mutate(Mutation::Delete {
198 |             txn: self.state.clone(),
199 |             table: table.to_string(),
200 |             id: id.clone(),
201 |         })
202 |     }
203 | 
204 |     fn read(&self, table: &str, id: &Value) -> Result<Option<Row>> {
205 |         self.client.query(Query::Read {
206 |             txn: self.state.clone(),
207 |             table: table.to_string(),
208 |             id: id.clone(),
209 |         })
210 |     }
211 | 
212 |     fn read_index(&self, table: &str, column: &str, value: &Value) -> Result<HashSet<Value>> {
213 |         self.client.query(Query::ReadIndex {
214 |             txn: self.state.clone(),
215 |             table: table.to_string(),
216 |             column: column.to_string(),
217 |             value: value.clone(),
218 |         })
219 |     }
220 | 
221 |     fn scan(&self, table: &str, filter: Option<Expression>) -> Result<Scan> {
222 |         Ok(Box::new(
223 |             self.client
224 |                 .query::<Vec<_>>(Query::Scan {
225 |                     txn: self.state.clone(),
226 |                     table: table.to_string(),
227 |                     filter,
228 |                 })?
229 |                 .into_iter()
230 |                 .map(Ok),
231 |         ))
232 |     }
233 | 
234 |     fn scan_index(&self, table: &str, column: &str) -> Result<IndexScan> {
235 |         Ok(Box::new(
236 |             self.client
237 |                 .query::<Vec<_>>(Query::ScanIndex {
238 |                     txn: self.state.clone(),
239 |                     table: table.to_string(),
240 |                     column: column.to_string(),
241 |                 })?
242 |                 .into_iter()
243 |                 .map(Ok),
244 |         ))
245 |     }
246 | 
247 |     fn update(&mut self, table: &str, id: &Value, row: Row) -> Result<()> {
248 |         self.client.mutate(Mutation::Update {
249 |             txn: self.state.clone(),
250 |             table: table.to_string(),
251 |             id: id.clone(),
252 |             row,
253 |         })
254 |     }
255 | }
256 | 
257 | impl Catalog for Transaction {
258 |     fn create_table(&mut self, table: Table) -> Result<()> {
259 |         self.client.mutate(Mutation::CreateTable { txn: self.state.clone(), schema: table })
260 |     }
261 | 
262 |     fn delete_table(&mut self, table: &str) -> Result<()> {
263 |         self.client
264 |             .mutate(Mutation::DeleteTable { txn: self.state.clone(), table: table.to_string() })
265 |     }
266 | 
267 |     fn read_table(&self, table: &str) -> Result<Option<Table>> {
268 |         self.client.query(Query::ReadTable { txn: self.state.clone(), table: table.to_string() })
269 |     }
270 | 
271 |     fn scan_tables(&self) -> Result<Tables> {
272 |         Ok(Box::new(
273 |             self.client.query::<Vec<_>>(Query::ScanTables { txn: self.state.clone() })?.into_iter(),
274 |         ))
275 |     }
276 | }
277 | 
278 | /// The Raft state machine for the Raft-based SQL engine, using a KV SQL engine
279 | pub struct State<E: storage::engine::Engine> {
280 |     /// The underlying KV SQL engine
281 |     engine: super::KV<E>,
282 |     /// The last applied index
283 |     applied_index: u64,
284 | }
285 | 
286 | impl<E: storage::engine::Engine> State<E> {
287 |     /// Creates a new Raft state maching using the given storage engine.
288 |     pub fn new(engine: E) -> Result<Self> {
289 |         let engine = super::KV::new(engine);
290 |         let applied_index = engine
291 |             .get_metadata(b"applied_index")?
292 |             .map(|b| bincode::deserialize(&b))
293 |             .unwrap_or(Ok(0))?;
294 |         Ok(State { engine, applied_index })
295 |     }
296 | 
297 |     /// Mutates the state machine.
298 |     fn mutate(&mut self, mutation: Mutation) -> Result<Vec<u8>> {
299 |         match mutation {
300 |             Mutation::Begin { read_only, as_of } => {
301 |                 let txn = if !read_only {
302 |                     self.engine.begin()?
303 |                 } else if let Some(version) = as_of {
304 |                     self.engine.begin_as_of(version)?
305 |                 } else {
306 |                     self.engine.begin_read_only()?
307 |                 };
308 |                 bincode::serialize(&txn.state())
309 |             }
310 |             Mutation::Commit(txn) => bincode::serialize(&self.engine.resume(txn)?.commit()?),
311 |             Mutation::Rollback(txn) => bincode::serialize(&self.engine.resume(txn)?.rollback()?),
312 | 
313 |             Mutation::Create { txn, table, row } => {
314 |                 bincode::serialize(&self.engine.resume(txn)?.create(&table, row)?)
315 |             }
316 |             Mutation::Delete { txn, table, id } => {
317 |                 bincode::serialize(&self.engine.resume(txn)?.delete(&table, &id)?)
318 |             }
319 |             Mutation::Update { txn, table, id, row } => {
320 |                 bincode::serialize(&self.engine.resume(txn)?.update(&table, &id, row)?)
321 |             }
322 | 
323 |             Mutation::CreateTable { txn, schema } => {
324 |                 bincode::serialize(&self.engine.resume(txn)?.create_table(schema)?)
325 |             }
326 |             Mutation::DeleteTable { txn, table } => {
327 |                 bincode::serialize(&self.engine.resume(txn)?.delete_table(&table)?)
328 |             }
329 |         }
330 |     }
331 | }
332 | 
333 | impl<E: storage::engine::Engine> raft::State for State<E> {
334 |     fn get_applied_index(&self) -> u64 {
335 |         self.applied_index
336 |     }
337 | 
338 |     fn apply(&mut self, entry: Entry) -> Result<Vec<u8>> {
339 |         assert_eq!(entry.index, self.applied_index + 1, "entry index not after applied index");
340 | 
341 |         let result = match &entry.command {
342 |             Some(command) => match self.mutate(bincode::deserialize(command)?) {
343 |                 error @ Err(Error::Internal(_)) => return error, // don't record as applied
344 |                 result => result,
345 |             },
346 |             None => Ok(Vec::new()),
347 |         };
348 |         self.applied_index = entry.index;
349 |         self.engine.set_metadata(b"applied_index", bincode::serialize(&entry.index)?)?;
350 |         result
351 |     }
352 | 
353 |     fn query(&self, command: Vec<u8>) -> Result<Vec<u8>> {
354 |         match bincode::deserialize(&command)? {
355 |             Query::Read { txn, table, id } => {
356 |                 bincode::serialize(&self.engine.resume(txn)?.read(&table, &id)?)
357 |             }
358 |             Query::ReadIndex { txn, table, column, value } => {
359 |                 bincode::serialize(&self.engine.resume(txn)?.read_index(&table, &column, &value)?)
360 |             }
361 |             // FIXME These need to stream rows somehow
362 |             Query::Scan { txn, table, filter } => bincode::serialize(
363 |                 &self.engine.resume(txn)?.scan(&table, filter)?.collect::<Result<Vec<_>>>()?,
364 |             ),
365 |             Query::ScanIndex { txn, table, column } => bincode::serialize(
366 |                 &self
367 |                     .engine
368 |                     .resume(txn)?
369 |                     .scan_index(&table, &column)?
370 |                     .collect::<Result<Vec<_>>>()?,
371 |             ),
372 |             Query::Status => bincode::serialize(&self.engine.kv.status()?),
373 | 
374 |             Query::ReadTable { txn, table } => {
375 |                 bincode::serialize(&self.engine.resume(txn)?.read_table(&table)?)
376 |             }
377 |             Query::ScanTables { txn } => {
378 |                 bincode::serialize(&self.engine.resume(txn)?.scan_tables()?.collect::<Vec<_>>())
379 |             }
380 |         }
381 |     }
382 | }
383 | 


--------------------------------------------------------------------------------
/src/sql/execution/aggregation.rs:
--------------------------------------------------------------------------------
  1 | use super::super::engine::Transaction;
  2 | use super::super::plan::Aggregate;
  3 | use super::super::types::{Column, Value};
  4 | use super::{Executor, ResultSet};
  5 | use crate::error::{Error, Result};
  6 | 
  7 | use std::cmp::Ordering;
  8 | use std::collections::HashMap;
  9 | 
 10 | /// An aggregation executor
 11 | pub struct Aggregation<T: Transaction> {
 12 |     source: Box<dyn Executor<T>>,
 13 |     aggregates: Vec<Aggregate>,
 14 |     accumulators: HashMap<Vec<Value>, Vec<Box<dyn Accumulator>>>,
 15 | }
 16 | 
 17 | impl<T: Transaction> Aggregation<T> {
 18 |     pub fn new(source: Box<dyn Executor<T>>, aggregates: Vec<Aggregate>) -> Box<Self> {
 19 |         Box::new(Self { source, aggregates, accumulators: HashMap::new() })
 20 |     }
 21 | }
 22 | 
 23 | impl<T: Transaction> Executor<T> for Aggregation<T> {
 24 |     #[allow(clippy::or_fun_call)]
 25 |     fn execute(mut self: Box<Self>, txn: &mut T) -> Result<ResultSet> {
 26 |         let agg_count = self.aggregates.len();
 27 |         match self.source.execute(txn)? {
 28 |             ResultSet::Query { columns, mut rows } => {
 29 |                 while let Some(mut row) = rows.next().transpose()? {
 30 |                     self.accumulators
 31 |                         .entry(row.split_off(self.aggregates.len()))
 32 |                         .or_insert(self.aggregates.iter().map(<dyn Accumulator>::from).collect())
 33 |                         .iter_mut()
 34 |                         .zip(row)
 35 |                         .try_for_each(|(acc, value)| acc.accumulate(&value))?
 36 |                 }
 37 |                 // If there were no rows and no group-by columns, return a row of empty accumulators:
 38 |                 // SELECT COUNT(*) FROM t WHERE FALSE
 39 |                 if self.accumulators.is_empty() && self.aggregates.len() == columns.len() {
 40 |                     self.accumulators.insert(
 41 |                         Vec::new(),
 42 |                         self.aggregates.iter().map(<dyn Accumulator>::from).collect(),
 43 |                     );
 44 |                 }
 45 |                 Ok(ResultSet::Query {
 46 |                     columns: columns
 47 |                         .into_iter()
 48 |                         .enumerate()
 49 |                         .map(|(i, c)| if i < agg_count { Column { name: None } } else { c })
 50 |                         .collect(),
 51 |                     rows: Box::new(self.accumulators.into_iter().map(|(bucket, accs)| {
 52 |                         Ok(accs
 53 |                             .into_iter()
 54 |                             .map(|acc| acc.aggregate())
 55 |                             .chain(bucket.into_iter())
 56 |                             .collect())
 57 |                     })),
 58 |                 })
 59 |             }
 60 |             r => Err(Error::Internal(format!("Unexpected result {:?}", r))),
 61 |         }
 62 |     }
 63 | }
 64 | 
 65 | // An accumulator
 66 | pub trait Accumulator: std::fmt::Debug + Send {
 67 |     // Accumulates a value
 68 |     fn accumulate(&mut self, value: &Value) -> Result<()>;
 69 | 
 70 |     // Calculates a final aggregate
 71 |     fn aggregate(&self) -> Value;
 72 | }
 73 | 
 74 | impl dyn Accumulator {
 75 |     fn from(aggregate: &Aggregate) -> Box<dyn Accumulator> {
 76 |         match aggregate {
 77 |             Aggregate::Average => Box::new(Average::new()),
 78 |             Aggregate::Count => Box::new(Count::new()),
 79 |             Aggregate::Max => Box::new(Max::new()),
 80 |             Aggregate::Min => Box::new(Min::new()),
 81 |             Aggregate::Sum => Box::new(Sum::new()),
 82 |         }
 83 |     }
 84 | }
 85 | 
 86 | // Count non-null values
 87 | #[derive(Debug)]
 88 | pub struct Count {
 89 |     count: u64,
 90 | }
 91 | 
 92 | impl Count {
 93 |     pub fn new() -> Self {
 94 |         Self { count: 0 }
 95 |     }
 96 | }
 97 | 
 98 | impl Accumulator for Count {
 99 |     fn accumulate(&mut self, value: &Value) -> Result<()> {
100 |         match value {
101 |             Value::Null => {}
102 |             _ => self.count += 1,
103 |         }
104 |         Ok(())
105 |     }
106 | 
107 |     fn aggregate(&self) -> Value {
108 |         Value::Integer(self.count as i64)
109 |     }
110 | }
111 | 
112 | // Average value
113 | #[derive(Debug)]
114 | pub struct Average {
115 |     count: Count,
116 |     sum: Sum,
117 | }
118 | 
119 | impl Average {
120 |     pub fn new() -> Self {
121 |         Self { count: Count::new(), sum: Sum::new() }
122 |     }
123 | }
124 | 
125 | impl Accumulator for Average {
126 |     fn accumulate(&mut self, value: &Value) -> Result<()> {
127 |         self.count.accumulate(value)?;
128 |         self.sum.accumulate(value)?;
129 |         Ok(())
130 |     }
131 | 
132 |     fn aggregate(&self) -> Value {
133 |         match (self.sum.aggregate(), self.count.aggregate()) {
134 |             (Value::Integer(s), Value::Integer(c)) => Value::Integer(s / c),
135 |             (Value::Float(s), Value::Integer(c)) => Value::Float(s / c as f64),
136 |             _ => Value::Null,
137 |         }
138 |     }
139 | }
140 | 
141 | // Maximum value
142 | #[derive(Debug)]
143 | pub struct Max {
144 |     max: Option<Value>,
145 | }
146 | 
147 | impl Max {
148 |     pub fn new() -> Self {
149 |         Self { max: None }
150 |     }
151 | }
152 | 
153 | impl Accumulator for Max {
154 |     fn accumulate(&mut self, value: &Value) -> Result<()> {
155 |         if let Some(max) = &mut self.max {
156 |             match value.partial_cmp(max) {
157 |                 _ if max.datatype() != value.datatype() => *max = Value::Null,
158 |                 None => *max = Value::Null,
159 |                 Some(Ordering::Greater) => *max = value.clone(),
160 |                 Some(Ordering::Equal) | Some(Ordering::Less) => {}
161 |             };
162 |         } else {
163 |             self.max = Some(value.clone())
164 |         }
165 |         Ok(())
166 |     }
167 | 
168 |     fn aggregate(&self) -> Value {
169 |         match &self.max {
170 |             Some(value) => value.clone(),
171 |             None => Value::Null,
172 |         }
173 |     }
174 | }
175 | 
176 | // Minimum value
177 | #[derive(Debug)]
178 | pub struct Min {
179 |     min: Option<Value>,
180 | }
181 | 
182 | impl Min {
183 |     pub fn new() -> Self {
184 |         Self { min: None }
185 |     }
186 | }
187 | 
188 | impl Accumulator for Min {
189 |     fn accumulate(&mut self, value: &Value) -> Result<()> {
190 |         if let Some(min) = &mut self.min {
191 |             match value.partial_cmp(min) {
192 |                 _ if min.datatype() != value.datatype() => *min = Value::Null,
193 |                 None => *min = Value::Null,
194 |                 Some(Ordering::Less) => *min = value.clone(),
195 |                 Some(Ordering::Equal) | Some(Ordering::Greater) => {}
196 |             };
197 |         } else {
198 |             self.min = Some(value.clone())
199 |         }
200 |         Ok(())
201 |     }
202 | 
203 |     fn aggregate(&self) -> Value {
204 |         match &self.min {
205 |             Some(value) => value.clone(),
206 |             None => Value::Null,
207 |         }
208 |     }
209 | }
210 | 
211 | // Sum of values
212 | #[derive(Debug)]
213 | pub struct Sum {
214 |     sum: Option<Value>,
215 | }
216 | 
217 | impl Sum {
218 |     pub fn new() -> Self {
219 |         Self { sum: None }
220 |     }
221 | }
222 | 
223 | impl Accumulator for Sum {
224 |     fn accumulate(&mut self, value: &Value) -> Result<()> {
225 |         self.sum = match (&self.sum, value) {
226 |             (Some(Value::Integer(s)), Value::Integer(i)) => Some(Value::Integer(s + i)),
227 |             (Some(Value::Float(s)), Value::Float(f)) => Some(Value::Float(s + f)),
228 |             (None, Value::Integer(i)) => Some(Value::Integer(*i)),
229 |             (None, Value::Float(f)) => Some(Value::Float(*f)),
230 |             _ => Some(Value::Null),
231 |         };
232 |         Ok(())
233 |     }
234 | 
235 |     fn aggregate(&self) -> Value {
236 |         match &self.sum {
237 |             Some(value) => value.clone(),
238 |             None => Value::Null,
239 |         }
240 |     }
241 | }
242 | 


--------------------------------------------------------------------------------
/src/sql/execution/join.rs:
--------------------------------------------------------------------------------
  1 | use super::super::engine::Transaction;
  2 | use super::super::types::{Expression, Rows};
  3 | use super::{Executor, ResultSet, Row, Value};
  4 | use crate::error::{Error, Result};
  5 | 
  6 | use std::collections::HashMap;
  7 | 
  8 | /// A nested loop join executor, which checks each row in the left source against every row in
  9 | /// the right source using the given predicate.
 10 | pub struct NestedLoopJoin<T: Transaction> {
 11 |     left: Box<dyn Executor<T>>,
 12 |     right: Box<dyn Executor<T>>,
 13 |     predicate: Option<Expression>,
 14 |     outer: bool,
 15 | }
 16 | 
 17 | impl<T: Transaction> NestedLoopJoin<T> {
 18 |     pub fn new(
 19 |         left: Box<dyn Executor<T>>,
 20 |         right: Box<dyn Executor<T>>,
 21 |         predicate: Option<Expression>,
 22 |         outer: bool,
 23 |     ) -> Box<Self> {
 24 |         Box::new(Self { left, right, predicate, outer })
 25 |     }
 26 | }
 27 | 
 28 | impl<T: Transaction> Executor<T> for NestedLoopJoin<T> {
 29 |     fn execute(self: Box<Self>, txn: &mut T) -> Result<ResultSet> {
 30 |         if let ResultSet::Query { mut columns, rows } = self.left.execute(txn)? {
 31 |             if let ResultSet::Query { columns: rcolumns, rows: rrows } = self.right.execute(txn)? {
 32 |                 let right_width = rcolumns.len();
 33 |                 columns.extend(rcolumns);
 34 |                 // FIXME Since making the iterators or sources clonable is non-trivial (requiring
 35 |                 // either avoiding Rust standard iterators or making sources generic), we simply
 36 |                 // fetch the entire right result as a vector.
 37 |                 return Ok(ResultSet::Query {
 38 |                     rows: Box::new(NestedLoopRows::new(
 39 |                         rows,
 40 |                         rrows.collect::<Result<Vec<_>>>()?,
 41 |                         right_width,
 42 |                         self.predicate,
 43 |                         self.outer,
 44 |                     )),
 45 |                     columns,
 46 |                 });
 47 |             }
 48 |         }
 49 |         Err(Error::Internal("Unexpected result set".into()))
 50 |     }
 51 | }
 52 | 
 53 | struct NestedLoopRows {
 54 |     left: Rows,
 55 |     left_row: Option<Result<Row>>,
 56 |     right: Box<dyn Iterator<Item = Row> + Send>,
 57 |     right_vec: Vec<Row>,
 58 |     right_empty: Vec<Value>,
 59 |     right_hit: bool,
 60 |     predicate: Option<Expression>,
 61 |     outer: bool,
 62 | }
 63 | 
 64 | impl NestedLoopRows {
 65 |     fn new(
 66 |         mut left: Rows,
 67 |         right: Vec<Row>,
 68 |         right_width: usize,
 69 |         predicate: Option<Expression>,
 70 |         outer: bool,
 71 |     ) -> Self {
 72 |         Self {
 73 |             left_row: left.next(),
 74 |             left,
 75 |             right: Box::new(right.clone().into_iter()),
 76 |             right_vec: right,
 77 |             right_empty: std::iter::repeat(Value::Null).take(right_width).collect(),
 78 |             right_hit: false,
 79 |             predicate,
 80 |             outer,
 81 |         }
 82 |     }
 83 | 
 84 |     // Tries to get the next joined row, with error handling.
 85 |     fn try_next(&mut self) -> Result<Option<Row>> {
 86 |         // While there is a valid left row, look for a right-hand match to return.
 87 |         while let Some(Ok(left_row)) = self.left_row.clone() {
 88 |             // If there is a hit in the remaining right rows, return it.
 89 |             if let Some(row) = self.try_next_hit(&left_row)? {
 90 |                 self.right_hit = true;
 91 |                 return Ok(Some(row));
 92 |             }
 93 | 
 94 |             // Otherwise, continue with the next left row and reset the right source.
 95 |             self.left_row = self.left.next();
 96 |             self.right = Box::new(self.right_vec.clone().into_iter());
 97 | 
 98 |             // If this is an outer join, when we reach the end of the right items without a hit,
 99 |             // we should return a row with nulls for the right fields.
100 |             if self.outer && !self.right_hit {
101 |                 let mut row = left_row;
102 |                 row.extend(self.right_empty.clone());
103 |                 return Ok(Some(row));
104 |             }
105 |             self.right_hit = false;
106 |         }
107 |         self.left_row.clone().transpose()
108 |     }
109 | 
110 |     /// Tries to find the next combined row that matches the predicate in the remaining right rows.
111 |     fn try_next_hit(&mut self, left_row: &[Value]) -> Result<Option<Row>> {
112 |         for right_row in &mut self.right {
113 |             let mut row = left_row.to_vec();
114 |             row.extend(right_row);
115 |             if let Some(predicate) = &self.predicate {
116 |                 match predicate.evaluate(Some(&row))? {
117 |                     Value::Boolean(true) => return Ok(Some(row)),
118 |                     Value::Boolean(false) => {}
119 |                     Value::Null => {}
120 |                     value => {
121 |                         return Err(Error::Value(format!(
122 |                             "Join predicate returned {}, expected boolean",
123 |                             value
124 |                         )))
125 |                     }
126 |                 }
127 |             } else {
128 |                 return Ok(Some(row));
129 |             }
130 |         }
131 |         Ok(None)
132 |     }
133 | }
134 | 
135 | impl Iterator for NestedLoopRows {
136 |     type Item = Result<Row>;
137 | 
138 |     fn next(&mut self) -> Option<Self::Item> {
139 |         self.try_next().transpose()
140 |     }
141 | }
142 | 
143 | /// A hash join executor
144 | pub struct HashJoin<T: Transaction> {
145 |     left: Box<dyn Executor<T>>,
146 |     left_field: usize,
147 |     right: Box<dyn Executor<T>>,
148 |     right_field: usize,
149 |     outer: bool,
150 | }
151 | 
152 | impl<T: Transaction> HashJoin<T> {
153 |     pub fn new(
154 |         left: Box<dyn Executor<T>>,
155 |         left_field: usize,
156 |         right: Box<dyn Executor<T>>,
157 |         right_field: usize,
158 |         outer: bool,
159 |     ) -> Box<Self> {
160 |         Box::new(Self { left, left_field, right, right_field, outer })
161 |     }
162 | }
163 | 
164 | impl<T: Transaction> Executor<T> for HashJoin<T> {
165 |     fn execute(self: Box<Self>, txn: &mut T) -> Result<ResultSet> {
166 |         if let ResultSet::Query { mut columns, rows } = self.left.execute(txn)? {
167 |             if let ResultSet::Query { columns: rcolumns, rows: rrows } = self.right.execute(txn)? {
168 |                 let (l, r, outer) = (self.left_field, self.right_field, self.outer);
169 |                 let right: HashMap<Value, Row> = rrows
170 |                     .map(|res| match res {
171 |                         Ok(row) if row.len() <= r => {
172 |                             Err(Error::Internal(format!("Right index {} out of bounds", r)))
173 |                         }
174 |                         Ok(row) => Ok((row[r].clone(), row)),
175 |                         Err(err) => Err(err),
176 |                     })
177 |                     .collect::<Result<_>>()?;
178 |                 let empty = std::iter::repeat(Value::Null).take(rcolumns.len());
179 |                 columns.extend(rcolumns);
180 |                 let rows = Box::new(rows.filter_map(move |res| match res {
181 |                     Ok(row) if row.len() <= l => {
182 |                         Some(Err(Error::Value(format!("Left index {} out of bounds", l))))
183 |                     }
184 |                     Ok(mut row) => match right.get(&row[l]) {
185 |                         Some(hit) => {
186 |                             row.extend(hit.clone());
187 |                             Some(Ok(row))
188 |                         }
189 |                         None if outer => {
190 |                             row.extend(empty.clone());
191 |                             Some(Ok(row))
192 |                         }
193 |                         None => None,
194 |                     },
195 |                     Err(err) => Some(Err(err)),
196 |                 }));
197 |                 return Ok(ResultSet::Query { columns, rows });
198 |             }
199 |         }
200 |         Err(Error::Internal("Unexpected result set".into()))
201 |     }
202 | }
203 | 


--------------------------------------------------------------------------------
/src/sql/execution/mod.rs:
--------------------------------------------------------------------------------
  1 | mod aggregation;
  2 | mod join;
  3 | mod mutation;
  4 | mod query;
  5 | mod schema;
  6 | mod source;
  7 | 
  8 | use aggregation::Aggregation;
  9 | use join::{HashJoin, NestedLoopJoin};
 10 | use mutation::{Delete, Insert, Update};
 11 | use query::{Filter, Limit, Offset, Order, Projection};
 12 | use schema::{CreateTable, DropTable};
 13 | use source::{IndexLookup, KeyLookup, Nothing, Scan};
 14 | 
 15 | use super::engine::Transaction;
 16 | use super::plan::Node;
 17 | use super::types::{Columns, Row, Rows, Value};
 18 | use crate::error::{Error, Result};
 19 | 
 20 | use derivative::Derivative;
 21 | use serde_derive::{Deserialize, Serialize};
 22 | 
 23 | /// A plan executor
 24 | pub trait Executor<T: Transaction> {
 25 |     /// Executes the executor, consuming it and returning a result set
 26 |     fn execute(self: Box<Self>, txn: &mut T) -> Result<ResultSet>;
 27 | }
 28 | 
 29 | impl<T: Transaction + 'static> dyn Executor<T> {
 30 |     /// Builds an executor for a plan node, consuming it
 31 |     pub fn build(node: Node) -> Box<dyn Executor<T>> {
 32 |         match node {
 33 |             Node::Aggregation { source, aggregates } => {
 34 |                 Aggregation::new(Self::build(*source), aggregates)
 35 |             }
 36 |             Node::CreateTable { schema } => CreateTable::new(schema),
 37 |             Node::Delete { table, source } => Delete::new(table, Self::build(*source)),
 38 |             Node::DropTable { table } => DropTable::new(table),
 39 |             Node::Filter { source, predicate } => Filter::new(Self::build(*source), predicate),
 40 |             Node::HashJoin { left, left_field, right, right_field, outer } => HashJoin::new(
 41 |                 Self::build(*left),
 42 |                 left_field.0,
 43 |                 Self::build(*right),
 44 |                 right_field.0,
 45 |                 outer,
 46 |             ),
 47 |             Node::IndexLookup { table, alias: _, column, values } => {
 48 |                 IndexLookup::new(table, column, values)
 49 |             }
 50 |             Node::Insert { table, columns, expressions } => {
 51 |                 Insert::new(table, columns, expressions)
 52 |             }
 53 |             Node::KeyLookup { table, alias: _, keys } => KeyLookup::new(table, keys),
 54 |             Node::Limit { source, limit } => Limit::new(Self::build(*source), limit),
 55 |             Node::NestedLoopJoin { left, left_size: _, right, predicate, outer } => {
 56 |                 NestedLoopJoin::new(Self::build(*left), Self::build(*right), predicate, outer)
 57 |             }
 58 |             Node::Nothing => Nothing::new(),
 59 |             Node::Offset { source, offset } => Offset::new(Self::build(*source), offset),
 60 |             Node::Order { source, orders } => Order::new(Self::build(*source), orders),
 61 |             Node::Projection { source, expressions } => {
 62 |                 Projection::new(Self::build(*source), expressions)
 63 |             }
 64 |             Node::Scan { table, filter, alias: _ } => Scan::new(table, filter),
 65 |             Node::Update { table, source, expressions } => Update::new(
 66 |                 table,
 67 |                 Self::build(*source),
 68 |                 expressions.into_iter().map(|(i, _, e)| (i, e)).collect(),
 69 |             ),
 70 |         }
 71 |     }
 72 | }
 73 | 
 74 | /// An executor result set
 75 | #[derive(Derivative, Serialize, Deserialize)]
 76 | #[derivative(Debug, PartialEq)]
 77 | pub enum ResultSet {
 78 |     // Transaction started
 79 |     Begin {
 80 |         version: u64,
 81 |         read_only: bool,
 82 |     },
 83 |     // Transaction committed
 84 |     Commit {
 85 |         version: u64,
 86 |     },
 87 |     // Transaction rolled back
 88 |     Rollback {
 89 |         version: u64,
 90 |     },
 91 |     // Rows created
 92 |     Create {
 93 |         count: u64,
 94 |     },
 95 |     // Rows deleted
 96 |     Delete {
 97 |         count: u64,
 98 |     },
 99 |     // Rows updated
100 |     Update {
101 |         count: u64,
102 |     },
103 |     // Table created
104 |     CreateTable {
105 |         name: String,
106 |     },
107 |     // Table dropped
108 |     DropTable {
109 |         name: String,
110 |     },
111 |     // Query result
112 |     Query {
113 |         columns: Columns,
114 |         #[derivative(Debug = "ignore")]
115 |         #[derivative(PartialEq = "ignore")]
116 |         #[serde(skip, default = "ResultSet::empty_rows")]
117 |         rows: Rows,
118 |     },
119 |     // Explain result
120 |     Explain(Node),
121 | }
122 | 
123 | impl ResultSet {
124 |     /// Creates an empty row iterator, for use by serde(default).
125 |     fn empty_rows() -> Rows {
126 |         Box::new(std::iter::empty())
127 |     }
128 | 
129 |     /// Converts the ResultSet into a row, or errors if not a query result with rows.
130 |     pub fn into_row(self) -> Result<Row> {
131 |         if let ResultSet::Query { mut rows, .. } = self {
132 |             rows.next().transpose()?.ok_or_else(|| Error::Value("No rows returned".into()))
133 |         } else {
134 |             Err(Error::Value(format!("Not a query result: {:?}", self)))
135 |         }
136 |     }
137 | 
138 |     /// Converts the ResultSet into a value, if possible.
139 |     pub fn into_value(self) -> Result<Value> {
140 |         self.into_row()?.into_iter().next().ok_or_else(|| Error::Value("No value returned".into()))
141 |     }
142 | }
143 | 


--------------------------------------------------------------------------------
/src/sql/execution/mutation.rs:
--------------------------------------------------------------------------------
  1 | use super::super::engine::Transaction;
  2 | use super::super::schema::Table;
  3 | use super::super::types::{Expression, Row, Value};
  4 | use super::{Executor, ResultSet};
  5 | use crate::error::{Error, Result};
  6 | 
  7 | use std::collections::{HashMap, HashSet};
  8 | 
  9 | /// An INSERT executor
 10 | pub struct Insert {
 11 |     table: String,
 12 |     columns: Vec<String>,
 13 |     rows: Vec<Vec<Expression>>,
 14 | }
 15 | 
 16 | impl Insert {
 17 |     pub fn new(table: String, columns: Vec<String>, rows: Vec<Vec<Expression>>) -> Box<Self> {
 18 |         Box::new(Self { table, columns, rows })
 19 |     }
 20 | 
 21 |     // Builds a row from a set of column names and values, padding it with default values.
 22 |     pub fn make_row(table: &Table, columns: &[String], values: Vec<Value>) -> Result<Row> {
 23 |         if columns.len() != values.len() {
 24 |             return Err(Error::Value("Column and value counts do not match".into()));
 25 |         }
 26 |         let mut inputs = HashMap::new();
 27 |         for (c, v) in columns.iter().zip(values.into_iter()) {
 28 |             table.get_column(c)?;
 29 |             if inputs.insert(c.clone(), v).is_some() {
 30 |                 return Err(Error::Value(format!("Column {} given multiple times", c)));
 31 |             }
 32 |         }
 33 |         let mut row = Row::new();
 34 |         for column in table.columns.iter() {
 35 |             if let Some(value) = inputs.get(&column.name) {
 36 |                 row.push(value.clone())
 37 |             } else if let Some(value) = &column.default {
 38 |                 row.push(value.clone())
 39 |             } else {
 40 |                 return Err(Error::Value(format!("No value given for column {}", column.name)));
 41 |             }
 42 |         }
 43 |         Ok(row)
 44 |     }
 45 | 
 46 |     /// Pads a row with default values where possible.
 47 |     fn pad_row(table: &Table, mut row: Row) -> Result<Row> {
 48 |         for column in table.columns.iter().skip(row.len()) {
 49 |             if let Some(default) = &column.default {
 50 |                 row.push(default.clone())
 51 |             } else {
 52 |                 return Err(Error::Value(format!("No default value for column {}", column.name)));
 53 |             }
 54 |         }
 55 |         Ok(row)
 56 |     }
 57 | }
 58 | 
 59 | impl<T: Transaction> Executor<T> for Insert {
 60 |     fn execute(self: Box<Self>, txn: &mut T) -> Result<ResultSet> {
 61 |         let table = txn.must_read_table(&self.table)?;
 62 |         let mut count = 0;
 63 |         for expressions in self.rows {
 64 |             let mut row =
 65 |                 expressions.into_iter().map(|expr| expr.evaluate(None)).collect::<Result<_>>()?;
 66 |             if self.columns.is_empty() {
 67 |                 row = Self::pad_row(&table, row)?;
 68 |             } else {
 69 |                 row = Self::make_row(&table, &self.columns, row)?;
 70 |             }
 71 |             txn.create(&table.name, row)?;
 72 |             count += 1;
 73 |         }
 74 |         Ok(ResultSet::Create { count })
 75 |     }
 76 | }
 77 | 
 78 | /// An UPDATE executor
 79 | pub struct Update<T: Transaction> {
 80 |     table: String,
 81 |     source: Box<dyn Executor<T>>,
 82 |     expressions: Vec<(usize, Expression)>,
 83 | }
 84 | 
 85 | impl<T: Transaction> Update<T> {
 86 |     pub fn new(
 87 |         table: String,
 88 |         source: Box<dyn Executor<T>>,
 89 |         expressions: Vec<(usize, Expression)>,
 90 |     ) -> Box<Self> {
 91 |         Box::new(Self { table, source, expressions })
 92 |     }
 93 | }
 94 | 
 95 | impl<T: Transaction> Executor<T> for Update<T> {
 96 |     fn execute(self: Box<Self>, txn: &mut T) -> Result<ResultSet> {
 97 |         match self.source.execute(txn)? {
 98 |             ResultSet::Query { mut rows, .. } => {
 99 |                 let table = txn.must_read_table(&self.table)?;
100 | 
101 |                 // The iterator will see our changes, such that the same item may be iterated over
102 |                 // multiple times. We keep track of the primary keys here to avoid that, althought
103 |                 // it may cause ballooning memory usage for large updates.
104 |                 //
105 |                 // FIXME This is not safe for primary key updates, which may still be processed
106 |                 // multiple times - it should be possible to come up with a pathological case that
107 |                 // loops forever (e.g. UPDATE test SET id = id + 1).
108 |                 let mut updated = HashSet::new();
109 |                 while let Some(row) = rows.next().transpose()? {
110 |                     let id = table.get_row_key(&row)?;
111 |                     if updated.contains(&id) {
112 |                         continue;
113 |                     }
114 |                     let mut new = row.clone();
115 |                     for (field, expr) in &self.expressions {
116 |                         new[*field] = expr.evaluate(Some(&row))?;
117 |                     }
118 |                     txn.update(&table.name, &id, new)?;
119 |                     updated.insert(id);
120 |                 }
121 |                 Ok(ResultSet::Update { count: updated.len() as u64 })
122 |             }
123 |             r => Err(Error::Internal(format!("Unexpected response {:?}", r))),
124 |         }
125 |     }
126 | }
127 | 
128 | /// A DELETE executor
129 | pub struct Delete<T: Transaction> {
130 |     table: String,
131 |     source: Box<dyn Executor<T>>,
132 | }
133 | 
134 | impl<T: Transaction> Delete<T> {
135 |     pub fn new(table: String, source: Box<dyn Executor<T>>) -> Box<Self> {
136 |         Box::new(Self { table, source })
137 |     }
138 | }
139 | 
140 | impl<T: Transaction> Executor<T> for Delete<T> {
141 |     fn execute(self: Box<Self>, txn: &mut T) -> Result<ResultSet> {
142 |         let table = txn.must_read_table(&self.table)?;
143 |         let mut count = 0;
144 |         match self.source.execute(txn)? {
145 |             ResultSet::Query { mut rows, .. } => {
146 |                 while let Some(row) = rows.next().transpose()? {
147 |                     txn.delete(&table.name, &table.get_row_key(&row)?)?;
148 |                     count += 1
149 |                 }
150 |                 Ok(ResultSet::Delete { count })
151 |             }
152 |             r => Err(Error::Internal(format!("Unexpected result {:?}", r))),
153 |         }
154 |     }
155 | }
156 | 


--------------------------------------------------------------------------------
/src/sql/execution/query.rs:
--------------------------------------------------------------------------------
  1 | use super::super::engine::Transaction;
  2 | use super::super::plan::Direction;
  3 | use super::super::types::{Column, Expression, Row, Value};
  4 | use super::{Executor, ResultSet};
  5 | use crate::error::{Error, Result};
  6 | 
  7 | /// A filter executor
  8 | pub struct Filter<T: Transaction> {
  9 |     source: Box<dyn Executor<T>>,
 10 |     predicate: Expression,
 11 | }
 12 | 
 13 | impl<T: Transaction> Filter<T> {
 14 |     pub fn new(source: Box<dyn Executor<T>>, predicate: Expression) -> Box<Self> {
 15 |         Box::new(Self { source, predicate })
 16 |     }
 17 | }
 18 | 
 19 | impl<T: Transaction> Executor<T> for Filter<T> {
 20 |     fn execute(self: Box<Self>, txn: &mut T) -> Result<ResultSet> {
 21 |         if let ResultSet::Query { columns, rows } = self.source.execute(txn)? {
 22 |             let predicate = self.predicate;
 23 |             Ok(ResultSet::Query {
 24 |                 columns,
 25 |                 rows: Box::new(rows.filter_map(move |r| {
 26 |                     r.and_then(|row| match predicate.evaluate(Some(&row))? {
 27 |                         Value::Boolean(true) => Ok(Some(row)),
 28 |                         Value::Boolean(false) => Ok(None),
 29 |                         Value::Null => Ok(None),
 30 |                         value => Err(Error::Value(format!(
 31 |                             "Filter returned {}, expected boolean",
 32 |                             value
 33 |                         ))),
 34 |                     })
 35 |                     .transpose()
 36 |                 })),
 37 |             })
 38 |         } else {
 39 |             Err(Error::Internal("Unexpected result".into()))
 40 |         }
 41 |     }
 42 | }
 43 | 
 44 | /// A projection executor
 45 | pub struct Projection<T: Transaction> {
 46 |     source: Box<dyn Executor<T>>,
 47 |     expressions: Vec<(Expression, Option<String>)>,
 48 | }
 49 | 
 50 | impl<T: Transaction> Projection<T> {
 51 |     pub fn new(
 52 |         source: Box<dyn Executor<T>>,
 53 |         expressions: Vec<(Expression, Option<String>)>,
 54 |     ) -> Box<Self> {
 55 |         Box::new(Self { source, expressions })
 56 |     }
 57 | }
 58 | 
 59 | impl<T: Transaction> Executor<T> for Projection<T> {
 60 |     fn execute(self: Box<Self>, txn: &mut T) -> Result<ResultSet> {
 61 |         if let ResultSet::Query { columns, rows } = self.source.execute(txn)? {
 62 |             let (expressions, labels): (Vec<Expression>, Vec<Option<String>>) =
 63 |                 self.expressions.into_iter().unzip();
 64 |             let columns = expressions
 65 |                 .iter()
 66 |                 .enumerate()
 67 |                 .map(|(i, e)| {
 68 |                     if let Some(Some(label)) = labels.get(i) {
 69 |                         Column { name: Some(label.clone()) }
 70 |                     } else if let Expression::Field(i, _) = e {
 71 |                         columns.get(*i).cloned().unwrap_or(Column { name: None })
 72 |                     } else {
 73 |                         Column { name: None }
 74 |                     }
 75 |                 })
 76 |                 .collect();
 77 |             let rows = Box::new(rows.map(move |r| {
 78 |                 r.and_then(|row| {
 79 |                     expressions.iter().map(|e| e.evaluate(Some(&row))).collect::<Result<_>>()
 80 |                 })
 81 |             }));
 82 |             Ok(ResultSet::Query { columns, rows })
 83 |         } else {
 84 |             Err(Error::Internal("Unexpected result".into()))
 85 |         }
 86 |     }
 87 | }
 88 | 
 89 | /// An ORDER BY executor
 90 | pub struct Order<T: Transaction> {
 91 |     source: Box<dyn Executor<T>>,
 92 |     order: Vec<(Expression, Direction)>,
 93 | }
 94 | 
 95 | impl<T: Transaction> Order<T> {
 96 |     pub fn new(source: Box<dyn Executor<T>>, order: Vec<(Expression, Direction)>) -> Box<Self> {
 97 |         Box::new(Self { source, order })
 98 |     }
 99 | }
100 | 
101 | impl<T: Transaction> Executor<T> for Order<T> {
102 |     fn execute(self: Box<Self>, txn: &mut T) -> Result<ResultSet> {
103 |         match self.source.execute(txn)? {
104 |             ResultSet::Query { columns, mut rows } => {
105 |                 // FIXME Since we can't return errors from the sort_by closure, we have to
106 |                 // pre-evaluate all values. This means that we can't short-circuit evaluation,
107 |                 // and have to temporarily store evaluated values, which is bad for performance
108 |                 // and memory usage respectively
109 |                 struct Item {
110 |                     row: Row,
111 |                     values: Vec<Value>,
112 |                 }
113 | 
114 |                 let mut items = Vec::new();
115 |                 while let Some(row) = rows.next().transpose()? {
116 |                     let mut values = Vec::new();
117 |                     for (expr, _) in self.order.iter() {
118 |                         values.push(expr.evaluate(Some(&row))?);
119 |                     }
120 |                     items.push(Item { row, values })
121 |                 }
122 | 
123 |                 let order = &self.order;
124 |                 items.sort_by(|a, b| {
125 |                     for (i, (_, order)) in order.iter().enumerate() {
126 |                         let value_a = &a.values[i];
127 |                         let value_b = &b.values[i];
128 |                         match value_a.partial_cmp(value_b) {
129 |                             Some(std::cmp::Ordering::Equal) => {}
130 |                             Some(o) => {
131 |                                 return if *order == Direction::Ascending { o } else { o.reverse() }
132 |                             }
133 |                             None => {}
134 |                         }
135 |                     }
136 |                     std::cmp::Ordering::Equal
137 |                 });
138 | 
139 |                 Ok(ResultSet::Query {
140 |                     columns,
141 |                     rows: Box::new(items.into_iter().map(|i| Ok(i.row))),
142 |                 })
143 |             }
144 |             r => Err(Error::Internal(format!("Unexpected result {:?}", r))),
145 |         }
146 |     }
147 | }
148 | 
149 | /// A LIMIT executor
150 | pub struct Limit<T: Transaction> {
151 |     source: Box<dyn Executor<T>>,
152 |     limit: u64,
153 | }
154 | 
155 | impl<T: Transaction> Limit<T> {
156 |     pub fn new(source: Box<dyn Executor<T>>, limit: u64) -> Box<Self> {
157 |         Box::new(Self { source, limit })
158 |     }
159 | }
160 | 
161 | impl<T: Transaction> Executor<T> for Limit<T> {
162 |     fn execute(self: Box<Self>, txn: &mut T) -> Result<ResultSet> {
163 |         if let ResultSet::Query { columns, rows } = self.source.execute(txn)? {
164 |             Ok(ResultSet::Query { columns, rows: Box::new(rows.take(self.limit as usize)) })
165 |         } else {
166 |             Err(Error::Internal("Unexpected result".into()))
167 |         }
168 |     }
169 | }
170 | 
171 | /// An OFFSET executor
172 | pub struct Offset<T: Transaction> {
173 |     source: Box<dyn Executor<T>>,
174 |     offset: u64,
175 | }
176 | 
177 | impl<T: Transaction> Offset<T> {
178 |     pub fn new(source: Box<dyn Executor<T>>, offset: u64) -> Box<Self> {
179 |         Box::new(Self { source, offset })
180 |     }
181 | }
182 | 
183 | impl<T: Transaction> Executor<T> for Offset<T> {
184 |     fn execute(self: Box<Self>, txn: &mut T) -> Result<ResultSet> {
185 |         if let ResultSet::Query { columns, rows } = self.source.execute(txn)? {
186 |             Ok(ResultSet::Query { columns, rows: Box::new(rows.skip(self.offset as usize)) })
187 |         } else {
188 |             Err(Error::Internal("Unexpected result".into()))
189 |         }
190 |     }
191 | }
192 | 


--------------------------------------------------------------------------------
/src/sql/execution/schema.rs:
--------------------------------------------------------------------------------
 1 | use super::super::engine::Transaction;
 2 | use super::super::schema::Table;
 3 | use super::{Executor, ResultSet};
 4 | use crate::error::Result;
 5 | 
 6 | /// A CREATE TABLE executor
 7 | pub struct CreateTable {
 8 |     table: Table,
 9 | }
10 | 
11 | impl CreateTable {
12 |     pub fn new(table: Table) -> Box<Self> {
13 |         Box::new(Self { table })
14 |     }
15 | }
16 | 
17 | impl<T: Transaction> Executor<T> for CreateTable {
18 |     fn execute(self: Box<Self>, txn: &mut T) -> Result<ResultSet> {
19 |         let name = self.table.name.clone();
20 |         txn.create_table(self.table)?;
21 |         Ok(ResultSet::CreateTable { name })
22 |     }
23 | }
24 | 
25 | /// A DROP TABLE executor
26 | pub struct DropTable {
27 |     table: String,
28 | }
29 | 
30 | impl DropTable {
31 |     pub fn new(table: String) -> Box<Self> {
32 |         Box::new(Self { table })
33 |     }
34 | }
35 | 
36 | impl<T: Transaction> Executor<T> for DropTable {
37 |     fn execute(self: Box<Self>, txn: &mut T) -> Result<ResultSet> {
38 |         txn.delete_table(&self.table)?;
39 |         Ok(ResultSet::DropTable { name: self.table })
40 |     }
41 | }
42 | 


--------------------------------------------------------------------------------
/src/sql/execution/source.rs:
--------------------------------------------------------------------------------
  1 | use super::super::engine::Transaction;
  2 | use super::super::types::{Column, Expression, Row, Value};
  3 | use super::{Executor, ResultSet};
  4 | use crate::error::Result;
  5 | 
  6 | use std::collections::HashSet;
  7 | 
  8 | /// A table scan executor
  9 | pub struct Scan {
 10 |     table: String,
 11 |     filter: Option<Expression>,
 12 | }
 13 | 
 14 | impl Scan {
 15 |     pub fn new(table: String, filter: Option<Expression>) -> Box<Self> {
 16 |         Box::new(Self { table, filter })
 17 |     }
 18 | }
 19 | 
 20 | impl<T: Transaction> Executor<T> for Scan {
 21 |     fn execute(self: Box<Self>, txn: &mut T) -> Result<ResultSet> {
 22 |         let table = txn.must_read_table(&self.table)?;
 23 |         Ok(ResultSet::Query {
 24 |             columns: table.columns.iter().map(|c| Column { name: Some(c.name.clone()) }).collect(),
 25 |             rows: Box::new(txn.scan(&table.name, self.filter)?),
 26 |         })
 27 |     }
 28 | }
 29 | 
 30 | /// A primary key lookup executor
 31 | pub struct KeyLookup {
 32 |     table: String,
 33 |     keys: Vec<Value>,
 34 | }
 35 | 
 36 | impl KeyLookup {
 37 |     pub fn new(table: String, keys: Vec<Value>) -> Box<Self> {
 38 |         Box::new(Self { table, keys })
 39 |     }
 40 | }
 41 | 
 42 | impl<T: Transaction> Executor<T> for KeyLookup {
 43 |     fn execute(self: Box<Self>, txn: &mut T) -> Result<ResultSet> {
 44 |         let table = txn.must_read_table(&self.table)?;
 45 | 
 46 |         // FIXME Is there a way to pass the txn into an iterator closure instead?
 47 |         let rows = self
 48 |             .keys
 49 |             .into_iter()
 50 |             .filter_map(|key| txn.read(&table.name, &key).transpose())
 51 |             .collect::<Result<Vec<Row>>>()?;
 52 | 
 53 |         Ok(ResultSet::Query {
 54 |             columns: table.columns.iter().map(|c| Column { name: Some(c.name.clone()) }).collect(),
 55 |             rows: Box::new(rows.into_iter().map(Ok)),
 56 |         })
 57 |     }
 58 | }
 59 | 
 60 | /// An index value lookup executor
 61 | pub struct IndexLookup {
 62 |     table: String,
 63 |     column: String,
 64 |     values: Vec<Value>,
 65 | }
 66 | 
 67 | impl IndexLookup {
 68 |     pub fn new(table: String, column: String, values: Vec<Value>) -> Box<Self> {
 69 |         Box::new(Self { table, column, values })
 70 |     }
 71 | }
 72 | 
 73 | impl<T: Transaction> Executor<T> for IndexLookup {
 74 |     fn execute(self: Box<Self>, txn: &mut T) -> Result<ResultSet> {
 75 |         let table = txn.must_read_table(&self.table)?;
 76 | 
 77 |         let mut pks: HashSet<Value> = HashSet::new();
 78 |         for value in self.values {
 79 |             pks.extend(txn.read_index(&self.table, &self.column, &value)?);
 80 |         }
 81 | 
 82 |         // FIXME Is there a way to pass the txn into an iterator closure instead?
 83 |         let rows = pks
 84 |             .into_iter()
 85 |             .filter_map(|pk| txn.read(&table.name, &pk).transpose())
 86 |             .collect::<Result<Vec<Row>>>()?;
 87 | 
 88 |         Ok(ResultSet::Query {
 89 |             columns: table.columns.iter().map(|c| Column { name: Some(c.name.clone()) }).collect(),
 90 |             rows: Box::new(rows.into_iter().map(Ok)),
 91 |         })
 92 |     }
 93 | }
 94 | 
 95 | /// An executor that produces a single empty row
 96 | pub struct Nothing;
 97 | 
 98 | impl Nothing {
 99 |     pub fn new() -> Box<Self> {
100 |         Box::new(Self)
101 |     }
102 | }
103 | 
104 | impl<T: Transaction> Executor<T> for Nothing {
105 |     fn execute(self: Box<Self>, _: &mut T) -> Result<ResultSet> {
106 |         Ok(ResultSet::Query {
107 |             columns: Vec::new(),
108 |             rows: Box::new(std::iter::once(Ok(Row::new()))),
109 |         })
110 |     }
111 | }
112 | 


--------------------------------------------------------------------------------
/src/sql/mod.rs:
--------------------------------------------------------------------------------
1 | pub mod engine;
2 | pub mod execution;
3 | pub mod parser;
4 | pub mod plan;
5 | pub mod schema;
6 | pub mod types;
7 | 


--------------------------------------------------------------------------------
/src/sql/parser/ast.rs:
--------------------------------------------------------------------------------
  1 | use super::super::types::DataType;
  2 | use crate::error::Result;
  3 | 
  4 | use std::collections::BTreeMap;
  5 | use std::mem::replace;
  6 | 
  7 | /// Statements
  8 | #[derive(Clone, Debug, PartialEq)]
  9 | #[allow(clippy::large_enum_variant)]
 10 | pub enum Statement {
 11 |     Begin {
 12 |         read_only: bool,
 13 |         as_of: Option<u64>,
 14 |     },
 15 |     Commit,
 16 |     Rollback,
 17 |     Explain(Box<Statement>),
 18 | 
 19 |     CreateTable {
 20 |         name: String,
 21 |         columns: Vec<Column>,
 22 |     },
 23 |     DropTable(String),
 24 | 
 25 |     Delete {
 26 |         table: String,
 27 |         r#where: Option<Expression>,
 28 |     },
 29 |     Insert {
 30 |         table: String,
 31 |         columns: Option<Vec<String>>,
 32 |         values: Vec<Vec<Expression>>,
 33 |     },
 34 |     Update {
 35 |         table: String,
 36 |         set: BTreeMap<String, Expression>,
 37 |         r#where: Option<Expression>,
 38 |     },
 39 | 
 40 |     Select {
 41 |         select: Vec<(Expression, Option<String>)>,
 42 |         from: Vec<FromItem>,
 43 |         r#where: Option<Expression>,
 44 |         group_by: Vec<Expression>,
 45 |         having: Option<Expression>,
 46 |         order: Vec<(Expression, Order)>,
 47 |         offset: Option<Expression>,
 48 |         limit: Option<Expression>,
 49 |     },
 50 | }
 51 | 
 52 | /// A FROM item
 53 | #[derive(Clone, Debug, PartialEq)]
 54 | pub enum FromItem {
 55 |     Table {
 56 |         name: String,
 57 |         alias: Option<String>,
 58 |     },
 59 |     Join {
 60 |         left: Box<FromItem>,
 61 |         right: Box<FromItem>,
 62 |         r#type: JoinType,
 63 |         predicate: Option<Expression>,
 64 |     },
 65 | }
 66 | 
 67 | /// A JOIN type
 68 | #[derive(Clone, Debug, PartialEq)]
 69 | pub enum JoinType {
 70 |     Cross,
 71 |     Inner,
 72 |     Left,
 73 |     Right,
 74 | }
 75 | 
 76 | /// A column
 77 | #[derive(Clone, Debug, PartialEq)]
 78 | pub struct Column {
 79 |     pub name: String,
 80 |     pub datatype: DataType,
 81 |     pub primary_key: bool,
 82 |     pub nullable: Option<bool>,
 83 |     pub default: Option<Expression>,
 84 |     pub unique: bool,
 85 |     pub index: bool,
 86 |     pub references: Option<String>,
 87 | }
 88 | 
 89 | /// Sort orders
 90 | #[derive(Clone, Debug, PartialEq)]
 91 | pub enum Order {
 92 |     Ascending,
 93 |     Descending,
 94 | }
 95 | 
 96 | /// Expressions
 97 | #[derive(Clone, Debug, PartialEq)]
 98 | pub enum Expression {
 99 |     Field(Option<String>, String),
100 |     Column(usize), // only used during plan building to break off expression subtrees
101 |     Literal(Literal),
102 |     Function(String, Vec<Expression>),
103 |     Operation(Operation),
104 | }
105 | 
106 | impl From<Literal> for Expression {
107 |     fn from(literal: Literal) -> Self {
108 |         Self::Literal(literal)
109 |     }
110 | }
111 | 
112 | impl From<Operation> for Expression {
113 |     fn from(op: Operation) -> Self {
114 |         Self::Operation(op)
115 |     }
116 | }
117 | 
118 | /// Literals
119 | #[derive(Clone, Debug, PartialEq)]
120 | pub enum Literal {
121 |     Null,
122 |     Boolean(bool),
123 |     Integer(i64),
124 |     Float(f64),
125 |     String(String),
126 | }
127 | 
128 | /// Operations (done by operators)
129 | #[derive(Clone, Debug, PartialEq)]
130 | pub enum Operation {
131 |     // Logical operators
132 |     And(Box<Expression>, Box<Expression>),
133 |     Not(Box<Expression>),
134 |     Or(Box<Expression>, Box<Expression>),
135 | 
136 |     // Comparison operators
137 |     Equal(Box<Expression>, Box<Expression>),
138 |     GreaterThan(Box<Expression>, Box<Expression>),
139 |     GreaterThanOrEqual(Box<Expression>, Box<Expression>),
140 |     IsNull(Box<Expression>),
141 |     LessThan(Box<Expression>, Box<Expression>),
142 |     LessThanOrEqual(Box<Expression>, Box<Expression>),
143 |     NotEqual(Box<Expression>, Box<Expression>),
144 | 
145 |     // Mathematical operators
146 |     Add(Box<Expression>, Box<Expression>),
147 |     Assert(Box<Expression>),
148 |     Divide(Box<Expression>, Box<Expression>),
149 |     Exponentiate(Box<Expression>, Box<Expression>),
150 |     Factorial(Box<Expression>),
151 |     Modulo(Box<Expression>, Box<Expression>),
152 |     Multiply(Box<Expression>, Box<Expression>),
153 |     Negate(Box<Expression>),
154 |     Subtract(Box<Expression>, Box<Expression>),
155 | 
156 |     // String operators
157 |     Like(Box<Expression>, Box<Expression>),
158 | }
159 | 
160 | impl Expression {
161 |     /// Walks the expression tree while calling a closure. Returns true as soon as the closure
162 |     /// returns true. This is the inverse of walk().
163 |     pub fn contains<F: Fn(&Expression) -> bool>(&self, visitor: &F) -> bool {
164 |         !self.walk(&|e| !visitor(e))
165 |     }
166 | 
167 |     /// Replaces the expression with result of the closure. Helper function for transform().
168 |     fn replace_with<F: FnMut(Self) -> Result<Self>>(&mut self, mut f: F) -> Result<()> {
169 |         // Temporarily replace expression with a null value, in case closure panics. May consider
170 |         // replace_with crate if this hampers performance.
171 |         let expr = replace(self, Expression::Literal(Literal::Null));
172 |         *self = f(expr)?;
173 |         Ok(())
174 |     }
175 | 
176 |     /// Transforms the expression tree by applying a closure before and after descending.
177 |     pub fn transform<B, A>(mut self, before: &mut B, after: &mut A) -> Result<Self>
178 |     where
179 |         B: FnMut(Self) -> Result<Self>,
180 |         A: FnMut(Self) -> Result<Self>,
181 |     {
182 |         use Operation::*;
183 |         self = before(self)?;
184 |         match &mut self {
185 |             Self::Operation(Add(lhs, rhs))
186 |             | Self::Operation(And(lhs, rhs))
187 |             | Self::Operation(Divide(lhs, rhs))
188 |             | Self::Operation(Equal(lhs, rhs))
189 |             | Self::Operation(Exponentiate(lhs, rhs))
190 |             | Self::Operation(GreaterThan(lhs, rhs))
191 |             | Self::Operation(GreaterThanOrEqual(lhs, rhs))
192 |             | Self::Operation(LessThan(lhs, rhs))
193 |             | Self::Operation(LessThanOrEqual(lhs, rhs))
194 |             | Self::Operation(Like(lhs, rhs))
195 |             | Self::Operation(Modulo(lhs, rhs))
196 |             | Self::Operation(Multiply(lhs, rhs))
197 |             | Self::Operation(NotEqual(lhs, rhs))
198 |             | Self::Operation(Or(lhs, rhs))
199 |             | Self::Operation(Subtract(lhs, rhs)) => {
200 |                 Self::replace_with(lhs, |e| e.transform(before, after))?;
201 |                 Self::replace_with(rhs, |e| e.transform(before, after))?;
202 |             }
203 | 
204 |             Self::Operation(Assert(expr))
205 |             | Self::Operation(Factorial(expr))
206 |             | Self::Operation(IsNull(expr))
207 |             | Self::Operation(Negate(expr))
208 |             | Self::Operation(Not(expr)) => {
209 |                 Self::replace_with(expr, |e| e.transform(before, after))?
210 |             }
211 | 
212 |             Self::Function(_, exprs) => {
213 |                 for expr in exprs {
214 |                     Self::replace_with(expr, |e| e.transform(before, after))?;
215 |                 }
216 |             }
217 | 
218 |             Self::Literal(_) | Self::Field(_, _) | Self::Column(_) => {}
219 |         };
220 |         after(self)
221 |     }
222 | 
223 |     /// Transforms an expression using a mutable reference.
224 |     pub fn transform_mut<B, A>(&mut self, before: &mut B, after: &mut A) -> Result<()>
225 |     where
226 |         B: FnMut(Self) -> Result<Self>,
227 |         A: FnMut(Self) -> Result<Self>,
228 |     {
229 |         self.replace_with(|e| e.transform(before, after))
230 |     }
231 | 
232 |     /// Walks the expression tree, calling a closure for every node. Halts if closure returns false.
233 |     pub fn walk<F: Fn(&Expression) -> bool>(&self, visitor: &F) -> bool {
234 |         use Operation::*;
235 |         visitor(self)
236 |             && match self {
237 |                 Self::Operation(Add(lhs, rhs))
238 |                 | Self::Operation(And(lhs, rhs))
239 |                 | Self::Operation(Divide(lhs, rhs))
240 |                 | Self::Operation(Equal(lhs, rhs))
241 |                 | Self::Operation(Exponentiate(lhs, rhs))
242 |                 | Self::Operation(GreaterThan(lhs, rhs))
243 |                 | Self::Operation(GreaterThanOrEqual(lhs, rhs))
244 |                 | Self::Operation(LessThan(lhs, rhs))
245 |                 | Self::Operation(LessThanOrEqual(lhs, rhs))
246 |                 | Self::Operation(Like(lhs, rhs))
247 |                 | Self::Operation(Modulo(lhs, rhs))
248 |                 | Self::Operation(Multiply(lhs, rhs))
249 |                 | Self::Operation(NotEqual(lhs, rhs))
250 |                 | Self::Operation(Or(lhs, rhs))
251 |                 | Self::Operation(Subtract(lhs, rhs)) => lhs.walk(visitor) && rhs.walk(visitor),
252 | 
253 |                 Self::Operation(Assert(expr))
254 |                 | Self::Operation(Factorial(expr))
255 |                 | Self::Operation(IsNull(expr))
256 |                 | Self::Operation(Negate(expr))
257 |                 | Self::Operation(Not(expr)) => expr.walk(visitor),
258 | 
259 |                 Self::Function(_, exprs) => {
260 |                     for expr in exprs {
261 |                         if !expr.walk(visitor) {
262 |                             return false;
263 |                         }
264 |                     }
265 |                     true
266 |                 }
267 | 
268 |                 Self::Literal(_) | Self::Field(_, _) | Self::Column(_) => true,
269 |             }
270 |     }
271 | }
272 | 


--------------------------------------------------------------------------------
/src/sql/plan/optimizer.rs:
--------------------------------------------------------------------------------
  1 | use super::super::schema::Catalog;
  2 | use super::super::types::{Expression, Value};
  3 | use super::Node;
  4 | use crate::error::Result;
  5 | 
  6 | use std::mem::replace;
  7 | 
  8 | /// A plan optimizer
  9 | pub trait Optimizer {
 10 |     fn optimize(&self, node: Node) -> Result<Node>;
 11 | }
 12 | 
 13 | /// A constant folding optimizer, which replaces constant expressions with their evaluated value, to
 14 | /// prevent it from being re-evaluated over and over again during plan execution.
 15 | pub struct ConstantFolder;
 16 | 
 17 | impl Optimizer for ConstantFolder {
 18 |     fn optimize(&self, node: Node) -> Result<Node> {
 19 |         node.transform(&Ok, &|n| {
 20 |             n.transform_expressions(
 21 |                 &|e| {
 22 |                     if !e.contains(&|expr| matches!(expr, Expression::Field(_, _))) {
 23 |                         Ok(Expression::Constant(e.evaluate(None)?))
 24 |                     } else {
 25 |                         Ok(e)
 26 |                     }
 27 |                 },
 28 |                 &Ok,
 29 |             )
 30 |         })
 31 |     }
 32 | }
 33 | 
 34 | /// A filter pushdown optimizer, which moves filter predicates into or closer to the source node.
 35 | pub struct FilterPushdown;
 36 | 
 37 | impl Optimizer for FilterPushdown {
 38 |     fn optimize(&self, node: Node) -> Result<Node> {
 39 |         node.transform(
 40 |             &|n| match n {
 41 |                 Node::Filter { mut source, predicate } => {
 42 |                     // We don't replace the filter node here, since doing so would cause transform()
 43 |                     // to skip the source as it won't reapply the transform to the "same" node.
 44 |                     // We leave a noop filter node instead, which will be cleaned up by NoopCleaner.
 45 |                     if let Some(remainder) = self.pushdown(predicate, &mut source) {
 46 |                         Ok(Node::Filter { source, predicate: remainder })
 47 |                     } else {
 48 |                         Ok(Node::Filter {
 49 |                             source,
 50 |                             predicate: Expression::Constant(Value::Boolean(true)),
 51 |                         })
 52 |                     }
 53 |                 }
 54 |                 Node::NestedLoopJoin {
 55 |                     mut left,
 56 |                     left_size,
 57 |                     mut right,
 58 |                     predicate: Some(predicate),
 59 |                     outer,
 60 |                 } => {
 61 |                     let predicate = self.pushdown_join(predicate, &mut left, &mut right, left_size);
 62 |                     Ok(Node::NestedLoopJoin { left, left_size, right, predicate, outer })
 63 |                 }
 64 |                 n => Ok(n),
 65 |             },
 66 |             &Ok,
 67 |         )
 68 |     }
 69 | }
 70 | 
 71 | impl FilterPushdown {
 72 |     /// Attempts to push an expression down into a target node, returns any remaining expression.
 73 |     fn pushdown(&self, mut expression: Expression, target: &mut Node) -> Option<Expression> {
 74 |         match target {
 75 |             Node::Scan { ref mut filter, .. } => {
 76 |                 if let Some(filter) = filter.take() {
 77 |                     expression = Expression::And(Box::new(expression), Box::new(filter))
 78 |                 }
 79 |                 filter.replace(expression)
 80 |             }
 81 |             Node::NestedLoopJoin { ref mut predicate, .. } => {
 82 |                 if let Some(predicate) = predicate.take() {
 83 |                     expression = Expression::And(Box::new(expression), Box::new(predicate));
 84 |                 }
 85 |                 predicate.replace(expression)
 86 |             }
 87 |             Node::Filter { ref mut predicate, .. } => {
 88 |                 let p = replace(predicate, Expression::Constant(Value::Null));
 89 |                 *predicate = Expression::And(Box::new(p), Box::new(expression));
 90 |                 None
 91 |             }
 92 |             _ => Some(expression),
 93 |         }
 94 |     }
 95 | 
 96 |     /// Attempts to partition a join predicate and push parts of it down into either source,
 97 |     /// returning any remaining expression.
 98 |     fn pushdown_join(
 99 |         &self,
100 |         predicate: Expression,
101 |         left: &mut Node,
102 |         right: &mut Node,
103 |         boundary: usize,
104 |     ) -> Option<Expression> {
105 |         // Convert the predicate into conjunctive normal form, and partition into expressions
106 |         // only referencing the left or right sources, leaving cross-source expressions.
107 |         let cnf = predicate.into_cnf_vec();
108 |         let (mut push_left, cnf): (Vec<Expression>, Vec<Expression>) =
109 |             cnf.into_iter().partition(|e| {
110 |                 // Partition only if no expressions reference the right-hand source.
111 |                 !e.contains(&|e| matches!(e, Expression::Field(i, _) if i >= &boundary))
112 |             });
113 |         let (mut push_right, mut cnf): (Vec<Expression>, Vec<Expression>) =
114 |             cnf.into_iter().partition(|e| {
115 |                 // Partition only if no expressions reference the left-hand source.
116 |                 !e.contains(&|e| matches!(e, Expression::Field(i, _) if i < &boundary))
117 |             });
118 | 
119 |         // Look for equijoins that have constant lookups on either side, and transfer the constants
120 |         // to the other side of the join as well. This allows index lookup optimization in both
121 |         // sides. We already know that the remaining cnf expressions span both sources.
122 |         for e in &cnf {
123 |             if let Expression::Equal(ref lhs, ref rhs) = e {
124 |                 if let (Expression::Field(l, ln), Expression::Field(r, rn)) = (&**lhs, &**rhs) {
125 |                     let (l, ln, r, rn) = if l > r { (r, rn, l, ln) } else { (l, ln, r, rn) };
126 |                     if let Some(lvals) = push_left.iter().find_map(|e| e.as_lookup(*l)) {
127 |                         push_right.push(Expression::from_lookup(*r, rn.clone(), lvals));
128 |                     } else if let Some(rvals) = push_right.iter().find_map(|e| e.as_lookup(*r)) {
129 |                         push_left.push(Expression::from_lookup(*l, ln.clone(), rvals));
130 |                     }
131 |                 }
132 |             }
133 |         }
134 | 
135 |         // Push predicates down into the sources.
136 |         if let Some(push_left) = Expression::from_cnf_vec(push_left) {
137 |             if let Some(remainder) = self.pushdown(push_left, left) {
138 |                 cnf.push(remainder)
139 |             }
140 |         }
141 |         if let Some(mut push_right) = Expression::from_cnf_vec(push_right) {
142 |             // All field references to the right must be shifted left.
143 |             push_right = push_right
144 |                 .transform(
145 |                     &|e| match e {
146 |                         Expression::Field(i, label) => Ok(Expression::Field(i - boundary, label)),
147 |                         e => Ok(e),
148 |                     },
149 |                     &Ok,
150 |                 )
151 |                 .unwrap();
152 |             if let Some(remainder) = self.pushdown(push_right, right) {
153 |                 cnf.push(remainder)
154 |             }
155 |         }
156 |         Expression::from_cnf_vec(cnf)
157 |     }
158 | }
159 | 
160 | /// An index lookup optimizer, which converts table scans to index lookups.
161 | pub struct IndexLookup<'a, C: Catalog> {
162 |     catalog: &'a mut C,
163 | }
164 | 
165 | impl<'a, C: Catalog> IndexLookup<'a, C> {
166 |     pub fn new(catalog: &'a mut C) -> Self {
167 |         Self { catalog }
168 |     }
169 | 
170 |     // Wraps a node in a filter for the given CNF vector, if any, otherwise returns the bare node.
171 |     fn wrap_cnf(&self, node: Node, cnf: Vec<Expression>) -> Node {
172 |         if let Some(predicate) = Expression::from_cnf_vec(cnf) {
173 |             Node::Filter { source: Box::new(node), predicate }
174 |         } else {
175 |             node
176 |         }
177 |     }
178 | }
179 | 
180 | impl<'a, C: Catalog> Optimizer for IndexLookup<'a, C> {
181 |     fn optimize(&self, node: Node) -> Result<Node> {
182 |         node.transform(&Ok, &|n| match n {
183 |             Node::Scan { table, alias, filter: Some(filter) } => {
184 |                 let columns = self.catalog.must_read_table(&table)?.columns;
185 |                 let pk = columns.iter().position(|c| c.primary_key).unwrap();
186 | 
187 |                 // Convert the filter into conjunctive normal form, and try to convert each
188 |                 // sub-expression into a lookup. If a lookup is found, return a lookup node and then
189 |                 // apply the remaining conjunctions as a filter node, if any.
190 |                 let mut cnf = filter.clone().into_cnf_vec();
191 |                 for i in 0..cnf.len() {
192 |                     if let Some(keys) = cnf[i].as_lookup(pk) {
193 |                         cnf.remove(i);
194 |                         return Ok(self.wrap_cnf(Node::KeyLookup { table, alias, keys }, cnf));
195 |                     }
196 |                     for (ci, column) in columns.iter().enumerate().filter(|(_, c)| c.index) {
197 |                         if let Some(values) = cnf[i].as_lookup(ci) {
198 |                             cnf.remove(i);
199 |                             return Ok(self.wrap_cnf(
200 |                                 Node::IndexLookup {
201 |                                     table,
202 |                                     alias,
203 |                                     column: column.name.clone(),
204 |                                     values,
205 |                                 },
206 |                                 cnf,
207 |                             ));
208 |                         }
209 |                     }
210 |                 }
211 |                 Ok(Node::Scan { table, alias, filter: Some(filter) })
212 |             }
213 |             n => Ok(n),
214 |         })
215 |     }
216 | }
217 | 
218 | /// Cleans up noops, e.g. filters with constant true/false predicates.
219 | /// FIXME This should perhaps replace nodes that can never return anything with a Nothing node,
220 | /// but that requires propagating the column names.
221 | pub struct NoopCleaner;
222 | 
223 | impl Optimizer for NoopCleaner {
224 |     fn optimize(&self, node: Node) -> Result<Node> {
225 |         use Expression::*;
226 |         node.transform(
227 |             // While descending the node tree, clean up boolean expressions.
228 |             &|n| {
229 |                 n.transform_expressions(&Ok, &|e| match &e {
230 |                     And(lhs, rhs) => match (&**lhs, &**rhs) {
231 |                         (Constant(Value::Boolean(false)), _)
232 |                         | (Constant(Value::Null), _)
233 |                         | (_, Constant(Value::Boolean(false)))
234 |                         | (_, Constant(Value::Null)) => Ok(Constant(Value::Boolean(false))),
235 |                         (Constant(Value::Boolean(true)), e)
236 |                         | (e, Constant(Value::Boolean(true))) => Ok(e.clone()),
237 |                         _ => Ok(e),
238 |                     },
239 |                     Or(lhs, rhs) => match (&**lhs, &**rhs) {
240 |                         (Constant(Value::Boolean(false)), e)
241 |                         | (Constant(Value::Null), e)
242 |                         | (e, Constant(Value::Boolean(false)))
243 |                         | (e, Constant(Value::Null)) => Ok(e.clone()),
244 |                         (Constant(Value::Boolean(true)), _)
245 |                         | (_, Constant(Value::Boolean(true))) => Ok(Constant(Value::Boolean(true))),
246 |                         _ => Ok(e),
247 |                     },
248 |                     // No need to handle Not, constant folder should have evaluated it already.
249 |                     _ => Ok(e),
250 |                 })
251 |             },
252 |             // While ascending the node tree, remove any unnecessary filters or nodes.
253 |             // FIXME This should replace scan and join predicates with None as well.
254 |             &|n| match n {
255 |                 Node::Filter { source, predicate } => match predicate {
256 |                     Expression::Constant(Value::Boolean(true)) => Ok(*source),
257 |                     predicate => Ok(Node::Filter { source, predicate }),
258 |                 },
259 |                 n => Ok(n),
260 |             },
261 |         )
262 |     }
263 | }
264 | 
265 | // Optimizes join types, currently by swapping nested-loop joins with hash joins where appropriate.
266 | pub struct JoinType;
267 | 
268 | impl Optimizer for JoinType {
269 |     fn optimize(&self, node: Node) -> Result<Node> {
270 |         node.transform(
271 |             &|n| match n {
272 |                 // Replace nested-loop equijoins with hash joins.
273 |                 Node::NestedLoopJoin {
274 |                     left,
275 |                     left_size,
276 |                     right,
277 |                     predicate: Some(Expression::Equal(a, b)),
278 |                     outer,
279 |                 } => match (*a, *b) {
280 |                     (Expression::Field(a, a_label), Expression::Field(b, b_label)) => {
281 |                         let (left_field, right_field) = if a < left_size {
282 |                             ((a, a_label), (b - left_size, b_label))
283 |                         } else {
284 |                             ((b, b_label), (a - left_size, a_label))
285 |                         };
286 |                         Ok(Node::HashJoin { left, left_field, right, right_field, outer })
287 |                     }
288 |                     (a, b) => Ok(Node::NestedLoopJoin {
289 |                         left,
290 |                         left_size,
291 |                         right,
292 |                         predicate: Some(Expression::Equal(a.into(), b.into())),
293 |                         outer,
294 |                     }),
295 |                 },
296 |                 n => Ok(n),
297 |             },
298 |             &Ok,
299 |         )
300 |     }
301 | }
302 | 


--------------------------------------------------------------------------------
/src/sql/schema.rs:
--------------------------------------------------------------------------------
  1 | use super::engine::Transaction;
  2 | use super::parser::format_ident;
  3 | use super::types::{DataType, Value};
  4 | use crate::error::{Error, Result};
  5 | 
  6 | use serde_derive::{Deserialize, Serialize};
  7 | use std::fmt::{self, Display};
  8 | 
  9 | /// The catalog stores schema information
 10 | pub trait Catalog {
 11 |     /// Creates a new table
 12 |     fn create_table(&mut self, table: Table) -> Result<()>;
 13 |     /// Deletes an existing table, or errors if it does not exist
 14 |     fn delete_table(&mut self, table: &str) -> Result<()>;
 15 |     /// Reads a table, if it exists
 16 |     fn read_table(&self, table: &str) -> Result<Option<Table>>;
 17 |     /// Iterates over all tables
 18 |     fn scan_tables(&self) -> Result<Tables>;
 19 | 
 20 |     /// Reads a table, and errors if it does not exist
 21 |     fn must_read_table(&self, table: &str) -> Result<Table> {
 22 |         self.read_table(table)?
 23 |             .ok_or_else(|| Error::Value(format!("Table {} does not exist", table)))
 24 |     }
 25 | 
 26 |     /// Returns all references to a table, as table,column pairs.
 27 |     fn table_references(&self, table: &str, with_self: bool) -> Result<Vec<(String, Vec<String>)>> {
 28 |         Ok(self
 29 |             .scan_tables()?
 30 |             .filter(|t| with_self || t.name != table)
 31 |             .map(|t| {
 32 |                 (
 33 |                     t.name,
 34 |                     t.columns
 35 |                         .iter()
 36 |                         .filter(|c| c.references.as_deref() == Some(table))
 37 |                         .map(|c| c.name.clone())
 38 |                         .collect::<Vec<_>>(),
 39 |                 )
 40 |             })
 41 |             .filter(|(_, cs)| !cs.is_empty())
 42 |             .collect())
 43 |     }
 44 | }
 45 | 
 46 | /// A table scan iterator
 47 | pub type Tables = Box<dyn DoubleEndedIterator<Item = Table> + Send>;
 48 | 
 49 | /// A table schema
 50 | #[derive(Clone, Debug, PartialEq, Deserialize, Serialize)]
 51 | pub struct Table {
 52 |     pub name: String,
 53 |     pub columns: Vec<Column>,
 54 | }
 55 | 
 56 | impl Table {
 57 |     /// Creates a new table schema
 58 |     pub fn new(name: String, columns: Vec<Column>) -> Result<Self> {
 59 |         let table = Self { name, columns };
 60 |         Ok(table)
 61 |     }
 62 | 
 63 |     /// Fetches a column by name
 64 |     pub fn get_column(&self, name: &str) -> Result<&Column> {
 65 |         self.columns.iter().find(|c| c.name == name).ok_or_else(|| {
 66 |             Error::Value(format!("Column {} not found in table {}", name, self.name))
 67 |         })
 68 |     }
 69 | 
 70 |     /// Fetches a column index by name
 71 |     pub fn get_column_index(&self, name: &str) -> Result<usize> {
 72 |         self.columns.iter().position(|c| c.name == name).ok_or_else(|| {
 73 |             Error::Value(format!("Column {} not found in table {}", name, self.name))
 74 |         })
 75 |     }
 76 | 
 77 |     /// Returns the primary key column of the table
 78 |     pub fn get_primary_key(&self) -> Result<&Column> {
 79 |         self.columns
 80 |             .iter()
 81 |             .find(|c| c.primary_key)
 82 |             .ok_or_else(|| Error::Value(format!("Primary key not found in table {}", self.name)))
 83 |     }
 84 | 
 85 |     /// Returns the primary key value of a row
 86 |     pub fn get_row_key(&self, row: &[Value]) -> Result<Value> {
 87 |         row.get(
 88 |             self.columns
 89 |                 .iter()
 90 |                 .position(|c| c.primary_key)
 91 |                 .ok_or_else(|| Error::Value("Primary key not found".into()))?,
 92 |         )
 93 |         .cloned()
 94 |         .ok_or_else(|| Error::Value("Primary key value not found for row".into()))
 95 |     }
 96 | 
 97 |     /// Validates the table schema
 98 |     pub fn validate(&self, txn: &mut dyn Transaction) -> Result<()> {
 99 |         if self.columns.is_empty() {
100 |             return Err(Error::Value(format!("Table {} has no columns", self.name)));
101 |         }
102 |         match self.columns.iter().filter(|c| c.primary_key).count() {
103 |             1 => {}
104 |             0 => return Err(Error::Value(format!("No primary key in table {}", self.name))),
105 |             _ => return Err(Error::Value(format!("Multiple primary keys in table {}", self.name))),
106 |         };
107 |         for column in &self.columns {
108 |             column.validate(self, txn)?;
109 |         }
110 |         Ok(())
111 |     }
112 | 
113 |     /// Validates a row
114 |     pub fn validate_row(&self, row: &[Value], txn: &mut dyn Transaction) -> Result<()> {
115 |         if row.len() != self.columns.len() {
116 |             return Err(Error::Value(format!("Invalid row size for table {}", self.name)));
117 |         }
118 |         let pk = self.get_row_key(row)?;
119 |         for (column, value) in self.columns.iter().zip(row.iter()) {
120 |             column.validate_value(self, &pk, value, txn)?;
121 |         }
122 |         Ok(())
123 |     }
124 | }
125 | 
126 | impl Display for Table {
127 |     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
128 |         write!(
129 |             f,
130 |             "CREATE TABLE {} (\n{}\n)",
131 |             format_ident(&self.name),
132 |             self.columns.iter().map(|c| format!("  {}", c)).collect::<Vec<String>>().join(",\n")
133 |         )
134 |     }
135 | }
136 | 
137 | /// A table column schema
138 | #[derive(Clone, Debug, PartialEq, Deserialize, Serialize)]
139 | pub struct Column {
140 |     /// Column name
141 |     pub name: String,
142 |     /// Column datatype
143 |     pub datatype: DataType,
144 |     /// Whether the column is a primary key
145 |     pub primary_key: bool,
146 |     /// Whether the column allows null values
147 |     pub nullable: bool,
148 |     /// The default value of the column
149 |     pub default: Option<Value>,
150 |     /// Whether the column should only take unique values
151 |     pub unique: bool,
152 |     /// The table which is referenced by this foreign key
153 |     pub references: Option<String>,
154 |     /// Whether the column should be indexed
155 |     pub index: bool,
156 | }
157 | 
158 | impl Column {
159 |     /// Validates the column schema
160 |     pub fn validate(&self, table: &Table, txn: &mut dyn Transaction) -> Result<()> {
161 |         // Validate primary key
162 |         if self.primary_key && self.nullable {
163 |             return Err(Error::Value(format!("Primary key {} cannot be nullable", self.name)));
164 |         }
165 |         if self.primary_key && !self.unique {
166 |             return Err(Error::Value(format!("Primary key {} must be unique", self.name)));
167 |         }
168 | 
169 |         // Validate default value
170 |         if let Some(default) = &self.default {
171 |             if let Some(datatype) = default.datatype() {
172 |                 if datatype != self.datatype {
173 |                     return Err(Error::Value(format!(
174 |                         "Default value for column {} has datatype {}, must be {}",
175 |                         self.name, datatype, self.datatype
176 |                     )));
177 |                 }
178 |             } else if !self.nullable {
179 |                 return Err(Error::Value(format!(
180 |                     "Can't use NULL as default value for non-nullable column {}",
181 |                     self.name
182 |                 )));
183 |             }
184 |         } else if self.nullable {
185 |             return Err(Error::Value(format!(
186 |                 "Nullable column {} must have a default value",
187 |                 self.name
188 |             )));
189 |         }
190 | 
191 |         // Validate references
192 |         if let Some(reference) = &self.references {
193 |             let target = if reference == &table.name {
194 |                 table.clone()
195 |             } else if let Some(table) = txn.read_table(reference)? {
196 |                 table
197 |             } else {
198 |                 return Err(Error::Value(format!(
199 |                     "Table {} referenced by column {} does not exist",
200 |                     reference, self.name
201 |                 )));
202 |             };
203 |             if self.datatype != target.get_primary_key()?.datatype {
204 |                 return Err(Error::Value(format!(
205 |                     "Can't reference {} primary key of table {} from {} column {}",
206 |                     target.get_primary_key()?.datatype,
207 |                     target.name,
208 |                     self.datatype,
209 |                     self.name
210 |                 )));
211 |             }
212 |         }
213 | 
214 |         Ok(())
215 |     }
216 | 
217 |     /// Validates a column value
218 |     pub fn validate_value(
219 |         &self,
220 |         table: &Table,
221 |         pk: &Value,
222 |         value: &Value,
223 |         txn: &mut dyn Transaction,
224 |     ) -> Result<()> {
225 |         // Validate datatype
226 |         match value.datatype() {
227 |             None if self.nullable => Ok(()),
228 |             None => Err(Error::Value(format!("NULL value not allowed for column {}", self.name))),
229 |             Some(ref datatype) if datatype != &self.datatype => Err(Error::Value(format!(
230 |                 "Invalid datatype {} for {} column {}",
231 |                 datatype, self.datatype, self.name
232 |             ))),
233 |             _ => Ok(()),
234 |         }?;
235 | 
236 |         // Validate value
237 |         match value {
238 |             Value::String(s) if s.len() > 1024 => {
239 |                 Err(Error::Value("Strings cannot be more than 1024 bytes".into()))
240 |             }
241 |             _ => Ok(()),
242 |         }?;
243 | 
244 |         // Validate outgoing references
245 |         if let Some(target) = &self.references {
246 |             match value {
247 |                 Value::Null => Ok(()),
248 |                 Value::Float(f) if f.is_nan() => Ok(()),
249 |                 v if target == &table.name && v == pk => Ok(()),
250 |                 v if txn.read(target, v)?.is_none() => Err(Error::Value(format!(
251 |                     "Referenced primary key {} in table {} does not exist",
252 |                     v, target,
253 |                 ))),
254 |                 _ => Ok(()),
255 |             }?;
256 |         }
257 | 
258 |         // Validate uniqueness constraints
259 |         if self.unique && !self.primary_key && value != &Value::Null {
260 |             let index = table.get_column_index(&self.name)?;
261 |             let mut scan = txn.scan(&table.name, None)?;
262 |             while let Some(row) = scan.next().transpose()? {
263 |                 if row.get(index).unwrap_or(&Value::Null) == value
264 |                     && &table.get_row_key(&row)? != pk
265 |                 {
266 |                     return Err(Error::Value(format!(
267 |                         "Unique value {} already exists for column {}",
268 |                         value, self.name
269 |                     )));
270 |                 }
271 |             }
272 |         }
273 | 
274 |         Ok(())
275 |     }
276 | }
277 | 
278 | impl Display for Column {
279 |     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
280 |         let mut sql = format_ident(&self.name);
281 |         sql += &format!(" {}", self.datatype);
282 |         if self.primary_key {
283 |             sql += " PRIMARY KEY";
284 |         }
285 |         if !self.nullable && !self.primary_key {
286 |             sql += " NOT NULL";
287 |         }
288 |         if let Some(default) = &self.default {
289 |             sql += &format!(" DEFAULT {}", default);
290 |         }
291 |         if self.unique && !self.primary_key {
292 |             sql += " UNIQUE";
293 |         }
294 |         if let Some(reference) = &self.references {
295 |             sql += &format!(" REFERENCES {}", reference);
296 |         }
297 |         if self.index {
298 |             sql += " INDEX";
299 |         }
300 |         write!(f, "{}", sql)
301 |     }
302 | }
303 | 


--------------------------------------------------------------------------------
/src/sql/types/mod.rs:
--------------------------------------------------------------------------------
  1 | mod expression;
  2 | pub use expression::Expression;
  3 | 
  4 | use crate::error::{Error, Result};
  5 | 
  6 | use serde_derive::{Deserialize, Serialize};
  7 | use std::borrow::Cow;
  8 | use std::cmp::Ordering;
  9 | use std::hash::{Hash, Hasher};
 10 | 
 11 | /// A datatype
 12 | #[derive(Clone, Debug, Hash, PartialEq, Serialize, Deserialize)]
 13 | pub enum DataType {
 14 |     Boolean,
 15 |     Integer,
 16 |     Float,
 17 |     String,
 18 | }
 19 | 
 20 | impl std::fmt::Display for DataType {
 21 |     fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
 22 |         f.write_str(match self {
 23 |             Self::Boolean => "BOOLEAN",
 24 |             Self::Integer => "INTEGER",
 25 |             Self::Float => "FLOAT",
 26 |             Self::String => "STRING",
 27 |         })
 28 |     }
 29 | }
 30 | 
 31 | /// A specific value of a data type
 32 | #[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
 33 | pub enum Value {
 34 |     Null,
 35 |     Boolean(bool),
 36 |     Integer(i64),
 37 |     Float(f64),
 38 |     String(String),
 39 | }
 40 | 
 41 | impl std::cmp::Eq for Value {}
 42 | 
 43 | #[allow(clippy::derived_hash_with_manual_eq)]
 44 | impl Hash for Value {
 45 |     fn hash<H: Hasher>(&self, state: &mut H) {
 46 |         self.datatype().hash(state);
 47 |         match self {
 48 |             Value::Null => self.hash(state),
 49 |             Value::Boolean(v) => v.hash(state),
 50 |             Value::Integer(v) => v.hash(state),
 51 |             Value::Float(v) => v.to_be_bytes().hash(state),
 52 |             Value::String(v) => v.hash(state),
 53 |         }
 54 |     }
 55 | }
 56 | 
 57 | impl<'a> From<Value> for Cow<'a, Value> {
 58 |     fn from(v: Value) -> Self {
 59 |         Cow::Owned(v)
 60 |     }
 61 | }
 62 | 
 63 | impl<'a> From<&'a Value> for Cow<'a, Value> {
 64 |     fn from(v: &'a Value) -> Self {
 65 |         Cow::Borrowed(v)
 66 |     }
 67 | }
 68 | 
 69 | impl Value {
 70 |     /// Returns the value's datatype, or None for null values
 71 |     pub fn datatype(&self) -> Option<DataType> {
 72 |         match self {
 73 |             Self::Null => None,
 74 |             Self::Boolean(_) => Some(DataType::Boolean),
 75 |             Self::Integer(_) => Some(DataType::Integer),
 76 |             Self::Float(_) => Some(DataType::Float),
 77 |             Self::String(_) => Some(DataType::String),
 78 |         }
 79 |     }
 80 | 
 81 |     /// Returns the inner boolean, or an error if not a boolean
 82 |     pub fn boolean(self) -> Result<bool> {
 83 |         match self {
 84 |             Self::Boolean(b) => Ok(b),
 85 |             v => Err(Error::Value(format!("Not a boolean: {:?}", v))),
 86 |         }
 87 |     }
 88 | 
 89 |     /// Returns the inner float, or an error if not a float
 90 |     pub fn float(self) -> Result<f64> {
 91 |         match self {
 92 |             Self::Float(f) => Ok(f),
 93 |             v => Err(Error::Value(format!("Not a float: {:?}", v))),
 94 |         }
 95 |     }
 96 | 
 97 |     /// Returns the inner integer, or an error if not an integer
 98 |     pub fn integer(self) -> Result<i64> {
 99 |         match self {
100 |             Self::Integer(i) => Ok(i),
101 |             v => Err(Error::Value(format!("Not an integer: {:?}", v))),
102 |         }
103 |     }
104 | 
105 |     /// Returns the inner string, or an error if not a string
106 |     pub fn string(self) -> Result<String> {
107 |         match self {
108 |             Self::String(s) => Ok(s),
109 |             v => Err(Error::Value(format!("Not a string: {:?}", v))),
110 |         }
111 |     }
112 | }
113 | 
114 | impl std::fmt::Display for Value {
115 |     fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
116 |         f.write_str(
117 |             match self {
118 |                 Self::Null => "NULL".to_string(),
119 |                 Self::Boolean(b) if *b => "TRUE".to_string(),
120 |                 Self::Boolean(_) => "FALSE".to_string(),
121 |                 Self::Integer(i) => i.to_string(),
122 |                 Self::Float(f) => f.to_string(),
123 |                 Self::String(s) => s.clone(),
124 |             }
125 |             .as_ref(),
126 |         )
127 |     }
128 | }
129 | 
130 | impl PartialOrd for Value {
131 |     fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
132 |         match (self, other) {
133 |             (Self::Null, Self::Null) => Some(Ordering::Equal),
134 |             (Self::Null, _) => Some(Ordering::Less),
135 |             (_, Self::Null) => Some(Ordering::Greater),
136 |             (Self::Boolean(a), Self::Boolean(b)) => a.partial_cmp(b),
137 |             (Self::Float(a), Self::Float(b)) => a.partial_cmp(b),
138 |             (Self::Float(a), Self::Integer(b)) => a.partial_cmp(&(*b as f64)),
139 |             (Self::Integer(a), Self::Float(b)) => (*a as f64).partial_cmp(b),
140 |             (Self::Integer(a), Self::Integer(b)) => a.partial_cmp(b),
141 |             (Self::String(a), Self::String(b)) => a.partial_cmp(b),
142 |             (_, _) => None,
143 |         }
144 |     }
145 | }
146 | 
147 | impl From<bool> for Value {
148 |     fn from(v: bool) -> Self {
149 |         Value::Boolean(v)
150 |     }
151 | }
152 | 
153 | impl From<f64> for Value {
154 |     fn from(v: f64) -> Self {
155 |         Value::Float(v)
156 |     }
157 | }
158 | 
159 | impl From<i64> for Value {
160 |     fn from(v: i64) -> Self {
161 |         Value::Integer(v)
162 |     }
163 | }
164 | 
165 | impl From<String> for Value {
166 |     fn from(v: String) -> Self {
167 |         Value::String(v)
168 |     }
169 | }
170 | 
171 | impl From<&str> for Value {
172 |     fn from(v: &str) -> Self {
173 |         Value::String(v.to_owned())
174 |     }
175 | }
176 | 
177 | /// A row of values
178 | pub type Row = Vec<Value>;
179 | 
180 | /// A row iterator
181 | pub type Rows = Box<dyn Iterator<Item = Result<Row>> + Send>;
182 | 
183 | /// A column (in a result set, see schema::Column for table columns)
184 | #[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
185 | pub struct Column {
186 |     pub name: Option<String>,
187 | }
188 | 
189 | /// A set of columns
190 | pub type Columns = Vec<Column>;
191 | 


--------------------------------------------------------------------------------
/src/storage/bincode.rs:
--------------------------------------------------------------------------------
 1 | //! Bincode is used to encode values. For details, see:
 2 | //! https://github.com/bincode-org/bincode
 3 | //!
 4 | //! By default, the bincode::(de)serialize functions use fixed-length integer
 5 | //! encoding, despite DefaultOptions using variable-length encoding. This module
 6 | //! provides simple wrappers for these functions that use variable-length
 7 | //! encoding and the other defaults.
 8 | 
 9 | use crate::error::Result;
10 | 
11 | use bincode::Options;
12 | use lazy_static::lazy_static;
13 | 
14 | lazy_static! {
15 |     /// Create a static binding for the default Bincode options.
16 |     static ref BINCODE: bincode::DefaultOptions = bincode::DefaultOptions::new();
17 | }
18 | 
19 | /// Deserializes a value using Bincode.
20 | pub fn deserialize<'de, T: serde::Deserialize<'de>>(bytes: &'de [u8]) -> Result<T> {
21 |     Ok(BINCODE.deserialize(bytes)?)
22 | }
23 | 
24 | /// Serializes a value using Bincode.
25 | pub fn serialize<T: serde::Serialize>(value: &T) -> Result<Vec<u8>> {
26 |     Ok(BINCODE.serialize(value)?)
27 | }
28 | 


--------------------------------------------------------------------------------
/src/storage/debug.rs:
--------------------------------------------------------------------------------
  1 | //! Storage debug helpers, primarily formatting of raw engine data.
  2 | 
  3 | use std::collections::HashSet;
  4 | 
  5 | use super::bincode;
  6 | use super::mvcc::{self, TransactionState};
  7 | use crate::error::Result;
  8 | 
  9 | /// Formats a raw byte string, either as a UTF-8 string (if valid and
 10 | /// printable), otherwise hex-encoded.
 11 | pub fn format_raw(v: &[u8]) -> String {
 12 |     if v.is_empty() {
 13 |         return String::from("[]");
 14 |     }
 15 |     if let Ok(s) = String::from_utf8(v.to_vec()) {
 16 |         if s.chars().all(|c| !c.is_control()) {
 17 |             return format!(r#""{}""#, s);
 18 |         }
 19 |     }
 20 |     format!("0x{}", hex::encode(v))
 21 | }
 22 | 
 23 | /// Formats a transaction state.
 24 | pub fn format_txn(state: &TransactionState) -> String {
 25 |     format!(
 26 |         "v{} {} active={}",
 27 |         state.version,
 28 |         if state.read_only { "read-only" } else { "read-write" },
 29 |         format_hashset(&state.active)
 30 |     )
 31 | }
 32 | 
 33 | /// Formats a HashSet with sorted elements.
 34 | pub fn format_hashset<T: Copy + Ord + std::fmt::Display>(set: &HashSet<T>) -> String {
 35 |     let mut elements: Vec<T> = set.iter().copied().collect();
 36 |     elements.sort();
 37 |     let elements: Vec<String> = elements.into_iter().map(|v| v.to_string()).collect();
 38 |     format!("{{{}}}", elements.join(","))
 39 | }
 40 | 
 41 | /// Formats a raw engine key/value pair, or just the key if the value is None.
 42 | /// Attempts to decode known MVCC key formats and values.
 43 | pub fn format_key_value(key: &[u8], value: &Option<Vec<u8>>) -> (String, Option<String>) {
 44 |     // Default to string/hex formatting of the raw key and value.
 45 |     let mut fkey = format_raw(key);
 46 |     let mut fvalue = value.as_ref().map(|v| format_raw(v.as_slice()));
 47 | 
 48 |     // Try to decode MVCC keys and values.
 49 |     if let Ok(key) = mvcc::Key::decode(key) {
 50 |         // Use the debug formatting of the key, unless we need more.
 51 |         fkey = format!("{:?}", key);
 52 | 
 53 |         match key {
 54 |             mvcc::Key::NextVersion => {
 55 |                 if let Some(ref v) = value {
 56 |                     if let Ok(v) = bincode::deserialize::<u64>(v) {
 57 |                         fvalue = Some(format!("{}", v))
 58 |                     }
 59 |                 }
 60 |             }
 61 |             mvcc::Key::TxnActive(_) => {}
 62 |             mvcc::Key::TxnActiveSnapshot(_) => {
 63 |                 if let Some(ref v) = value {
 64 |                     if let Ok(active) = bincode::deserialize::<HashSet<u64>>(v) {
 65 |                         fvalue = Some(format_hashset(&active));
 66 |                     }
 67 |                 }
 68 |             }
 69 |             mvcc::Key::TxnWrite(version, userkey) => {
 70 |                 fkey = format!("TxnWrite({}, {})", version, format_raw(&userkey))
 71 |             }
 72 |             mvcc::Key::Version(userkey, version) => {
 73 |                 fkey = format!("Version({}, {})", format_raw(&userkey), version);
 74 |                 if let Some(ref v) = value {
 75 |                     match bincode::deserialize(v) {
 76 |                         Ok(Some(v)) => fvalue = Some(format_raw(v)),
 77 |                         Ok(None) => fvalue = Some(String::from("None")),
 78 |                         Err(_) => {}
 79 |                     }
 80 |                 }
 81 |             }
 82 |             mvcc::Key::Unversioned(userkey) => {
 83 |                 fkey = format!("Unversioned({})", format_raw(&userkey));
 84 |             }
 85 |         }
 86 |     }
 87 | 
 88 |     (fkey, fvalue)
 89 | }
 90 | 
 91 | /// A debug storage engine, which wraps another engine and logs mutations.
 92 | pub struct Engine<E: super::engine::Engine> {
 93 |     /// The wrapped engine.
 94 |     inner: E,
 95 |     /// Write log as key/value tuples. Value is None for deletes.
 96 |     write_log: Vec<(Vec<u8>, Option<Vec<u8>>)>,
 97 | }
 98 | 
 99 | impl<E: super::engine::Engine> std::fmt::Display for Engine<E> {
100 |     fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
101 |         write!(f, "debug:{}", self.inner)
102 |     }
103 | }
104 | 
105 | impl<E: super::engine::Engine> Engine<E> {
106 |     pub fn new(inner: E) -> Self {
107 |         Self { inner, write_log: Vec::new() }
108 |     }
109 | 
110 |     /// Returns and resets the write log. The next call only returns new writes.
111 |     pub fn take_write_log(&mut self) -> Vec<(Vec<u8>, Option<Vec<u8>>)> {
112 |         let mut write_log = Vec::new();
113 |         std::mem::swap(&mut write_log, &mut self.write_log);
114 |         write_log
115 |     }
116 | }
117 | 
118 | impl<E: super::engine::Engine> super::engine::Engine for Engine<E> {
119 |     type ScanIterator<'a> = E::ScanIterator<'a> where E: 'a;
120 | 
121 |     fn flush(&mut self) -> Result<()> {
122 |         self.inner.flush()
123 |     }
124 | 
125 |     fn delete(&mut self, key: &[u8]) -> Result<()> {
126 |         self.inner.delete(key)?;
127 |         self.write_log.push((key.to_vec(), None));
128 |         Ok(())
129 |     }
130 | 
131 |     fn get(&mut self, key: &[u8]) -> Result<Option<Vec<u8>>> {
132 |         self.inner.get(key)
133 |     }
134 | 
135 |     fn scan<R: std::ops::RangeBounds<Vec<u8>>>(&mut self, range: R) -> Self::ScanIterator<'_> {
136 |         self.inner.scan(range)
137 |     }
138 | 
139 |     fn set(&mut self, key: &[u8], value: Vec<u8>) -> Result<()> {
140 |         self.inner.set(key, value.clone())?;
141 |         self.write_log.push((key.to_vec(), Some(value)));
142 |         Ok(())
143 |     }
144 | 
145 |     fn status(&mut self) -> Result<super::engine::Status> {
146 |         self.inner.status()
147 |     }
148 | }
149 | 


--------------------------------------------------------------------------------
/src/storage/engine/memory.rs:
--------------------------------------------------------------------------------
  1 | use super::{Engine, Status};
  2 | use crate::error::Result;
  3 | 
  4 | /// An in-memory key/value storage engine using the Rust standard library B-tree
  5 | /// implementation. Data is not persisted.
  6 | /// This engine is primarily used for testing and scenarios where persistence is not required.
  7 | /// It provides a simple and fast key-value store with all data held in memory.
  8 | pub struct Memory {
  9 |     data: std::collections::BTreeMap<Vec<u8>, Vec<u8>>,
 10 | }
 11 | 
 12 | impl Memory {
 13 |     /// Creates a new Memory key-value storage engine.
 14 |     /// Initializes an empty BTreeMap to hold the key-value data in memory.
 15 |     pub fn new() -> Self {
 16 |         Self { data: std::collections::BTreeMap::new() }
 17 |     }
 18 | }
 19 | 
 20 | impl std::fmt::Display for Memory {
 21 |     fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
 22 |         // Display the name of the engine when printed.
 23 |         write!(f, "memory")
 24 |     }
 25 | }
 26 | 
 27 | impl Engine for Memory {
 28 |     type ScanIterator<'a> = ScanIterator<'a>;
 29 | 
 30 |     fn flush(&mut self) -> Result<()> {
 31 |         // Flushing is a no-op for the in-memory engine, as there is no disk I/O.
 32 |         Ok(())
 33 |     }
 34 | 
 35 |     fn delete(&mut self, key: &[u8]) -> Result<()> {
 36 |         // Remove the key from the BTreeMap, effectively deleting it from the in-memory store.
 37 |         self.data.remove(key);
 38 |         Ok(())
 39 |     }
 40 | 
 41 |     fn get(&mut self, key: &[u8]) -> Result<Option<Vec<u8>>> {
 42 |         // Retrieve the value associated with the key from the BTreeMap, if it exists.
 43 |         Ok(self.data.get(key).cloned())
 44 |     }
 45 | 
 46 |     fn scan<R: std::ops::RangeBounds<Vec<u8>>>(&mut self, range: R) -> Self::ScanIterator<'_> {
 47 |         // Create an iterator that scans over the range of keys specified, using the BTreeMap's range function.
 48 |         ScanIterator { inner: self.data.range(range) }
 49 |     }
 50 | 
 51 |     fn set(&mut self, key: &[u8], value: Vec<u8>) -> Result<()> {
 52 |         // Insert the key-value pair into the BTreeMap, updating the value if the key already exists.
 53 |         self.data.insert(key.to_vec(), value);
 54 |         Ok(())
 55 |     }
 56 | 
 57 |     fn status(&mut self) -> Result<Status> {
 58 |         // Generate a status report containing the engine name, number of keys, and size of all keys and values.
 59 |         Ok(Status {
 60 |             name: self.to_string(),
 61 |             keys: self.data.len() as u64,
 62 |             size: self.data.iter().fold(0, |size, (k, v)| size + k.len() as u64 + v.len() as u64),
 63 |             total_disk_size: 0,
 64 |             live_disk_size: 0,
 65 |             garbage_disk_size: 0,
 66 |         })
 67 |     }
 68 | }
 69 | 
 70 | pub struct ScanIterator<'a> {
 71 |     inner: std::collections::btree_map::Range<'a, Vec<u8>, Vec<u8>>,
 72 | }
 73 | 
 74 | impl<'a> ScanIterator<'a> {
 75 |     fn map(item: (&Vec<u8>, &Vec<u8>)) -> <Self as Iterator>::Item {
 76 |         // Map the key and value references to owned clones for the iterator to yield.
 77 |         let (key, value) = item;
 78 |         Ok((key.clone(), value.clone()))
 79 |     }
 80 | }
 81 | 
 82 | impl<'a> Iterator for ScanIterator<'a> {
 83 |     type Item = Result<(Vec<u8>, Vec<u8>)>;
 84 | 
 85 |     fn next(&mut self) -> Option<Self::Item> {
 86 |         // Advance the iterator and return the next key-value pair, cloning the data from the BTreeMap.
 87 |         self.inner.next().map(Self::map)
 88 |     }
 89 | }
 90 | 
 91 | impl<'a> DoubleEndedIterator for ScanIterator<'a> {
 92 |     fn next_back(&mut self) -> Option<Self::Item> {
 93 |         // Advance the iterator in reverse and return the previous key-value pair, cloning the data from the BTreeMap.
 94 |         self.inner.next_back().map(Self::map)
 95 |     }
 96 | }
 97 | 
 98 | #[cfg(test)]
 99 | mod tests {
100 |     // This module contains tests for the Memory storage engine.
101 |     use super::*;
102 | 
103 |     // Run the generic engine tests using an instance of the Memory engine.
104 |     super::super::tests::test_engine!(Memory::new());
105 | }
106 | 


--------------------------------------------------------------------------------
/src/storage/golden/bitcask/compact-after:
--------------------------------------------------------------------------------
 1 | entry = 0, offset 0
 2 | klen  = 0 [0, 0, 0, 0]
 3 | vlen  = 0 [0, 0, 0, 0]
 4 | key   = "" []
 5 | value = "" []
 6 | 
 7 | entry = 1, offset 8
 8 | klen  = 1 [0, 0, 0, 1]
 9 | vlen  = 1 [0, 0, 0, 1]
10 | key   = "a" [61]
11 | value = [1]
12 | 
13 | entry = 2, offset 18
14 | klen  = 1 [0, 0, 0, 1]
15 | vlen  = 1 [0, 0, 0, 1]
16 | key   = "b" [62]
17 | value = [2]
18 | 
19 | entry = 3, offset 28
20 | klen  = 1 [0, 0, 0, 1]
21 | vlen  = 1 [0, 0, 0, 1]
22 | key   = "c" [63]
23 | value = [3]
24 | 
25 | entry = 4, offset 38
26 | klen  = 1 [0, 0, 0, 1]
27 | vlen  = 1 [0, 0, 0, 1]
28 | key   = "d" [64]
29 | value = [4]
30 | 
31 | 


--------------------------------------------------------------------------------
/src/storage/golden/bitcask/compact-before:
--------------------------------------------------------------------------------
 1 | entry = 0, offset 0
 2 | klen  = 1 [0, 0, 0, 1]
 3 | vlen  = 1 [0, 0, 0, 1]
 4 | key   = "b" [62]
 5 | value = [1]
 6 | 
 7 | entry = 1, offset 10
 8 | klen  = 1 [0, 0, 0, 1]
 9 | vlen  = 1 [0, 0, 0, 1]
10 | key   = "b" [62]
11 | value = [2]
12 | 
13 | entry = 2, offset 20
14 | klen  = 1 [0, 0, 0, 1]
15 | vlen  = 1 [0, 0, 0, 1]
16 | key   = "e" [65]
17 | value = [5]
18 | 
19 | entry = 3, offset 30
20 | klen  = 1 [0, 0, 0, 1]
21 | vlen  = -1 [ff, ff, ff, ff]
22 | key   = "e" [65]
23 | value = tombstone []
24 | 
25 | entry = 4, offset 39
26 | klen  = 1 [0, 0, 0, 1]
27 | vlen  = 1 [0, 0, 0, 1]
28 | key   = "c" [63]
29 | value = [0]
30 | 
31 | entry = 5, offset 49
32 | klen  = 1 [0, 0, 0, 1]
33 | vlen  = -1 [ff, ff, ff, ff]
34 | key   = "c" [63]
35 | value = tombstone []
36 | 
37 | entry = 6, offset 58
38 | klen  = 1 [0, 0, 0, 1]
39 | vlen  = 1 [0, 0, 0, 1]
40 | key   = "c" [63]
41 | value = [3]
42 | 
43 | entry = 7, offset 68
44 | klen  = 0 [0, 0, 0, 0]
45 | vlen  = 0 [0, 0, 0, 0]
46 | key   = "" []
47 | value = "" []
48 | 
49 | entry = 8, offset 76
50 | klen  = 1 [0, 0, 0, 1]
51 | vlen  = 1 [0, 0, 0, 1]
52 | key   = "a" [61]
53 | value = [1]
54 | 
55 | entry = 9, offset 86
56 | klen  = 1 [0, 0, 0, 1]
57 | vlen  = -1 [ff, ff, ff, ff]
58 | key   = "f" [66]
59 | value = tombstone []
60 | 
61 | entry = 10, offset 95
62 | klen  = 1 [0, 0, 0, 1]
63 | vlen  = -1 [ff, ff, ff, ff]
64 | key   = "d" [64]
65 | value = tombstone []
66 | 
67 | entry = 11, offset 104
68 | klen  = 1 [0, 0, 0, 1]
69 | vlen  = 1 [0, 0, 0, 1]
70 | key   = "d" [64]
71 | value = [4]
72 | 
73 | 


--------------------------------------------------------------------------------
/src/storage/golden/bitcask/log:
--------------------------------------------------------------------------------
 1 | entry = 0, offset 0
 2 | klen  = 1 [0, 0, 0, 1]
 3 | vlen  = 1 [0, 0, 0, 1]
 4 | key   = "b" [62]
 5 | value = [1]
 6 | 
 7 | entry = 1, offset 10
 8 | klen  = 1 [0, 0, 0, 1]
 9 | vlen  = 1 [0, 0, 0, 1]
10 | key   = "b" [62]
11 | value = [2]
12 | 
13 | entry = 2, offset 20
14 | klen  = 1 [0, 0, 0, 1]
15 | vlen  = 1 [0, 0, 0, 1]
16 | key   = "e" [65]
17 | value = [5]
18 | 
19 | entry = 3, offset 30
20 | klen  = 1 [0, 0, 0, 1]
21 | vlen  = -1 [ff, ff, ff, ff]
22 | key   = "e" [65]
23 | value = tombstone []
24 | 
25 | entry = 4, offset 39
26 | klen  = 1 [0, 0, 0, 1]
27 | vlen  = 1 [0, 0, 0, 1]
28 | key   = "c" [63]
29 | value = [0]
30 | 
31 | entry = 5, offset 49
32 | klen  = 1 [0, 0, 0, 1]
33 | vlen  = -1 [ff, ff, ff, ff]
34 | key   = "c" [63]
35 | value = tombstone []
36 | 
37 | entry = 6, offset 58
38 | klen  = 1 [0, 0, 0, 1]
39 | vlen  = 1 [0, 0, 0, 1]
40 | key   = "c" [63]
41 | value = [3]
42 | 
43 | entry = 7, offset 68
44 | klen  = 0 [0, 0, 0, 0]
45 | vlen  = 0 [0, 0, 0, 0]
46 | key   = "" []
47 | value = "" []
48 | 
49 | entry = 8, offset 76
50 | klen  = 1 [0, 0, 0, 1]
51 | vlen  = 1 [0, 0, 0, 1]
52 | key   = "a" [61]
53 | value = [1]
54 | 
55 | entry = 9, offset 86
56 | klen  = 1 [0, 0, 0, 1]
57 | vlen  = -1 [ff, ff, ff, ff]
58 | key   = "f" [66]
59 | value = tombstone []
60 | 
61 | entry = 10, offset 95
62 | klen  = 1 [0, 0, 0, 1]
63 | vlen  = -1 [ff, ff, ff, ff]
64 | key   = "d" [64]
65 | value = tombstone []
66 | 
67 | entry = 11, offset 104
68 | klen  = 1 [0, 0, 0, 1]
69 | vlen  = 1 [0, 0, 0, 1]
70 | key   = "d" [64]
71 | value = [4]
72 | 
73 | 


--------------------------------------------------------------------------------
/src/storage/golden/mvcc/anomaly_dirty_read:
--------------------------------------------------------------------------------
 1 | T1: begin → v1 read-write active={}
 2 |     set NextVersion = 2
 3 |     set TxnActive(1) = []
 4 | 
 5 | T1: set "key" = 0x01
 6 |     set TxnWrite(1, "key") = []
 7 |     set Version("key", 1) = 0x01
 8 | 
 9 | T2: begin → v2 read-write active={1}
10 |     set NextVersion = 3
11 |     set TxnActiveSnapshot(2) = {1}
12 |     set TxnActive(2) = []
13 | 
14 | T2: get "key" → None
15 | 
16 | Engine state:
17 | NextVersion = 3
18 | TxnActive(1) = []
19 | TxnActive(2) = []
20 | TxnActiveSnapshot(2) = {1}
21 | TxnWrite(1, "key") = []
22 | Version("key", 1) = 0x01
23 | 


--------------------------------------------------------------------------------
/src/storage/golden/mvcc/anomaly_dirty_write:
--------------------------------------------------------------------------------
 1 | T1: begin → v1 read-write active={}
 2 |     set NextVersion = 2
 3 |     set TxnActive(1) = []
 4 | 
 5 | T1: set "key" = 0x01
 6 |     set TxnWrite(1, "key") = []
 7 |     set Version("key", 1) = 0x01
 8 | 
 9 | T2: begin → v2 read-write active={1}
10 |     set NextVersion = 3
11 |     set TxnActiveSnapshot(2) = {1}
12 |     set TxnActive(2) = []
13 | 
14 | T2: set "key" = 0x02 → Error::Serialization
15 | 
16 | Engine state:
17 | NextVersion = 3
18 | TxnActive(1) = []
19 | TxnActive(2) = []
20 | TxnActiveSnapshot(2) = {1}
21 | TxnWrite(1, "key") = []
22 | Version("key", 1) = 0x01
23 | 


--------------------------------------------------------------------------------
/src/storage/golden/mvcc/anomaly_fuzzy_read:
--------------------------------------------------------------------------------
 1 | Engine state:
 2 | NextVersion = 2
 3 | Version("key", 1) = 0x00
 4 | 
 5 | T1: begin → v2 read-write active={}
 6 |     set NextVersion = 3
 7 |     set TxnActive(2) = []
 8 | 
 9 | T2: begin → v3 read-write active={2}
10 |     set NextVersion = 4
11 |     set TxnActiveSnapshot(3) = {2}
12 |     set TxnActive(3) = []
13 | 
14 | T2: get "key" → 0x00
15 | 
16 | T1: set "key" = "t1"
17 |     set TxnWrite(2, "key") = []
18 |     set Version("key", 2) = "t1"
19 | 
20 | T1: commit
21 |     del TxnWrite(2, "key")
22 |     del TxnActive(2)
23 | 
24 | T2: get "key" → 0x00
25 | 
26 | Engine state:
27 | NextVersion = 4
28 | TxnActive(3) = []
29 | TxnActiveSnapshot(3) = {2}
30 | Version("key", 1) = 0x00
31 | Version("key", 2) = "t1"
32 | 


--------------------------------------------------------------------------------
/src/storage/golden/mvcc/anomaly_lost_update:
--------------------------------------------------------------------------------
 1 | Engine state:
 2 | NextVersion = 2
 3 | Version("key", 1) = 0x00
 4 | 
 5 | T1: begin → v2 read-write active={}
 6 |     set NextVersion = 3
 7 |     set TxnActive(2) = []
 8 | 
 9 | T2: begin → v3 read-write active={2}
10 |     set NextVersion = 4
11 |     set TxnActiveSnapshot(3) = {2}
12 |     set TxnActive(3) = []
13 | 
14 | T1: get "key" → 0x00
15 | 
16 | T2: get "key" → 0x00
17 | 
18 | T1: set "key" = 0x01
19 |     set TxnWrite(2, "key") = []
20 |     set Version("key", 2) = 0x01
21 | 
22 | T2: set "key" = 0x02 → Error::Serialization
23 | 
24 | T1: commit
25 |     del TxnWrite(2, "key")
26 |     del TxnActive(2)
27 | 
28 | Engine state:
29 | NextVersion = 4
30 | TxnActive(3) = []
31 | TxnActiveSnapshot(3) = {2}
32 | Version("key", 1) = 0x00
33 | Version("key", 2) = 0x01
34 | 


--------------------------------------------------------------------------------
/src/storage/golden/mvcc/anomaly_phantom_read:
--------------------------------------------------------------------------------
 1 | Engine state:
 2 | NextVersion = 2
 3 | Version("a", 1) = 0x00
 4 | Version("ba", 1) = 0x00
 5 | Version("bb", 1) = 0x00
 6 | 
 7 | T1: begin → v2 read-write active={}
 8 |     set NextVersion = 3
 9 |     set TxnActive(2) = []
10 | 
11 | T2: begin → v3 read-write active={2}
12 |     set NextVersion = 4
13 |     set TxnActiveSnapshot(3) = {2}
14 |     set TxnActive(3) = []
15 | 
16 | T1: scan prefix "b"
17 |     "ba" = 0x00
18 |     "bb" = 0x00
19 | 
20 | T2: del "ba"
21 |     set TxnWrite(3, "ba") = []
22 |     set Version("ba", 3) = None
23 | 
24 | T2: set "bc" = 0x02
25 |     set TxnWrite(3, "bc") = []
26 |     set Version("bc", 3) = 0x02
27 | 
28 | T2: commit
29 |     del TxnWrite(3, "ba")
30 |     del TxnWrite(3, "bc")
31 |     del TxnActive(3)
32 | 
33 | T1: scan prefix "b"
34 |     "ba" = 0x00
35 |     "bb" = 0x00
36 | 
37 | Engine state:
38 | NextVersion = 4
39 | TxnActive(2) = []
40 | TxnActiveSnapshot(3) = {2}
41 | Version("a", 1) = 0x00
42 | Version("ba", 1) = 0x00
43 | Version("ba", 3) = None
44 | Version("bb", 1) = 0x00
45 | Version("bc", 3) = 0x02
46 | 


--------------------------------------------------------------------------------
/src/storage/golden/mvcc/anomaly_read_skew:
--------------------------------------------------------------------------------
 1 | Engine state:
 2 | NextVersion = 2
 3 | Version("a", 1) = 0x00
 4 | Version("b", 1) = 0x00
 5 | 
 6 | T1: begin → v2 read-write active={}
 7 |     set NextVersion = 3
 8 |     set TxnActive(2) = []
 9 | 
10 | T2: begin → v3 read-write active={2}
11 |     set NextVersion = 4
12 |     set TxnActiveSnapshot(3) = {2}
13 |     set TxnActive(3) = []
14 | 
15 | T1: get "a" → 0x00
16 | 
17 | T2: set "a" = 0x02
18 |     set TxnWrite(3, "a") = []
19 |     set Version("a", 3) = 0x02
20 | 
21 | T2: set "b" = 0x02
22 |     set TxnWrite(3, "b") = []
23 |     set Version("b", 3) = 0x02
24 | 
25 | T2: commit
26 |     del TxnWrite(3, "a")
27 |     del TxnWrite(3, "b")
28 |     del TxnActive(3)
29 | 
30 | T1: get "a" → 0x00
31 | 
32 | Engine state:
33 | NextVersion = 4
34 | TxnActive(2) = []
35 | TxnActiveSnapshot(3) = {2}
36 | Version("a", 1) = 0x00
37 | Version("a", 3) = 0x02
38 | Version("b", 1) = 0x00
39 | Version("b", 3) = 0x02
40 | 


--------------------------------------------------------------------------------
/src/storage/golden/mvcc/anomaly_write_skew:
--------------------------------------------------------------------------------
 1 | Engine state:
 2 | NextVersion = 2
 3 | Version("a", 1) = 0x01
 4 | Version("b", 1) = 0x02
 5 | 
 6 | T1: begin → v2 read-write active={}
 7 |     set NextVersion = 3
 8 |     set TxnActive(2) = []
 9 | 
10 | T2: begin → v3 read-write active={2}
11 |     set NextVersion = 4
12 |     set TxnActiveSnapshot(3) = {2}
13 |     set TxnActive(3) = []
14 | 
15 | T1: get "a" → 0x01
16 | 
17 | T2: get "b" → 0x02
18 | 
19 | T1: set "b" = 0x01
20 |     set TxnWrite(2, "b") = []
21 |     set Version("b", 2) = 0x01
22 | 
23 | T2: set "a" = 0x02
24 |     set TxnWrite(3, "a") = []
25 |     set Version("a", 3) = 0x02
26 | 
27 | T1: commit
28 |     del TxnWrite(2, "b")
29 |     del TxnActive(2)
30 | 
31 | T2: commit
32 |     del TxnWrite(3, "a")
33 |     del TxnActive(3)
34 | 
35 | Engine state:
36 | NextVersion = 4
37 | TxnActiveSnapshot(3) = {2}
38 | Version("a", 1) = 0x01
39 | Version("a", 3) = 0x02
40 | Version("b", 1) = 0x02
41 | Version("b", 2) = 0x01
42 | 


--------------------------------------------------------------------------------
/src/storage/golden/mvcc/begin:
--------------------------------------------------------------------------------
 1 | T1: begin → v1 read-write active={}
 2 |     set NextVersion = 2
 3 |     set TxnActive(1) = []
 4 | 
 5 | T2: begin → v2 read-write active={1}
 6 |     set NextVersion = 3
 7 |     set TxnActiveSnapshot(2) = {1}
 8 |     set TxnActive(2) = []
 9 | 
10 | T3: begin → v3 read-write active={1,2}
11 |     set NextVersion = 4
12 |     set TxnActiveSnapshot(3) = {1,2}
13 |     set TxnActive(3) = []
14 | 
15 | T2: commit
16 |     del TxnActive(2)
17 | 
18 | T4: begin → v4 read-write active={1,3}
19 |     set NextVersion = 5
20 |     set TxnActiveSnapshot(4) = {1,3}
21 |     set TxnActive(4) = []
22 | 
23 | Engine state:
24 | NextVersion = 5
25 | TxnActive(1) = []
26 | TxnActive(3) = []
27 | TxnActive(4) = []
28 | TxnActiveSnapshot(2) = {1}
29 | TxnActiveSnapshot(3) = {1,2}
30 | TxnActiveSnapshot(4) = {1,3}
31 | 


--------------------------------------------------------------------------------
/src/storage/golden/mvcc/begin_as_of:
--------------------------------------------------------------------------------
 1 | T1: begin → v1 read-write active={}
 2 |     set NextVersion = 2
 3 |     set TxnActive(1) = []
 4 | 
 5 | T1: set "other" = 0x01
 6 |     set TxnWrite(1, "other") = []
 7 |     set Version("other", 1) = 0x01
 8 | 
 9 | T2: begin → v2 read-write active={1}
10 |     set NextVersion = 3
11 |     set TxnActiveSnapshot(2) = {1}
12 |     set TxnActive(2) = []
13 | 
14 | T2: set "key" = 0x02
15 |     set TxnWrite(2, "key") = []
16 |     set Version("key", 2) = 0x02
17 | 
18 | T2: commit
19 |     del TxnWrite(2, "key")
20 |     del TxnActive(2)
21 | 
22 | T3: begin → v3 read-write active={1}
23 |     set NextVersion = 4
24 |     set TxnActiveSnapshot(3) = {1}
25 |     set TxnActive(3) = []
26 | 
27 | T3: set "key" = 0x03
28 |     set TxnWrite(3, "key") = []
29 |     set Version("key", 3) = 0x03
30 | 
31 | T4: begin as of 3 → v3 read-only active={1}
32 | 
33 | T4: scan ..
34 |     "key" = 0x02
35 | 
36 | T4: set "foo" = 0x01 → Error::ReadOnly
37 | 
38 | T4: del "foo" → Error::ReadOnly
39 | 
40 | T1: commit
41 |     del TxnWrite(1, "other")
42 |     del TxnActive(1)
43 | 
44 | T3: commit
45 |     del TxnWrite(3, "key")
46 |     del TxnActive(3)
47 | 
48 | T4: scan ..
49 |     "key" = 0x02
50 | 
51 | T5: begin as of 3 → v3 read-only active={1}
52 | 
53 | T5: scan ..
54 |     "key" = 0x02
55 | 
56 | T4: rollback
57 | 
58 | T5: commit
59 | 
60 | T6: begin → v4 read-write active={}
61 |     set NextVersion = 5
62 |     set TxnActive(4) = []
63 | 
64 | T6: set "key" = 0x04
65 |     set TxnWrite(4, "key") = []
66 |     set Version("key", 4) = 0x04
67 | 
68 | T6: commit
69 |     del TxnWrite(4, "key")
70 |     del TxnActive(4)
71 | 
72 | T7: begin as of 4 → v4 read-only active={}
73 | 
74 | T7: scan ..
75 |     "key" = 0x03
76 |     "other" = 0x01
77 | 
78 | T8: begin as of 5 → Error::Value("Version 5 does not exist")
79 | 
80 | T9: begin as of 9 → Error::Value("Version 9 does not exist")
81 | 
82 | Engine state:
83 | NextVersion = 5
84 | TxnActiveSnapshot(2) = {1}
85 | TxnActiveSnapshot(3) = {1}
86 | Version("key", 2) = 0x02
87 | Version("key", 3) = 0x03
88 | Version("key", 4) = 0x04
89 | Version("other", 1) = 0x01
90 | 


--------------------------------------------------------------------------------
/src/storage/golden/mvcc/begin_read_only:
--------------------------------------------------------------------------------
 1 | T1: begin read-only → v1 read-only active={}
 2 | 
 3 | T1: set "foo" = 0x01 → Error::ReadOnly
 4 | 
 5 | T1: del "foo" → Error::ReadOnly
 6 | 
 7 | T2: begin → v1 read-write active={}
 8 |     set NextVersion = 2
 9 |     set TxnActive(1) = []
10 | 
11 | T3: begin read-only → v2 read-only active={1}
12 | 
13 | Engine state:
14 | NextVersion = 2
15 | TxnActive(1) = []
16 | 


--------------------------------------------------------------------------------
/src/storage/golden/mvcc/delete:
--------------------------------------------------------------------------------
 1 | Engine state:
 2 | NextVersion = 2
 3 | Version("key", 1) = 0x01
 4 | Version("tombstone", 1) = None
 5 | 
 6 | T1: begin → v2 read-write active={}
 7 |     set NextVersion = 3
 8 |     set TxnActive(2) = []
 9 | 
10 | T1: set "key" = 0x02
11 |     set TxnWrite(2, "key") = []
12 |     set Version("key", 2) = 0x02
13 | 
14 | T1: del "key"
15 |     set TxnWrite(2, "key") = []
16 |     set Version("key", 2) = None
17 | 
18 | T1: del "key"
19 |     set TxnWrite(2, "key") = []
20 |     set Version("key", 2) = None
21 | 
22 | T1: del "tombstone"
23 |     set TxnWrite(2, "tombstone") = []
24 |     set Version("tombstone", 2) = None
25 | 
26 | T1: del "missing"
27 |     set TxnWrite(2, "missing") = []
28 |     set Version("missing", 2) = None
29 | 
30 | T1: commit
31 |     del TxnWrite(2, "key")
32 |     del TxnWrite(2, "missing")
33 |     del TxnWrite(2, "tombstone")
34 |     del TxnActive(2)
35 | 
36 | Engine state:
37 | NextVersion = 3
38 | Version("key", 1) = 0x01
39 | Version("key", 2) = None
40 | Version("missing", 2) = None
41 | Version("tombstone", 1) = None
42 | Version("tombstone", 2) = None
43 | 


--------------------------------------------------------------------------------
/src/storage/golden/mvcc/delete_conflict:
--------------------------------------------------------------------------------
 1 | T1: begin → v1 read-write active={}
 2 |     set NextVersion = 2
 3 |     set TxnActive(1) = []
 4 | 
 5 | T2: begin → v2 read-write active={1}
 6 |     set NextVersion = 3
 7 |     set TxnActiveSnapshot(2) = {1}
 8 |     set TxnActive(2) = []
 9 | 
10 | T3: begin → v3 read-write active={1,2}
11 |     set NextVersion = 4
12 |     set TxnActiveSnapshot(3) = {1,2}
13 |     set TxnActive(3) = []
14 | 
15 | T4: begin → v4 read-write active={1,2,3}
16 |     set NextVersion = 5
17 |     set TxnActiveSnapshot(4) = {1,2,3}
18 |     set TxnActive(4) = []
19 | 
20 | T1: set "a" = 0x01
21 |     set TxnWrite(1, "a") = []
22 |     set Version("a", 1) = 0x01
23 | 
24 | T3: set "c" = 0x03
25 |     set TxnWrite(3, "c") = []
26 |     set Version("c", 3) = 0x03
27 | 
28 | T4: set "d" = 0x04
29 |     set TxnWrite(4, "d") = []
30 |     set Version("d", 4) = 0x04
31 | 
32 | T4: commit
33 |     del TxnWrite(4, "d")
34 |     del TxnActive(4)
35 | 
36 | T2: del "a" → Error::Serialization
37 | 
38 | T2: del "c" → Error::Serialization
39 | 
40 | T2: del "d" → Error::Serialization
41 | 
42 | Engine state:
43 | NextVersion = 5
44 | TxnActive(1) = []
45 | TxnActive(2) = []
46 | TxnActive(3) = []
47 | TxnActiveSnapshot(2) = {1}
48 | TxnActiveSnapshot(3) = {1,2}
49 | TxnActiveSnapshot(4) = {1,2,3}
50 | TxnWrite(1, "a") = []
51 | TxnWrite(3, "c") = []
52 | Version("a", 1) = 0x01
53 | Version("c", 3) = 0x03
54 | Version("d", 4) = 0x04
55 | 


--------------------------------------------------------------------------------
/src/storage/golden/mvcc/get:
--------------------------------------------------------------------------------
 1 | Engine state:
 2 | NextVersion = 3
 3 | Version("deleted", 1) = 0x01
 4 | Version("deleted", 2) = None
 5 | Version("key", 1) = 0x01
 6 | Version("tombstone", 1) = None
 7 | Version("updated", 1) = 0x01
 8 | Version("updated", 2) = 0x02
 9 | 
10 | T1: begin read-only → v3 read-only active={}
11 | 
12 | T1: get "key" → 0x01
13 | 
14 | T1: get "updated" → 0x02
15 | 
16 | T1: get "deleted" → None
17 | 
18 | T1: get "tombstone" → None
19 | 
20 | Engine state:
21 | NextVersion = 3
22 | Version("deleted", 1) = 0x01
23 | Version("deleted", 2) = None
24 | Version("key", 1) = 0x01
25 | Version("tombstone", 1) = None
26 | Version("updated", 1) = 0x01
27 | Version("updated", 2) = 0x02
28 | 


--------------------------------------------------------------------------------
/src/storage/golden/mvcc/get_isolation:
--------------------------------------------------------------------------------
 1 | T1: begin → v1 read-write active={}
 2 |     set NextVersion = 2
 3 |     set TxnActive(1) = []
 4 | 
 5 | T1: set "a" = 0x01
 6 |     set TxnWrite(1, "a") = []
 7 |     set Version("a", 1) = 0x01
 8 | 
 9 | T1: set "b" = 0x01
10 |     set TxnWrite(1, "b") = []
11 |     set Version("b", 1) = 0x01
12 | 
13 | T1: set "d" = 0x01
14 |     set TxnWrite(1, "d") = []
15 |     set Version("d", 1) = 0x01
16 | 
17 | T1: set "e" = 0x01
18 |     set TxnWrite(1, "e") = []
19 |     set Version("e", 1) = 0x01
20 | 
21 | T1: commit
22 |     del TxnWrite(1, "a")
23 |     del TxnWrite(1, "b")
24 |     del TxnWrite(1, "d")
25 |     del TxnWrite(1, "e")
26 |     del TxnActive(1)
27 | 
28 | T2: begin → v2 read-write active={}
29 |     set NextVersion = 3
30 |     set TxnActive(2) = []
31 | 
32 | T2: set "a" = 0x02
33 |     set TxnWrite(2, "a") = []
34 |     set Version("a", 2) = 0x02
35 | 
36 | T2: del "b"
37 |     set TxnWrite(2, "b") = []
38 |     set Version("b", 2) = None
39 | 
40 | T2: set "c" = 0x02
41 |     set TxnWrite(2, "c") = []
42 |     set Version("c", 2) = 0x02
43 | 
44 | T3: begin read-only → v3 read-only active={2}
45 | 
46 | T4: begin → v3 read-write active={2}
47 |     set NextVersion = 4
48 |     set TxnActiveSnapshot(3) = {2}
49 |     set TxnActive(3) = []
50 | 
51 | T4: set "d" = 0x03
52 |     set TxnWrite(3, "d") = []
53 |     set Version("d", 3) = 0x03
54 | 
55 | T4: del "e"
56 |     set TxnWrite(3, "e") = []
57 |     set Version("e", 3) = None
58 | 
59 | T4: set "f" = 0x03
60 |     set TxnWrite(3, "f") = []
61 |     set Version("f", 3) = 0x03
62 | 
63 | T4: commit
64 |     del TxnWrite(3, "d")
65 |     del TxnWrite(3, "e")
66 |     del TxnWrite(3, "f")
67 |     del TxnActive(3)
68 | 
69 | T3: get "a" → 0x01
70 | 
71 | T3: get "b" → 0x01
72 | 
73 | T3: get "c" → None
74 | 
75 | T3: get "d" → 0x01
76 | 
77 | T3: get "e" → 0x01
78 | 
79 | T3: get "f" → None
80 | 
81 | Engine state:
82 | NextVersion = 4
83 | TxnActive(2) = []
84 | TxnActiveSnapshot(3) = {2}
85 | TxnWrite(2, "a") = []
86 | TxnWrite(2, "b") = []
87 | TxnWrite(2, "c") = []
88 | Version("a", 1) = 0x01
89 | Version("a", 2) = 0x02
90 | Version("b", 1) = 0x01
91 | Version("b", 2) = None
92 | Version("c", 2) = 0x02
93 | Version("d", 1) = 0x01
94 | Version("d", 3) = 0x03
95 | Version("e", 1) = 0x01
96 | Version("e", 3) = None
97 | Version("f", 3) = 0x03
98 | 


--------------------------------------------------------------------------------
/src/storage/golden/mvcc/resume:
--------------------------------------------------------------------------------
  1 | T1: begin → v1 read-write active={}
  2 |     set NextVersion = 2
  3 |     set TxnActive(1) = []
  4 | 
  5 | T1: set "a" = 0x01
  6 |     set TxnWrite(1, "a") = []
  7 |     set Version("a", 1) = 0x01
  8 | 
  9 | T1: set "b" = 0x01
 10 |     set TxnWrite(1, "b") = []
 11 |     set Version("b", 1) = 0x01
 12 | 
 13 | T1: commit
 14 |     del TxnWrite(1, "a")
 15 |     del TxnWrite(1, "b")
 16 |     del TxnActive(1)
 17 | 
 18 | T2: begin → v2 read-write active={}
 19 |     set NextVersion = 3
 20 |     set TxnActive(2) = []
 21 | 
 22 | T3: begin → v3 read-write active={2}
 23 |     set NextVersion = 4
 24 |     set TxnActiveSnapshot(3) = {2}
 25 |     set TxnActive(3) = []
 26 | 
 27 | T4: begin → v4 read-write active={2,3}
 28 |     set NextVersion = 5
 29 |     set TxnActiveSnapshot(4) = {2,3}
 30 |     set TxnActive(4) = []
 31 | 
 32 | T2: set "a" = 0x02
 33 |     set TxnWrite(2, "a") = []
 34 |     set Version("a", 2) = 0x02
 35 | 
 36 | T3: set "b" = 0x03
 37 |     set TxnWrite(3, "b") = []
 38 |     set Version("b", 3) = 0x03
 39 | 
 40 | T4: set "c" = 0x04
 41 |     set TxnWrite(4, "c") = []
 42 |     set Version("c", 4) = 0x04
 43 | 
 44 | T2: commit
 45 |     del TxnWrite(2, "a")
 46 |     del TxnActive(2)
 47 | 
 48 | T4: commit
 49 |     del TxnWrite(4, "c")
 50 |     del TxnActive(4)
 51 | 
 52 | T5: resume → v3 read-write active={2}
 53 | 
 54 | T5: scan ..
 55 |     "a" = 0x01
 56 |     "b" = 0x03
 57 | 
 58 | T6: begin → v5 read-write active={3}
 59 |     set NextVersion = 6
 60 |     set TxnActiveSnapshot(5) = {3}
 61 |     set TxnActive(5) = []
 62 | 
 63 | T6: scan ..
 64 |     "a" = 0x02
 65 |     "b" = 0x01
 66 |     "c" = 0x04
 67 | 
 68 | T6: rollback
 69 |     del TxnActive(5)
 70 | 
 71 | T5: commit
 72 |     del TxnWrite(3, "b")
 73 |     del TxnActive(3)
 74 | 
 75 | T7: begin → v6 read-write active={}
 76 |     set NextVersion = 7
 77 |     set TxnActive(6) = []
 78 | 
 79 | T7: scan ..
 80 |     "a" = 0x02
 81 |     "b" = 0x03
 82 |     "c" = 0x04
 83 | 
 84 | T7: rollback
 85 |     del TxnActive(6)
 86 | 
 87 | T8: resume → Error::Internal("No active transaction at version 3")
 88 | 
 89 | T9: begin as of 3 → v3 read-only active={2}
 90 | 
 91 | T9: scan ..
 92 |     "a" = 0x01
 93 |     "b" = 0x01
 94 | 
 95 | T10: resume → v3 read-only active={2}
 96 | 
 97 | T10: scan ..
 98 |     "a" = 0x01
 99 |     "b" = 0x01
100 | 
101 | Engine state:
102 | NextVersion = 7
103 | TxnActiveSnapshot(3) = {2}
104 | TxnActiveSnapshot(4) = {2,3}
105 | TxnActiveSnapshot(5) = {3}
106 | Version("a", 1) = 0x01
107 | Version("a", 2) = 0x02
108 | Version("b", 1) = 0x01
109 | Version("b", 3) = 0x03
110 | Version("c", 4) = 0x04
111 | 


--------------------------------------------------------------------------------
/src/storage/golden/mvcc/rollback:
--------------------------------------------------------------------------------
 1 | Engine state:
 2 | NextVersion = 2
 3 | Version("a", 1) = 0x00
 4 | Version("b", 1) = 0x00
 5 | Version("c", 1) = 0x00
 6 | Version("d", 1) = 0x00
 7 | 
 8 | T1: begin → v2 read-write active={}
 9 |     set NextVersion = 3
10 |     set TxnActive(2) = []
11 | 
12 | T2: begin → v3 read-write active={2}
13 |     set NextVersion = 4
14 |     set TxnActiveSnapshot(3) = {2}
15 |     set TxnActive(3) = []
16 | 
17 | T3: begin → v4 read-write active={2,3}
18 |     set NextVersion = 5
19 |     set TxnActiveSnapshot(4) = {2,3}
20 |     set TxnActive(4) = []
21 | 
22 | T1: set "a" = 0x01
23 |     set TxnWrite(2, "a") = []
24 |     set Version("a", 2) = 0x01
25 | 
26 | T2: set "b" = 0x02
27 |     set TxnWrite(3, "b") = []
28 |     set Version("b", 3) = 0x02
29 | 
30 | T2: del "c"
31 |     set TxnWrite(3, "c") = []
32 |     set Version("c", 3) = None
33 | 
34 | T3: set "d" = 0x03
35 |     set TxnWrite(4, "d") = []
36 |     set Version("d", 4) = 0x03
37 | 
38 | T1: set "b" = 0x01 → Error::Serialization
39 | 
40 | T3: set "c" = 0x03 → Error::Serialization
41 | 
42 | T2: rollback
43 |     del Version("b", 3)
44 |     del TxnWrite(3, "b")
45 |     del Version("c", 3)
46 |     del TxnWrite(3, "c")
47 |     del TxnActive(3)
48 | 
49 | T4: begin read-only → v5 read-only active={2,4}
50 | 
51 | T4: scan ..
52 |     "a" = 0x00
53 |     "b" = 0x00
54 |     "c" = 0x00
55 |     "d" = 0x00
56 | 
57 | T1: set "b" = 0x01
58 |     set TxnWrite(2, "b") = []
59 |     set Version("b", 2) = 0x01
60 | 
61 | T3: set "c" = 0x03
62 |     set TxnWrite(4, "c") = []
63 |     set Version("c", 4) = 0x03
64 | 
65 | T1: commit
66 |     del TxnWrite(2, "a")
67 |     del TxnWrite(2, "b")
68 |     del TxnActive(2)
69 | 
70 | T3: commit
71 |     del TxnWrite(4, "c")
72 |     del TxnWrite(4, "d")
73 |     del TxnActive(4)
74 | 
75 | T5: begin read-only → v5 read-only active={}
76 | 
77 | T5: scan ..
78 |     "a" = 0x01
79 |     "b" = 0x01
80 |     "c" = 0x03
81 |     "d" = 0x03
82 | 
83 | Engine state:
84 | NextVersion = 5
85 | TxnActiveSnapshot(3) = {2}
86 | TxnActiveSnapshot(4) = {2,3}
87 | Version("a", 1) = 0x00
88 | Version("a", 2) = 0x01
89 | Version("b", 1) = 0x00
90 | Version("b", 2) = 0x01
91 | Version("c", 1) = 0x00
92 | Version("c", 4) = 0x03
93 | Version("d", 1) = 0x00
94 | Version("d", 4) = 0x03
95 | 


--------------------------------------------------------------------------------
/src/storage/golden/mvcc/scan:
--------------------------------------------------------------------------------
  1 | Engine state:
  2 | NextVersion = 5
  3 | Version("B", 1) = 0x0001
  4 | Version("B", 3) = None
  5 | Version("a", 1) = 0x0a01
  6 | Version("a", 2) = None
  7 | Version("a", 3) = 0x0a03
  8 | Version("b", 1) = None
  9 | Version("b", 3) = 0x0b03
 10 | Version("b", 4) = None
 11 | Version("ba", 2) = 0xba02
 12 | Version("ba", 4) = 0xba04
 13 | Version("bb", 2) = 0xbb02
 14 | Version("bb", 3) = None
 15 | Version("bc", 2) = 0xbc02
 16 | Version("c", 1) = 0x0c01
 17 | 
 18 | T1: begin as of 1 → v1 read-only active={}
 19 | 
 20 | T1: scan ..
 21 | 
 22 | T2: begin as of 2 → v2 read-only active={}
 23 | 
 24 | T2: scan ..
 25 |     "B" = 0x0001
 26 |     "a" = 0x0a01
 27 |     "c" = 0x0c01
 28 | 
 29 | T3: begin as of 3 → v3 read-only active={}
 30 | 
 31 | T3: scan ..
 32 |     "B" = 0x0001
 33 |     "ba" = 0xba02
 34 |     "bb" = 0xbb02
 35 |     "bc" = 0xbc02
 36 |     "c" = 0x0c01
 37 | 
 38 | T4: begin as of 4 → v4 read-only active={}
 39 | 
 40 | T4: scan ..
 41 |     "a" = 0x0a03
 42 |     "b" = 0x0b03
 43 |     "ba" = 0xba02
 44 |     "bc" = 0xbc02
 45 |     "c" = 0x0c01
 46 | 
 47 | T5: begin as of 3 → v3 read-only active={}
 48 | 
 49 | T5: scan ..
 50 |     "B" = 0x0001
 51 |     "ba" = 0xba02
 52 |     "bb" = 0xbb02
 53 |     "bc" = 0xbc02
 54 |     "c" = 0x0c01
 55 | 
 56 | T5: scan .."bc"]
 57 |     "B" = 0x0001
 58 |     "ba" = 0xba02
 59 |     "bb" = 0xbb02
 60 |     "bc" = 0xbc02
 61 | 
 62 | T5: scan .."bc")
 63 |     "B" = 0x0001
 64 |     "ba" = 0xba02
 65 |     "bb" = 0xbb02
 66 | 
 67 | T5: scan ["ba"..
 68 |     "ba" = 0xba02
 69 |     "bb" = 0xbb02
 70 |     "bc" = 0xbc02
 71 |     "c" = 0x0c01
 72 | 
 73 | T5: scan ["ba".."bc"]
 74 |     "ba" = 0xba02
 75 |     "bb" = 0xbb02
 76 |     "bc" = 0xbc02
 77 | 
 78 | T5: scan ["ba".."bc")
 79 |     "ba" = 0xba02
 80 |     "bb" = 0xbb02
 81 | 
 82 | T5: scan ("ba"..
 83 |     "bb" = 0xbb02
 84 |     "bc" = 0xbc02
 85 |     "c" = 0x0c01
 86 | 
 87 | T5: scan ("ba".."bc"]
 88 |     "bb" = 0xbb02
 89 |     "bc" = 0xbc02
 90 | 
 91 | T5: scan ("ba".."bc")
 92 |     "bb" = 0xbb02
 93 | 
 94 | Engine state:
 95 | NextVersion = 5
 96 | Version("B", 1) = 0x0001
 97 | Version("B", 3) = None
 98 | Version("a", 1) = 0x0a01
 99 | Version("a", 2) = None
100 | Version("a", 3) = 0x0a03
101 | Version("b", 1) = None
102 | Version("b", 3) = 0x0b03
103 | Version("b", 4) = None
104 | Version("ba", 2) = 0xba02
105 | Version("ba", 4) = 0xba04
106 | Version("bb", 2) = 0xbb02
107 | Version("bb", 3) = None
108 | Version("bc", 2) = 0xbc02
109 | Version("c", 1) = 0x0c01
110 | 


--------------------------------------------------------------------------------
/src/storage/golden/mvcc/scan_isolation:
--------------------------------------------------------------------------------
 1 | T1: begin → v1 read-write active={}
 2 |     set NextVersion = 2
 3 |     set TxnActive(1) = []
 4 | 
 5 | T1: set "a" = 0x01
 6 |     set TxnWrite(1, "a") = []
 7 |     set Version("a", 1) = 0x01
 8 | 
 9 | T1: set "b" = 0x01
10 |     set TxnWrite(1, "b") = []
11 |     set Version("b", 1) = 0x01
12 | 
13 | T1: set "d" = 0x01
14 |     set TxnWrite(1, "d") = []
15 |     set Version("d", 1) = 0x01
16 | 
17 | T1: set "e" = 0x01
18 |     set TxnWrite(1, "e") = []
19 |     set Version("e", 1) = 0x01
20 | 
21 | T1: commit
22 |     del TxnWrite(1, "a")
23 |     del TxnWrite(1, "b")
24 |     del TxnWrite(1, "d")
25 |     del TxnWrite(1, "e")
26 |     del TxnActive(1)
27 | 
28 | T2: begin → v2 read-write active={}
29 |     set NextVersion = 3
30 |     set TxnActive(2) = []
31 | 
32 | T2: set "a" = 0x02
33 |     set TxnWrite(2, "a") = []
34 |     set Version("a", 2) = 0x02
35 | 
36 | T2: del "b"
37 |     set TxnWrite(2, "b") = []
38 |     set Version("b", 2) = None
39 | 
40 | T2: set "c" = 0x02
41 |     set TxnWrite(2, "c") = []
42 |     set Version("c", 2) = 0x02
43 | 
44 | T3: begin read-only → v3 read-only active={2}
45 | 
46 | T4: begin → v3 read-write active={2}
47 |     set NextVersion = 4
48 |     set TxnActiveSnapshot(3) = {2}
49 |     set TxnActive(3) = []
50 | 
51 | T4: set "d" = 0x03
52 |     set TxnWrite(3, "d") = []
53 |     set Version("d", 3) = 0x03
54 | 
55 | T4: del "e"
56 |     set TxnWrite(3, "e") = []
57 |     set Version("e", 3) = None
58 | 
59 | T4: set "f" = 0x03
60 |     set TxnWrite(3, "f") = []
61 |     set Version("f", 3) = 0x03
62 | 
63 | T4: commit
64 |     del TxnWrite(3, "d")
65 |     del TxnWrite(3, "e")
66 |     del TxnWrite(3, "f")
67 |     del TxnActive(3)
68 | 
69 | T3: scan ..
70 |     "a" = 0x01
71 |     "b" = 0x01
72 |     "d" = 0x01
73 |     "e" = 0x01
74 | 
75 | Engine state:
76 | NextVersion = 4
77 | TxnActive(2) = []
78 | TxnActiveSnapshot(3) = {2}
79 | TxnWrite(2, "a") = []
80 | TxnWrite(2, "b") = []
81 | TxnWrite(2, "c") = []
82 | Version("a", 1) = 0x01
83 | Version("a", 2) = 0x02
84 | Version("b", 1) = 0x01
85 | Version("b", 2) = None
86 | Version("c", 2) = 0x02
87 | Version("d", 1) = 0x01
88 | Version("d", 3) = 0x03
89 | Version("e", 1) = 0x01
90 | Version("e", 3) = None
91 | Version("f", 3) = 0x03
92 | 


--------------------------------------------------------------------------------
/src/storage/golden/mvcc/scan_key_version_encoding:
--------------------------------------------------------------------------------
 1 | T1: begin → v1 read-write active={}
 2 |     set NextVersion = 2
 3 |     set TxnActive(1) = []
 4 | 
 5 | T1: set 0x00 = 0x01
 6 |     set TxnWrite(1, 0x00) = []
 7 |     set Version(0x00, 1) = 0x01
 8 | 
 9 | T1: commit
10 |     del TxnWrite(1, 0x00)
11 |     del TxnActive(1)
12 | 
13 | T2: begin → v2 read-write active={}
14 |     set NextVersion = 3
15 |     set TxnActive(2) = []
16 | 
17 | T2: set 0x00 = 0x02
18 |     set TxnWrite(2, 0x00) = []
19 |     set Version(0x00, 2) = 0x02
20 | 
21 | T2: set 0x000000000000000002 = 0x02
22 |     set TxnWrite(2, 0x000000000000000002) = []
23 |     set Version(0x000000000000000002, 2) = 0x02
24 | 
25 | T2: commit
26 |     del TxnWrite(2, 0x00)
27 |     del TxnWrite(2, 0x000000000000000002)
28 |     del TxnActive(2)
29 | 
30 | T3: begin → v3 read-write active={}
31 |     set NextVersion = 4
32 |     set TxnActive(3) = []
33 | 
34 | T3: set 0x00 = 0x03
35 |     set TxnWrite(3, 0x00) = []
36 |     set Version(0x00, 3) = 0x03
37 | 
38 | T3: commit
39 |     del TxnWrite(3, 0x00)
40 |     del TxnActive(3)
41 | 
42 | T4: begin read-only → v4 read-only active={}
43 | 
44 | T4: scan ..
45 |     0x00 = 0x03
46 |     0x000000000000000002 = 0x02
47 | 
48 | Engine state:
49 | NextVersion = 4
50 | Version(0x00, 1) = 0x01
51 | Version(0x00, 2) = 0x02
52 | Version(0x00, 3) = 0x03
53 | Version(0x000000000000000002, 2) = 0x02
54 | 


--------------------------------------------------------------------------------
/src/storage/golden/mvcc/scan_prefix:
--------------------------------------------------------------------------------
  1 | Engine state:
  2 | NextVersion = 5
  3 | Version("B", 1) = 0x0001
  4 | Version("B", 3) = None
  5 | Version("a", 1) = 0x0a01
  6 | Version("a", 2) = None
  7 | Version("a", 3) = 0x0a03
  8 | Version("b", 1) = None
  9 | Version("b", 3) = 0x0b03
 10 | Version("b", 4) = None
 11 | Version("ba", 2) = 0xba02
 12 | Version("ba", 4) = 0xba04
 13 | Version("bb", 2) = 0xbb02
 14 | Version("bb", 3) = None
 15 | Version("bc", 2) = 0xbc02
 16 | Version("c", 1) = 0x0c01
 17 | 
 18 | T1: begin as of 1 → v1 read-only active={}
 19 | 
 20 | T1: scan prefix []
 21 | 
 22 | T2: begin as of 2 → v2 read-only active={}
 23 | 
 24 | T2: scan prefix []
 25 |     "B" = 0x0001
 26 |     "a" = 0x0a01
 27 |     "c" = 0x0c01
 28 | 
 29 | T3: begin as of 3 → v3 read-only active={}
 30 | 
 31 | T3: scan prefix []
 32 |     "B" = 0x0001
 33 |     "ba" = 0xba02
 34 |     "bb" = 0xbb02
 35 |     "bc" = 0xbc02
 36 |     "c" = 0x0c01
 37 | 
 38 | T4: begin as of 4 → v4 read-only active={}
 39 | 
 40 | T4: scan prefix []
 41 |     "a" = 0x0a03
 42 |     "b" = 0x0b03
 43 |     "ba" = 0xba02
 44 |     "bc" = 0xbc02
 45 |     "c" = 0x0c01
 46 | 
 47 | T5: begin as of 3 → v3 read-only active={}
 48 | 
 49 | T5: scan prefix "B"
 50 |     "B" = 0x0001
 51 | 
 52 | T5: scan prefix "a"
 53 | 
 54 | T5: scan prefix "b"
 55 |     "ba" = 0xba02
 56 |     "bb" = 0xbb02
 57 |     "bc" = 0xbc02
 58 | 
 59 | T5: scan prefix "ba"
 60 |     "ba" = 0xba02
 61 | 
 62 | T5: scan prefix "bb"
 63 |     "bb" = 0xbb02
 64 | 
 65 | T5: scan prefix "bbb"
 66 | 
 67 | T5: scan prefix "bc"
 68 |     "bc" = 0xbc02
 69 | 
 70 | T5: scan prefix "c"
 71 |     "c" = 0x0c01
 72 | 
 73 | T5: scan prefix "d"
 74 | 
 75 | T6: begin as of 4 → v4 read-only active={}
 76 | 
 77 | T6: scan prefix "B"
 78 | 
 79 | T6: scan prefix "a"
 80 |     "a" = 0x0a03
 81 | 
 82 | T6: scan prefix "b"
 83 |     "b" = 0x0b03
 84 |     "ba" = 0xba02
 85 |     "bc" = 0xbc02
 86 | 
 87 | T6: scan prefix "ba"
 88 |     "ba" = 0xba02
 89 | 
 90 | T6: scan prefix "bb"
 91 | 
 92 | T6: scan prefix "bbb"
 93 | 
 94 | T6: scan prefix "bc"
 95 |     "bc" = 0xbc02
 96 | 
 97 | T6: scan prefix "c"
 98 |     "c" = 0x0c01
 99 | 
100 | T6: scan prefix "d"
101 | 
102 | Engine state:
103 | NextVersion = 5
104 | Version("B", 1) = 0x0001
105 | Version("B", 3) = None
106 | Version("a", 1) = 0x0a01
107 | Version("a", 2) = None
108 | Version("a", 3) = 0x0a03
109 | Version("b", 1) = None
110 | Version("b", 3) = 0x0b03
111 | Version("b", 4) = None
112 | Version("ba", 2) = 0xba02
113 | Version("ba", 4) = 0xba04
114 | Version("bb", 2) = 0xbb02
115 | Version("bb", 3) = None
116 | Version("bc", 2) = 0xbc02
117 | Version("c", 1) = 0x0c01
118 | 


--------------------------------------------------------------------------------
/src/storage/golden/mvcc/set:
--------------------------------------------------------------------------------
 1 | Engine state:
 2 | NextVersion = 2
 3 | Version("key", 1) = 0x01
 4 | Version("tombstone", 1) = None
 5 | 
 6 | T1: begin → v2 read-write active={}
 7 |     set NextVersion = 3
 8 |     set TxnActive(2) = []
 9 | 
10 | T1: set "key" = 0x02
11 |     set TxnWrite(2, "key") = []
12 |     set Version("key", 2) = 0x02
13 | 
14 | T1: set "tombstone" = 0x02
15 |     set TxnWrite(2, "tombstone") = []
16 |     set Version("tombstone", 2) = 0x02
17 | 
18 | T1: set "new" = 0x01
19 |     set TxnWrite(2, "new") = []
20 |     set Version("new", 2) = 0x01
21 | 
22 | T1: set "new" = 0x01
23 |     set TxnWrite(2, "new") = []
24 |     set Version("new", 2) = 0x01
25 | 
26 | T1: set "new" = 0x02
27 |     set TxnWrite(2, "new") = []
28 |     set Version("new", 2) = 0x02
29 | 
30 | T1: commit
31 |     del TxnWrite(2, "key")
32 |     del TxnWrite(2, "new")
33 |     del TxnWrite(2, "tombstone")
34 |     del TxnActive(2)
35 | 
36 | Engine state:
37 | NextVersion = 3
38 | Version("key", 1) = 0x01
39 | Version("key", 2) = 0x02
40 | Version("new", 2) = 0x02
41 | Version("tombstone", 1) = None
42 | Version("tombstone", 2) = 0x02
43 | 


--------------------------------------------------------------------------------
/src/storage/golden/mvcc/set_conflict:
--------------------------------------------------------------------------------
 1 | T1: begin → v1 read-write active={}
 2 |     set NextVersion = 2
 3 |     set TxnActive(1) = []
 4 | 
 5 | T2: begin → v2 read-write active={1}
 6 |     set NextVersion = 3
 7 |     set TxnActiveSnapshot(2) = {1}
 8 |     set TxnActive(2) = []
 9 | 
10 | T3: begin → v3 read-write active={1,2}
11 |     set NextVersion = 4
12 |     set TxnActiveSnapshot(3) = {1,2}
13 |     set TxnActive(3) = []
14 | 
15 | T4: begin → v4 read-write active={1,2,3}
16 |     set NextVersion = 5
17 |     set TxnActiveSnapshot(4) = {1,2,3}
18 |     set TxnActive(4) = []
19 | 
20 | T1: set "a" = 0x01
21 |     set TxnWrite(1, "a") = []
22 |     set Version("a", 1) = 0x01
23 | 
24 | T3: set "c" = 0x03
25 |     set TxnWrite(3, "c") = []
26 |     set Version("c", 3) = 0x03
27 | 
28 | T4: set "d" = 0x04
29 |     set TxnWrite(4, "d") = []
30 |     set Version("d", 4) = 0x04
31 | 
32 | T4: commit
33 |     del TxnWrite(4, "d")
34 |     del TxnActive(4)
35 | 
36 | T2: set "a" = 0x02 → Error::Serialization
37 | 
38 | T2: set "c" = 0x02 → Error::Serialization
39 | 
40 | T2: set "d" = 0x02 → Error::Serialization
41 | 
42 | Engine state:
43 | NextVersion = 5
44 | TxnActive(1) = []
45 | TxnActive(2) = []
46 | TxnActive(3) = []
47 | TxnActiveSnapshot(2) = {1}
48 | TxnActiveSnapshot(3) = {1,2}
49 | TxnActiveSnapshot(4) = {1,2,3}
50 | TxnWrite(1, "a") = []
51 | TxnWrite(3, "c") = []
52 | Version("a", 1) = 0x01
53 | Version("c", 3) = 0x03
54 | Version("d", 4) = 0x04
55 | 


--------------------------------------------------------------------------------
/src/storage/golden/mvcc/unversioned:
--------------------------------------------------------------------------------
 1 | T_: set unversioned "a" = 0x00
 2 |     set Unversioned("a") = 0x00
 3 | 
 4 | T1: begin → v1 read-write active={}
 5 |     set NextVersion = 2
 6 |     set TxnActive(1) = []
 7 | 
 8 | T1: set "a" = 0x01
 9 |     set TxnWrite(1, "a") = []
10 |     set Version("a", 1) = 0x01
11 | 
12 | T1: set "b" = 0x01
13 |     set TxnWrite(1, "b") = []
14 |     set Version("b", 1) = 0x01
15 | 
16 | T1: set "c" = 0x01
17 |     set TxnWrite(1, "c") = []
18 |     set Version("c", 1) = 0x01
19 | 
20 | T1: commit
21 |     del TxnWrite(1, "a")
22 |     del TxnWrite(1, "b")
23 |     del TxnWrite(1, "c")
24 |     del TxnActive(1)
25 | 
26 | T_: set unversioned "b" = 0x00
27 |     set Unversioned("b") = 0x00
28 | 
29 | T_: set unversioned "d" = 0x00
30 |     set Unversioned("d") = 0x00
31 | 
32 | T2: begin read-only → v2 read-only active={}
33 | 
34 | T2: scan ..
35 |     "a" = 0x01
36 |     "b" = 0x01
37 |     "c" = 0x01
38 | 
39 | T_: get unversioned "a" → 0x00
40 | 
41 | T_: get unversioned "b" → 0x00
42 | 
43 | T_: get unversioned "c" → None
44 | 
45 | T_: get unversioned "d" → 0x00
46 | 
47 | T_: set unversioned "a" = 0x01
48 |     set Unversioned("a") = 0x01
49 | 
50 | T_: get unversioned "a" → 0x01
51 | 
52 | Engine state:
53 | NextVersion = 2
54 | Version("a", 1) = 0x01
55 | Version("b", 1) = 0x01
56 | Version("c", 1) = 0x01
57 | Unversioned("a") = 0x01
58 | Unversioned("b") = 0x00
59 | Unversioned("d") = 0x00
60 | 


--------------------------------------------------------------------------------
/src/storage/mod.rs:
--------------------------------------------------------------------------------
1 | pub mod bincode;
2 | pub mod debug;
3 | pub mod engine;
4 | pub mod keycode;
5 | pub mod mvcc;
6 | 


--------------------------------------------------------------------------------