├── blog ├── part6.md ├── part4.md ├── part5.md ├── part2.md ├── part3.md └── part1.md ├── README.md ├── .gitignore ├── src ├── engine │ ├── mod.rs │ ├── operator.rs │ └── plan.rs ├── sql │ ├── mod.rs │ ├── ast.rs │ ├── tokenizer.rs │ └── parser.rs ├── main.rs ├── value.rs ├── db.rs ├── page.rs ├── pager.rs └── cursor.rs ├── Cargo.toml └── Cargo.lock /blog/part6.md: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # rqlite 2 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .idea 2 | target 3 | -------------------------------------------------------------------------------- /src/engine/mod.rs: -------------------------------------------------------------------------------- 1 | mod operator; 2 | pub mod plan; 3 | -------------------------------------------------------------------------------- /src/sql/mod.rs: -------------------------------------------------------------------------------- 1 | pub mod ast; 2 | mod parser; 3 | mod tokenizer; 4 | 5 | pub use parser::{parse_create_statement, parse_statement}; 6 | -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "rsqlite" 3 | version = "0.1.0" 4 | edition = "2024" 5 | 6 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html 7 | 8 | [dependencies] 9 | anyhow = "1.0" 10 | -------------------------------------------------------------------------------- /Cargo.lock: -------------------------------------------------------------------------------- 1 | # This file is automatically @generated by 
Cargo. 2 | # It is not intended for manual editing. 3 | version = 3 4 | 5 | [[package]] 6 | name = "anyhow" 7 | version = "1.0.75" 8 | source = "registry+https://github.com/rust-lang/crates.io-index" 9 | checksum = "a4668cab20f66d8d020e1fbc0ebe47217433c1b6c8f2040faf858554e394ace6" 10 | 11 | [[package]] 12 | name = "rsqlite" 13 | version = "0.1.0" 14 | dependencies = [ 15 | "anyhow", 16 | ] 17 | -------------------------------------------------------------------------------- /src/engine/operator.rs: -------------------------------------------------------------------------------- 1 | use anyhow::Context; 2 | 3 | use crate::{cursor::Scanner, value::OwnedValue}; 4 | 5 | #[derive(Debug)] 6 | pub enum Operator { 7 | SeqScan(SeqScan), 8 | } 9 | 10 | impl Operator { 11 | pub fn next_row(&mut self) -> anyhow::Result> { 12 | match self { 13 | Operator::SeqScan(s) => s.next_row(), 14 | } 15 | } 16 | } 17 | 18 | #[derive(Debug)] 19 | pub struct SeqScan { 20 | fields: Vec, 21 | scanner: Scanner, 22 | row_buffer: Vec, 23 | } 24 | 25 | impl SeqScan { 26 | pub fn new(fields: Vec, scanner: Scanner) -> Self { 27 | let row_buffer = vec![OwnedValue::Null; fields.len()]; 28 | 29 | Self { 30 | fields, 31 | scanner, 32 | row_buffer, 33 | } 34 | } 35 | 36 | fn next_row(&mut self) -> anyhow::Result> { 37 | let Some(mut record) = self.scanner.next_record()? 
else { 38 | return Ok(None); 39 | }; 40 | 41 | for (i, &n) in self.fields.iter().enumerate() { 42 | self.row_buffer[i] = record.owned_field(n)?.context("missing record field")?; 43 | } 44 | 45 | Ok(Some(&self.row_buffer)) 46 | } 47 | } 48 | -------------------------------------------------------------------------------- /src/sql/ast.rs: -------------------------------------------------------------------------------- 1 | #[derive(Debug, Clone, Eq, PartialEq)] 2 | pub enum Statement { 3 | Select(SelectStatement), 4 | CreateTable(CreateTableStatement), 5 | } 6 | 7 | #[derive(Debug, Clone, Eq, PartialEq)] 8 | pub struct CreateTableStatement { 9 | pub name: String, 10 | pub columns: Vec, 11 | } 12 | 13 | #[derive(Debug, Clone, Eq, PartialEq)] 14 | pub struct ColumnDef { 15 | pub name: String, 16 | pub col_type: Type, 17 | } 18 | 19 | #[derive(Debug, Clone, Eq, PartialEq)] 20 | pub enum Type { 21 | Integer, 22 | Real, 23 | Text, 24 | Blob, 25 | } 26 | 27 | #[derive(Debug, Clone, Eq, PartialEq)] 28 | pub struct SelectStatement { 29 | pub core: SelectCore, 30 | } 31 | 32 | #[derive(Debug, Clone, Eq, PartialEq)] 33 | pub struct SelectCore { 34 | pub result_columns: Vec, 35 | pub from: SelectFrom, 36 | } 37 | 38 | #[derive(Debug, Clone, Eq, PartialEq)] 39 | pub enum ResultColumn { 40 | Star, 41 | Expr(ExprResultColumn), 42 | } 43 | 44 | #[derive(Debug, Clone, Eq, PartialEq)] 45 | pub struct ExprResultColumn { 46 | pub expr: Expr, 47 | pub alias: Option, 48 | } 49 | 50 | #[derive(Debug, Clone, Eq, PartialEq)] 51 | pub enum Expr { 52 | Column(Column), 53 | } 54 | 55 | #[derive(Debug, Clone, Eq, PartialEq)] 56 | pub struct Column { 57 | pub name: String, 58 | } 59 | 60 | #[derive(Debug, Clone, Eq, PartialEq)] 61 | pub enum SelectFrom { 62 | Table(String), 63 | } 64 | -------------------------------------------------------------------------------- /src/main.rs: -------------------------------------------------------------------------------- 1 | use std::io::{BufRead, Write, 
stdin}; 2 | 3 | use anyhow::Context; 4 | 5 | mod cursor; 6 | mod db; 7 | mod engine; 8 | mod page; 9 | mod pager; 10 | mod sql; 11 | mod value; 12 | 13 | fn main() -> anyhow::Result<()> { 14 | let database = db::Db::from_file(std::env::args().nth(1).context("missing db file")?)?; 15 | cli(database) 16 | } 17 | 18 | fn cli(mut db: db::Db) -> anyhow::Result<()> { 19 | print_flushed("rqlite> ")?; 20 | 21 | let mut line_buffer = String::new(); 22 | 23 | while stdin().lock().read_line(&mut line_buffer).is_ok_and(|n| n > 0) { // read_line returns Ok(0) at EOF: stop instead of spinning forever on an empty buffer 24 | match line_buffer.trim() { 25 | ".exit" => break, 26 | ".tables" => display_tables(&mut db)?, 27 | stmt => eval_query(&db, stmt)?, 28 | } 29 | 30 | print_flushed("\nrqlite> ")?; 31 | 32 | line_buffer.clear(); 33 | } 34 | 35 | Ok(()) 36 | } 37 | 38 | fn display_tables(db: &mut db::Db) -> anyhow::Result<()> { 39 | for table in &db.tables_metadata { 40 | print!("{} ", &table.name) 41 | } 42 | Ok(()) 43 | } 44 | 45 | fn print_flushed(s: &str) -> anyhow::Result<()> { 46 | print!("{s}"); 47 | std::io::stdout().flush().context("flush stdout") 48 | } 49 | 50 | fn eval_query(db: &db::Db, query: &str) -> anyhow::Result<()> { 51 | let parsed_query = sql::parse_statement(query, false)?; 52 | let mut op = engine::plan::Planner::new(db).compile(&parsed_query)?; 53 | 54 | while let Some(values) = op.next_row()? 
{ 55 | let formated = values 56 | .iter() 57 | .map(ToString::to_string) 58 | .collect::>() 59 | .join("|"); 60 | 61 | println!("{formated}"); 62 | } 63 | 64 | Ok(()) 65 | } 66 | -------------------------------------------------------------------------------- /src/value.rs: -------------------------------------------------------------------------------- 1 | use std::{borrow::Cow, rc::Rc}; 2 | 3 | #[derive(Debug, Clone)] 4 | pub enum Value<'p> { 5 | Null, 6 | String(Cow<'p, str>), 7 | Blob(Cow<'p, [u8]>), 8 | Int(i64), 9 | Float(f64), 10 | } 11 | 12 | impl Value<'_> { 13 | pub fn as_str(&self) -> Option<&str> { 14 | if let Value::String(s) = self { 15 | Some(s.as_ref()) 16 | } else { 17 | None 18 | } 19 | } 20 | 21 | pub fn as_int(&self) -> Option { 22 | if let Value::Int(i) = self { 23 | Some(*i) 24 | } else { 25 | None 26 | } 27 | } 28 | } 29 | 30 | #[derive(Debug, Clone)] 31 | pub enum OwnedValue { 32 | Null, 33 | String(Rc), 34 | Blob(Rc>), 35 | Int(i64), 36 | Float(f64), 37 | } 38 | 39 | impl<'p> From> for OwnedValue { 40 | fn from(value: Value<'p>) -> Self { 41 | match value { 42 | Value::Null => Self::Null, 43 | Value::Int(i) => Self::Int(i), 44 | Value::Float(f) => Self::Float(f), 45 | Value::Blob(b) => Self::Blob(Rc::new(b.into_owned())), 46 | Value::String(s) => Self::String(Rc::new(s.into_owned())), 47 | } 48 | } 49 | } 50 | 51 | impl std::fmt::Display for OwnedValue { 52 | fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { 53 | match self { 54 | OwnedValue::Null => write!(f, "null"), 55 | OwnedValue::String(s) => s.fmt(f), 56 | OwnedValue::Blob(items) => { 57 | write!( 58 | f, 59 | "{}", 60 | items 61 | .iter() 62 | .filter_map(|&n| char::from_u32(n as u32).filter(char::is_ascii)) 63 | .collect::() 64 | ) 65 | } 66 | OwnedValue::Int(i) => i.fmt(f), 67 | OwnedValue::Float(x) => x.fmt(f), 68 | } 69 | } 70 | } 71 | -------------------------------------------------------------------------------- /src/engine/plan.rs: 
-------------------------------------------------------------------------------- 1 | use anyhow::{bail, Context, Ok}; 2 | 3 | use crate::{ 4 | db::Db, 5 | sql::ast::{self, SelectFrom}, 6 | }; 7 | 8 | use super::operator::{Operator, SeqScan}; 9 | 10 | pub struct Planner<'d> { 11 | db: &'d Db, 12 | } 13 | 14 | impl<'d> Planner<'d> { 15 | pub fn new(db: &'d Db) -> Self { 16 | Self { db } 17 | } 18 | pub fn compile(self, statement: &ast::Statement) -> anyhow::Result { 19 | match statement { 20 | ast::Statement::Select(s) => self.compile_select(s), 21 | stmt => bail!("unsupported statement: {stmt:?}"), 22 | } 23 | } 24 | 25 | fn compile_select(self, select: &ast::SelectStatement) -> anyhow::Result { 26 | let SelectFrom::Table(table_name) = &select.core.from; 27 | 28 | let table = self 29 | .db 30 | .tables_metadata 31 | .iter() 32 | .find(|m| &m.name == table_name) 33 | .with_context(|| format!("invalid table name: {table_name}"))?; 34 | 35 | let mut columns = Vec::new(); 36 | 37 | for res_col in &select.core.result_columns { 38 | match res_col { 39 | ast::ResultColumn::Star => { 40 | for i in 0..table.columns.len() { 41 | columns.push(i); 42 | } 43 | } 44 | ast::ResultColumn::Expr(e) => { 45 | let ast::Expr::Column(col) = &e.expr; 46 | let (index, _) = table 47 | .columns 48 | .iter() 49 | .enumerate() 50 | .find(|(_, c)| c.name == col.name) 51 | .with_context(|| format!("invalid column name: {}", col.name))?; 52 | columns.push(index); 53 | } 54 | } 55 | } 56 | 57 | Ok(Operator::SeqScan(SeqScan::new( 58 | columns, 59 | self.db.scanner(table.first_page), 60 | ))) 61 | } 62 | } 63 | -------------------------------------------------------------------------------- /src/sql/tokenizer.rs: -------------------------------------------------------------------------------- 1 | use anyhow::bail; 2 | 3 | #[derive(Debug, Eq, PartialEq)] 4 | pub enum Token { 5 | Create, 6 | Table, 7 | Select, 8 | As, 9 | From, 10 | LPar, 11 | RPar, 12 | Star, 13 | Comma, 14 | SemiColon, 15 | 
Identifier(String), 16 | } 17 | 18 | impl Token { 19 | pub fn as_identifier(&self) -> Option<&str> { 20 | match self { 21 | Token::Identifier(ident) => Some(ident), 22 | _ => None, 23 | } 24 | } 25 | } 26 | 27 | pub fn tokenize(input: &str) -> anyhow::Result> { 28 | let mut tokens = Vec::new(); 29 | let mut chars = input.chars().peekable(); 30 | 31 | while let Some(c) = chars.next() { 32 | match c { 33 | '(' => tokens.push(Token::LPar), 34 | ')' => tokens.push(Token::RPar), 35 | '*' => tokens.push(Token::Star), 36 | ',' => tokens.push(Token::Comma), 37 | ';' => tokens.push(Token::SemiColon), 38 | c if c.is_whitespace() => continue, 39 | c if c.is_alphabetic() => { 40 | let mut ident = c.to_string().to_lowercase(); 41 | while let Some(cc) = chars.next_if(|&cc| cc.is_alphanumeric() || cc == '_') { 42 | ident.extend(cc.to_lowercase()); 43 | } 44 | 45 | match ident.as_str() { 46 | "create" => tokens.push(Token::Create), 47 | "table" => tokens.push(Token::Table), 48 | "select" => tokens.push(Token::Select), 49 | "as" => tokens.push(Token::As), 50 | "from" => tokens.push(Token::From), 51 | _ => tokens.push(Token::Identifier(ident)), 52 | } 53 | } 54 | _ => bail!("unexpected character: {}", c), 55 | } 56 | } 57 | 58 | Ok(tokens) 59 | } 60 | 61 | #[cfg(test)] 62 | mod tests { 63 | use super::*; 64 | 65 | #[test] 66 | fn tokenize_select() { 67 | let input = "SeLect *, col as c FroM TableName_1;"; 68 | let expected = vec![ 69 | Token::Select, 70 | Token::Star, 71 | Token::Comma, 72 | Token::Identifier("col".to_string()), 73 | Token::As, 74 | Token::Identifier("c".to_string()), 75 | Token::From, 76 | Token::Identifier("tablename_1".to_string()), 77 | Token::SemiColon, 78 | ]; 79 | assert_eq!(tokenize(input).unwrap(), expected); 80 | } 81 | 82 | #[test] 83 | fn tokenize_invalid_char() { 84 | let input = "select @ from table;"; 85 | assert!(tokenize(input).is_err()); 86 | } 87 | } 88 | -------------------------------------------------------------------------------- /src/db.rs: 
-------------------------------------------------------------------------------- 1 | use std::{io::Read, path::Path}; 2 | 3 | use anyhow::Context; 4 | 5 | use crate::{ 6 | cursor::{Cursor, Scanner}, 7 | pager::{self, Pager}, 8 | sql::{self, ast}, 9 | }; 10 | 11 | #[derive(Debug, Clone)] 12 | pub struct TableMetadata { 13 | pub name: String, 14 | pub columns: Vec, 15 | pub first_page: usize, 16 | } 17 | 18 | impl TableMetadata { 19 | fn from_cursor(mut cursor: Cursor) -> anyhow::Result> { 20 | let type_value = cursor 21 | .field(0)? 22 | .context("missing type field") 23 | .context("invalid type field")?; 24 | 25 | if type_value.as_str() != Some("table") { 26 | return Ok(None); 27 | } 28 | 29 | let create_stmt = cursor 30 | .field(4)? 31 | .context("missing create statement") 32 | .context("invalid create statement")? 33 | .as_str() 34 | .context("table create statement should be a string")? 35 | .to_owned(); 36 | 37 | let create = sql::parse_create_statement(&create_stmt)?; 38 | 39 | let first_page = cursor 40 | .field(3)? 41 | .context("missing table first page")? 42 | .as_int() 43 | .context("table first page should be an integer")? 
as usize; 44 | 45 | Ok(Some(TableMetadata { 46 | name: create.name, 47 | columns: create.columns, 48 | first_page, 49 | })) 50 | } 51 | } 52 | 53 | pub struct Db { 54 | pub tables_metadata: Vec, 55 | pager: Pager, 56 | } 57 | 58 | impl Db { 59 | pub fn from_file(filename: impl AsRef) -> anyhow::Result { 60 | let mut file = std::fs::File::open(filename.as_ref()).context("open db file")?; 61 | 62 | let mut header_buffer = [0; pager::HEADER_SIZE]; 63 | file.read_exact(&mut header_buffer) 64 | .context("read db header")?; 65 | 66 | let header = pager::parse_header(&header_buffer).context("parse db header")?; 67 | 68 | let pager = Pager::new(header, file); 69 | 70 | let tables_metadata = Self::collect_tables_metadata(pager.clone())?; 71 | 72 | Ok(Db { 73 | pager, 74 | tables_metadata, 75 | }) 76 | } 77 | 78 | pub fn scanner(&self, page: usize) -> Scanner { 79 | Scanner::new(page, self.pager.clone()) 80 | } 81 | 82 | fn collect_tables_metadata(pager: Pager) -> anyhow::Result> { 83 | let mut metadata = Vec::new(); 84 | let mut scanner = Scanner::new(1, pager); 85 | 86 | while let Some(record) = scanner.next_record()? { 87 | if let Some(m) = TableMetadata::from_cursor(record)? 
{ 88 | metadata.push(m); 89 | } 90 | } 91 | 92 | Ok(metadata) 93 | } 94 | } 95 | -------------------------------------------------------------------------------- /src/page.rs: -------------------------------------------------------------------------------- 1 | use anyhow::bail; 2 | 3 | #[derive(Debug, Copy, Clone)] 4 | pub struct DbHeader { 5 | pub page_size: u32, 6 | pub page_reserved_size: u8, 7 | } 8 | 9 | impl DbHeader { 10 | pub fn usable_page_size(&self) -> usize { 11 | self.page_size as usize - (self.page_reserved_size as usize) 12 | } 13 | } 14 | 15 | #[derive(Debug, Copy, Clone, Eq, PartialEq)] 16 | pub enum PageType { 17 | TableLeaf, 18 | TableInterior, 19 | } 20 | 21 | #[derive(Debug, Copy, Clone)] 22 | pub struct PageHeader { 23 | pub page_type: PageType, 24 | pub cell_count: u16, 25 | pub rightmost_pointer: Option, 26 | } 27 | 28 | impl PageHeader { 29 | pub fn byte_size(&self) -> usize { 30 | if self.rightmost_pointer.is_some() { 31 | 12 32 | } else { 33 | 8 34 | } 35 | } 36 | 37 | pub fn local_and_overflow_size( 38 | &self, 39 | db_header: &DbHeader, 40 | payload_size: usize, 41 | ) -> anyhow::Result<(usize, Option)> { 42 | let local = self.local_payload_size(db_header, payload_size)?; 43 | if local == payload_size { 44 | Ok((local, None)) 45 | } else { 46 | Ok((local, Some(payload_size.saturating_sub(local)))) 47 | } 48 | } 49 | 50 | fn local_payload_size( 51 | &self, 52 | db_header: &DbHeader, 53 | payload_size: usize, 54 | ) -> anyhow::Result { 55 | match self.page_type { 56 | PageType::TableInterior => bail!("no payload size for interior pages"), 57 | PageType::TableLeaf => { 58 | let usable = db_header.usable_page_size(); 59 | let max_size = usable - 35; 60 | if payload_size <= max_size { 61 | return Ok(payload_size); 62 | } 63 | let min_size = ((usable - 12) * 32 / 255) - 23; 64 | let k = min_size + ((payload_size - min_size) % (usable - 4)); 65 | let size = if k <= max_size { k } else { min_size }; 66 | Ok(size) 67 | } 68 | } 69 | } 70 | } 71 
| 72 | #[derive(Debug, Clone)] 73 | pub struct Page { 74 | pub header: PageHeader, 75 | pub cells: Vec, 76 | } 77 | 78 | impl Page { 79 | pub fn get(&self, n: usize) -> Option<&Cell> { 80 | self.cells.get(n) 81 | } 82 | } 83 | 84 | #[derive(Debug, Clone)] 85 | pub struct TableLeafCell { 86 | pub payload: Vec, 87 | pub first_overflow: Option, 88 | } 89 | 90 | #[derive(Debug, Clone)] 91 | pub struct TableInteriorCell { 92 | pub left_child_page: u32, 93 | } 94 | 95 | #[derive(Debug, Clone)] 96 | pub enum Cell { 97 | TableLeaf(TableLeafCell), 98 | TableInterior(TableInteriorCell), 99 | } 100 | 101 | impl From for Cell { 102 | fn from(cell: TableLeafCell) -> Self { 103 | Cell::TableLeaf(cell) 104 | } 105 | } 106 | 107 | impl From for Cell { 108 | fn from(cell: TableInteriorCell) -> Self { 109 | Cell::TableInterior(cell) 110 | } 111 | } 112 | 113 | #[derive(Debug, Clone)] 114 | pub struct OverflowPage { 115 | pub next: Option, 116 | pub payload: Vec, 117 | } 118 | -------------------------------------------------------------------------------- /src/sql/parser.rs: -------------------------------------------------------------------------------- 1 | use anyhow::{bail, Context}; 2 | 3 | use crate::sql::{ 4 | ast::{ 5 | Column, ColumnDef, CreateTableStatement, Expr, ExprResultColumn, ResultColumn, SelectCore, 6 | SelectFrom, SelectStatement, Statement, Type, 7 | }, 8 | tokenizer::{self, Token}, 9 | }; 10 | 11 | #[derive(Debug)] 12 | struct ParserState { 13 | tokens: Vec, 14 | pos: usize, 15 | } 16 | 17 | impl ParserState { 18 | fn new(tokens: Vec) -> Self { 19 | Self { tokens, pos: 0 } 20 | } 21 | 22 | fn parse_statement(&mut self) -> anyhow::Result { 23 | match self.peek_next_token().context("unexpected end of input")? 
{ 24 | Token::Select => self.parse_select().map(Statement::Select), 25 | Token::Create => self.parse_create_table().map(Statement::CreateTable), 26 | token => bail!("unexpected token: {token:?}"), 27 | } 28 | } 29 | 30 | fn parse_create_table(&mut self) -> anyhow::Result { 31 | self.expect_eq(Token::Create)?; 32 | self.expect_eq(Token::Table)?; 33 | let name = self.expect_identifier()?.to_string(); 34 | self.expect_eq(Token::LPar)?; 35 | let mut columns = vec![self.parse_column_def()?]; 36 | while self.next_token_is(Token::Comma) { 37 | self.advance(); 38 | columns.push(self.parse_column_def()?); 39 | } 40 | self.expect_eq(Token::RPar)?; 41 | Ok(CreateTableStatement { name, columns }) 42 | } 43 | 44 | fn parse_column_def(&mut self) -> anyhow::Result { 45 | Ok(ColumnDef { 46 | name: self.expect_identifier()?.to_string(), 47 | col_type: self.parse_type()?, 48 | }) 49 | } 50 | 51 | fn parse_type(&mut self) -> anyhow::Result { 52 | let type_name = self.expect_identifier()?; 53 | let t = match type_name.to_lowercase().as_str() { 54 | "integer" => Type::Integer, 55 | "real" => Type::Real, 56 | "blob" => Type::Blob, 57 | "text" | "string" => Type::Text, 58 | _ => bail!("unsupported type: {type_name}"), 59 | }; 60 | Ok(t) 61 | } 62 | 63 | fn parse_select(&mut self) -> anyhow::Result { 64 | self.expect_eq(Token::Select)?; 65 | let result_columns = self.parse_result_columns()?; 66 | self.expect_eq(Token::From)?; 67 | let from = self.parse_select_from()?; 68 | Ok(SelectStatement { 69 | core: SelectCore { 70 | result_columns, 71 | from, 72 | }, 73 | }) 74 | } 75 | 76 | fn parse_select_from(&mut self) -> anyhow::Result { 77 | let table = self.expect_identifier()?; 78 | Ok(SelectFrom::Table(table.to_string())) 79 | } 80 | 81 | fn parse_result_columns(&mut self) -> anyhow::Result> { 82 | let mut result_coluns = vec![self.parse_result_column()?]; 83 | while self.next_token_is(Token::Comma) { 84 | self.advance(); 85 | result_coluns.push(self.parse_result_column()?); 86 | } 87 | 
Ok(result_coluns) 88 | } 89 | 90 | fn parse_result_column(&mut self) -> anyhow::Result { 91 | if self.peek_next_token()? == &Token::Star { 92 | self.advance(); 93 | return Ok(ResultColumn::Star); 94 | } 95 | 96 | Ok(ResultColumn::Expr(self.parse_expr_result_column()?)) 97 | } 98 | 99 | fn parse_expr_result_column(&mut self) -> anyhow::Result { 100 | let expr = self.parse_expr()?; 101 | let alias = if self.next_token_is(Token::As) { 102 | self.advance(); 103 | Some(self.expect_identifier()?.to_string()) 104 | } else { 105 | None 106 | }; 107 | Ok(ExprResultColumn { expr, alias }) 108 | } 109 | 110 | fn parse_expr(&mut self) -> anyhow::Result { 111 | Ok(Expr::Column(Column { 112 | name: self.expect_identifier()?.to_string(), 113 | })) 114 | } 115 | 116 | fn next_token_is(&self, expected: Token) -> bool { 117 | self.tokens.get(self.pos) == Some(&expected) 118 | } 119 | 120 | fn expect_identifier(&mut self) -> anyhow::Result<&str> { 121 | self.expect_matching(|t| matches!(t, Token::Identifier(_))) 122 | .map(|t| t.as_identifier().unwrap()) 123 | } 124 | 125 | fn expect_eq(&mut self, expected: Token) -> anyhow::Result<&Token> { 126 | self.expect_matching(|t| *t == expected) 127 | } 128 | 129 | fn expect_matching(&mut self, f: impl Fn(&Token) -> bool) -> anyhow::Result<&Token> { 130 | match self.next_token() { 131 | Some(token) if f(token) => Ok(token), 132 | Some(token) => bail!("unexpected token: {:?}", token), 133 | None => bail!("unexpected end of input"), 134 | } 135 | } 136 | 137 | fn peek_next_token(&self) -> anyhow::Result<&Token> { 138 | self.tokens.get(self.pos).context("unexpected end of input") 139 | } 140 | 141 | fn next_token(&mut self) -> Option<&Token> { 142 | let token = self.tokens.get(self.pos); 143 | if token.is_some() { 144 | self.pos += 1; 145 | } 146 | token 147 | } 148 | 149 | fn advance(&mut self) { 150 | self.pos += 1; 151 | } 152 | } 153 | 154 | pub fn parse_statement(input: &str, trailing_semicolon: bool) -> anyhow::Result { 155 | let tokens = 
tokenizer::tokenize(input)?; 156 | let mut state = ParserState::new(tokens); 157 | let statement = state.parse_statement()?; 158 | if trailing_semicolon { 159 | state.expect_eq(Token::SemiColon)?; 160 | } 161 | Ok(statement) 162 | } 163 | 164 | pub fn parse_create_statement(input: &str) -> anyhow::Result { 165 | match parse_statement(input, false)? { 166 | Statement::CreateTable(c) => Ok(c), 167 | Statement::Select(_) => bail!("expected a create statement"), 168 | } 169 | } 170 | 171 | #[cfg(test)] 172 | mod tests { 173 | use super::*; 174 | 175 | #[test] 176 | fn create_table() { 177 | let input = "create table table1(key integer, value text)"; 178 | let statement = parse_statement(input, false).unwrap(); 179 | assert_eq!( 180 | statement, 181 | Statement::CreateTable(CreateTableStatement { 182 | name: "table1".to_string(), 183 | columns: vec![ 184 | ColumnDef { 185 | name: "key".to_string(), 186 | col_type: Type::Integer, 187 | }, 188 | ColumnDef { 189 | name: "value".to_string(), 190 | col_type: Type::Text, 191 | } 192 | ] 193 | }) 194 | ) 195 | } 196 | 197 | #[test] 198 | fn select_star_from_table() { 199 | let input = "select * from table1"; 200 | let statement = parse_statement(input, false).unwrap(); 201 | assert_eq!( 202 | statement, 203 | Statement::Select(SelectStatement { 204 | core: SelectCore { 205 | result_columns: vec![ResultColumn::Star], 206 | from: SelectFrom::Table("table1".to_string()), 207 | }, 208 | }) 209 | ); 210 | } 211 | 212 | #[test] 213 | fn select_columns_from_table() { 214 | let input = "select col1 as first, col2 from table1;"; 215 | let statement = parse_statement(input, true).unwrap(); 216 | assert_eq!( 217 | statement, 218 | Statement::Select(SelectStatement { 219 | core: SelectCore { 220 | result_columns: vec![ 221 | ResultColumn::Expr(ExprResultColumn { 222 | expr: Expr::Column(Column { 223 | name: "col1".to_string() 224 | }), 225 | alias: Some("first".to_string()) 226 | }), 227 | ResultColumn::Expr(ExprResultColumn { 228 | expr: 
Expr::Column(Column { 229 | name: "col2".to_string() 230 | }), 231 | alias: None 232 | }), 233 | ], 234 | from: SelectFrom::Table("table1".to_string()), 235 | }, 236 | }) 237 | ); 238 | } 239 | } 240 | -------------------------------------------------------------------------------- /src/pager.rs: -------------------------------------------------------------------------------- 1 | use std::{ 2 | collections::HashMap, 3 | io::{Read, Seek, SeekFrom}, 4 | sync::{Arc, Mutex, RwLock}, 5 | }; 6 | 7 | use anyhow::{Context, anyhow, bail}; 8 | 9 | use crate::page::{self, DbHeader, PageHeader}; 10 | 11 | pub const HEADER_SIZE: usize = 100; 12 | const HEADER_PREFIX: &[u8] = b"SQLite format 3\0"; 13 | const HEADER_PAGE_SIZE_OFFSET: usize = 16; 14 | const HEADER_PAGE_RESERVED_SIZE_OFFSET: usize = 20; 15 | 16 | const PAGE_MAX_SIZE: u32 = 65536; 17 | 18 | const PAGE_LEAF_TABLE_ID: u8 = 0x0d; 19 | const PAGE_INTERIOR_TABLE_ID: u8 = 0x05; 20 | 21 | const PAGE_CELL_COUNT_OFFSET: usize = 3; 22 | const PAGE_RIGHTMOST_POINTER_OFFSET: usize = 8; 23 | 24 | #[derive(Debug, Clone)] 25 | enum CachedPage { 26 | Page(Arc), 27 | Overflow(Arc), 28 | } 29 | 30 | impl From> for CachedPage { 31 | fn from(value: Arc) -> Self { 32 | CachedPage::Page(value) 33 | } 34 | } 35 | 36 | impl TryFrom for Arc { 37 | type Error = anyhow::Error; 38 | 39 | fn try_from(value: CachedPage) -> Result { 40 | if let CachedPage::Page(p) = value { 41 | Ok(p.clone()) 42 | } else { 43 | bail!("expected a regular page") 44 | } 45 | } 46 | } 47 | 48 | impl From> for CachedPage { 49 | fn from(value: Arc) -> Self { 50 | CachedPage::Overflow(value) 51 | } 52 | } 53 | 54 | impl TryFrom for Arc { 55 | type Error = anyhow::Error; 56 | 57 | fn try_from(value: CachedPage) -> Result { 58 | if let CachedPage::Overflow(o) = value { 59 | Ok(o.clone()) 60 | } else { 61 | bail!("expected an overflow page") 62 | } 63 | } 64 | } 65 | 66 | #[derive(Debug)] 67 | pub struct Pager { 68 | input: Arc>, 69 | pages: Arc>>, 70 | header: DbHeader, 
71 | } 72 | 73 | impl Pager { 74 | pub fn new(header: DbHeader, input: I) -> Self { 75 | Self { 76 | input: Arc::new(Mutex::new(input)), 77 | pages: Arc::default(), 78 | header, 79 | } 80 | } 81 | 82 | pub fn read_overflow(&self, n: usize) -> anyhow::Result> { 83 | self.load(n, |buffer| Ok(parse_overflow_page(buffer))) 84 | } 85 | 86 | pub fn read_page(&self, n: usize) -> anyhow::Result> { 87 | self.load(n, |buffer| parse_page(&self.header, buffer, n)) 88 | } 89 | 90 | fn load(&self, n: usize, f: impl Fn(&[u8]) -> anyhow::Result) -> anyhow::Result> 91 | where 92 | Arc: Into, 93 | CachedPage: TryInto, Error = anyhow::Error>, 94 | { 95 | { 96 | let read_pages = self 97 | .pages 98 | .read() 99 | .map_err(|_| anyhow!("poisoned page cache lock"))?; 100 | 101 | if let Some(page) = read_pages.get(&n).cloned() { 102 | return page.try_into(); 103 | } 104 | } 105 | 106 | let mut write_pages = self 107 | .pages 108 | .write() 109 | .map_err(|_| anyhow!("failed to acquire pager write lock"))?; 110 | 111 | if let Some(page) = write_pages.get(&n).cloned() { 112 | return page.try_into(); 113 | } 114 | 115 | let buffer = self.load_raw(n)?; 116 | let parsed = f(&buffer[0..self.header.usable_page_size()])?; 117 | let ptr = Arc::new(parsed); 118 | 119 | write_pages.insert(n, ptr.clone().into()); 120 | 121 | Ok(ptr) 122 | } 123 | 124 | fn load_raw(&self, n: usize) -> anyhow::Result> { 125 | let offset = n.saturating_sub(1) * self.header.page_size as usize; 126 | 127 | let mut input_guard = self 128 | .input 129 | .lock() 130 | .map_err(|_| anyhow!("poisoned pager mutex"))?; 131 | 132 | input_guard 133 | .seek(SeekFrom::Start(offset as u64)) 134 | .context("seek to page start")?; 135 | 136 | let mut buffer = vec![0; self.header.page_size as usize]; 137 | input_guard.read_exact(&mut buffer).context("read page")?; 138 | 139 | Ok(buffer) 140 | } 141 | } 142 | 143 | impl Clone for Pager { 144 | fn clone(&self) -> Self { 145 | Self { 146 | input: self.input.clone(), 147 | pages: 
self.pages.clone(), 148 | header: self.header, 149 | } 150 | } 151 | } 152 | 153 | fn parse_overflow_page(buffer: &[u8]) -> page::OverflowPage { 154 | let next = read_be_double_at(buffer, 0); 155 | page::OverflowPage { 156 | payload: buffer[4..].to_vec(), 157 | next: if next != 0 { Some(next as usize) } else { None }, 158 | } 159 | } 160 | 161 | pub fn parse_header(buffer: &[u8]) -> anyhow::Result { 162 | if !buffer.starts_with(HEADER_PREFIX) { 163 | let prefix = String::from_utf8_lossy(&buffer[..HEADER_PREFIX.len()]); 164 | anyhow::bail!("invalid header prefix: {prefix}"); 165 | } 166 | 167 | let page_size_raw = read_be_word_at(buffer, HEADER_PAGE_SIZE_OFFSET); 168 | let page_size = match page_size_raw { 169 | 1 => PAGE_MAX_SIZE, 170 | n if n.is_power_of_two() => n as u32, 171 | _ => anyhow::bail!("page size is not a power of 2: {}", page_size_raw), 172 | }; 173 | 174 | let page_reserved_size = buffer[HEADER_PAGE_RESERVED_SIZE_OFFSET]; 175 | 176 | Ok(page::DbHeader { 177 | page_size, 178 | page_reserved_size, 179 | }) 180 | } 181 | 182 | fn parse_page(db_header: &DbHeader, buffer: &[u8], page_num: usize) -> anyhow::Result { 183 | let ptr_offset = if page_num == 1 { HEADER_SIZE as u16 } else { 0 }; 184 | let content_buffer = &buffer[ptr_offset as usize..]; 185 | let header = parse_page_header(content_buffer)?; 186 | let cell_pointers = parse_cell_pointers( 187 | &content_buffer[header.byte_size()..], 188 | header.cell_count as usize, 189 | ptr_offset, 190 | ); 191 | 192 | let cells_parsing_fn = match header.page_type { 193 | page::PageType::TableLeaf => parse_table_leaf_cell, 194 | page::PageType::TableInterior => parse_table_interior_cell, 195 | }; 196 | 197 | let cells = parse_cells( 198 | db_header, 199 | &header, 200 | content_buffer, 201 | &cell_pointers, 202 | cells_parsing_fn, 203 | )?; 204 | 205 | Ok(page::Page { header, cells }) 206 | } 207 | 208 | fn parse_cells( 209 | db_header: &DbHeader, 210 | header: &PageHeader, 211 | buffer: &[u8], 212 | 
cell_pointers: &[u16], 213 | parse_fn: impl Fn(&DbHeader, &PageHeader, &[u8]) -> anyhow::Result, 214 | ) -> anyhow::Result> { 215 | cell_pointers 216 | .iter() 217 | .map(|&ptr| parse_fn(db_header, header, &buffer[ptr as usize..])) 218 | .collect() 219 | } 220 | 221 | fn parse_table_leaf_cell( 222 | db_header: &DbHeader, 223 | header: &PageHeader, 224 | mut buffer: &[u8], 225 | ) -> anyhow::Result { 226 | let (n, size) = read_varint_at(buffer, 0); 227 | buffer = &buffer[n as usize..]; 228 | 229 | let (n, _) = read_varint_at(buffer, 0); 230 | buffer = &buffer[n as usize..]; 231 | 232 | let (local_size, overflow_size) = header.local_and_overflow_size(db_header, size as usize)?; 233 | let first_overflow = overflow_size.map(|_| read_be_double_at(buffer, local_size) as usize); 234 | 235 | let payload = buffer[..local_size].to_vec(); 236 | 237 | Ok(page::TableLeafCell { 238 | payload, 239 | first_overflow, 240 | } 241 | .into()) 242 | } 243 | 244 | fn parse_table_interior_cell( 245 | _: &DbHeader, 246 | _: &PageHeader, 247 | buffer: &[u8], 248 | ) -> anyhow::Result { 249 | Ok(page::TableInteriorCell { 250 | left_child_page: read_be_double_at(buffer, 0), 251 | } 252 | .into()) 253 | } 254 | 255 | fn parse_page_header(buffer: &[u8]) -> anyhow::Result { 256 | let (page_type, rightmost_ptr) = match buffer[0] { 257 | PAGE_LEAF_TABLE_ID => (page::PageType::TableLeaf, false), 258 | PAGE_INTERIOR_TABLE_ID => (page::PageType::TableInterior, true), 259 | _ => anyhow::bail!("unknown page type: {}", buffer[0]), 260 | }; 261 | 262 | let cell_count = read_be_word_at(buffer, PAGE_CELL_COUNT_OFFSET); 263 | 264 | let rightmost_pointer = if rightmost_ptr { 265 | Some(read_be_double_at(buffer, PAGE_RIGHTMOST_POINTER_OFFSET)) 266 | } else { 267 | None 268 | }; 269 | 270 | Ok(page::PageHeader { 271 | page_type, 272 | cell_count, 273 | rightmost_pointer, 274 | }) 275 | } 276 | 277 | fn parse_cell_pointers(buffer: &[u8], n: usize, ptr_offset: u16) -> Vec { 278 | let mut pointers = 
Vec::with_capacity(n); 279 | for i in 0..n { 280 | pointers.push(read_be_word_at(buffer, 2 * i) - ptr_offset); 281 | } 282 | pointers 283 | } 284 | 285 | pub fn read_varint_at(buffer: &[u8], mut offset: usize) -> (u8, i64) { 286 | let mut size = 0; 287 | let mut result = 0; 288 | 289 | while size < 9 { 290 | let current_byte = buffer[offset] as i64; 291 | if size == 8 { 292 | result = (result << 8) | current_byte; 293 | } else { 294 | result = (result << 7) | (current_byte & 0b0111_1111); 295 | } 296 | 297 | offset += 1; 298 | size += 1; 299 | 300 | if current_byte & 0b1000_0000 == 0 { 301 | break; 302 | } 303 | } 304 | 305 | (size, result) 306 | } 307 | 308 | pub fn read_be_double_at(input: &[u8], offset: usize) -> u32 { 309 | u32::from_be_bytes(input[offset..offset + 4].try_into().unwrap()) 310 | } 311 | 312 | fn read_be_word_at(input: &[u8], offset: usize) -> u16 { 313 | u16::from_be_bytes(input[offset..offset + 2].try_into().unwrap()) 314 | } 315 | 316 | #[cfg(test)] 317 | mod test { 318 | use super::*; 319 | 320 | #[test] 321 | fn short_varint() { 322 | let buffer = [0b0000_0001]; 323 | assert_eq!(read_varint_at(&buffer, 0), (1, 1)); 324 | } 325 | 326 | #[test] 327 | fn middle_varint() { 328 | let buffer = [0b1000_0001, 0b0111_1111]; 329 | assert_eq!(read_varint_at(&buffer, 0), (2, 255)); 330 | } 331 | 332 | #[test] 333 | fn long_varint() { 334 | let buffer = [ 335 | 0b1000_0000, 336 | 0b1111_1111, 337 | 0b1000_0000, 338 | 0b1000_0000, 339 | 0b1000_0000, 340 | 0b1000_0000, 341 | 0b1000_0000, 342 | 0b1000_0000, 343 | 0b0110_1101, 344 | ]; 345 | assert_eq!( 346 | read_varint_at(&buffer, 0), 347 | ( 348 | 9, 349 | 0b00000001_11111100_00000000_00000000_00000000_00000000_00000000_01101101, 350 | ) 351 | ); 352 | } 353 | 354 | #[test] 355 | fn minus_one() { 356 | let buffer = [ 357 | 0b1111_1111, 358 | 0b1111_1111, 359 | 0b1111_1111, 360 | 0b1111_1111, 361 | 0b1111_1111, 362 | 0b1111_1111, 363 | 0b1111_1111, 364 | 0b1111_1111, 365 | 0b1111_1111, 366 | ]; 367 | 
assert_eq!(read_varint_at(&buffer, 0), (9, -1)); 368 | } 369 | } 370 | -------------------------------------------------------------------------------- /src/cursor.rs: -------------------------------------------------------------------------------- 1 | use std::{borrow::Cow, sync::Arc}; 2 | 3 | use anyhow::Context; 4 | 5 | use crate::{ 6 | page::{Cell, Page, PageType}, 7 | pager::Pager, 8 | value::{OwnedValue, Value}, 9 | }; 10 | 11 | #[derive(Debug, Copy, Clone)] 12 | pub enum RecordFieldType { 13 | Null, 14 | I8, 15 | I16, 16 | I24, 17 | I32, 18 | I48, 19 | I64, 20 | Float, 21 | Zero, 22 | One, 23 | String(usize), 24 | Blob(usize), 25 | } 26 | 27 | #[derive(Debug, Clone)] 28 | pub struct RecordField { 29 | pub offset: usize, 30 | pub field_type: RecordFieldType, 31 | } 32 | 33 | impl RecordField { 34 | pub fn end_offset(&self) -> usize { 35 | let size = match self.field_type { 36 | RecordFieldType::Null => 0, 37 | RecordFieldType::I8 => 1, 38 | RecordFieldType::I16 => 2, 39 | RecordFieldType::I24 => 3, 40 | RecordFieldType::I32 => 4, 41 | RecordFieldType::I48 => 5, 42 | RecordFieldType::I64 => 8, 43 | RecordFieldType::Float => 8, 44 | RecordFieldType::Zero => 0, 45 | RecordFieldType::One => 0, 46 | RecordFieldType::String(size) | RecordFieldType::Blob(size) => size, 47 | }; 48 | 49 | self.offset + size 50 | } 51 | } 52 | 53 | #[derive(Debug, Clone)] 54 | pub struct RecordHeader { 55 | pub fields: Vec, 56 | } 57 | 58 | fn parse_record_header(mut buffer: &[u8]) -> anyhow::Result { 59 | let (varint_size, header_length) = crate::pager::read_varint_at(buffer, 0); 60 | buffer = &buffer[varint_size as usize..header_length as usize]; 61 | 62 | let mut fields = Vec::new(); 63 | let mut current_offset = header_length as usize; 64 | 65 | while !buffer.is_empty() { 66 | let (discriminant_size, discriminant) = crate::pager::read_varint_at(buffer, 0); 67 | buffer = &buffer[discriminant_size as usize..]; 68 | 69 | let (field_type, field_size) = match discriminant { 70 | 0 => 
(RecordFieldType::Null, 0), 71 | 1 => (RecordFieldType::I8, 1), 72 | 2 => (RecordFieldType::I16, 2), 73 | 3 => (RecordFieldType::I24, 3), 74 | 4 => (RecordFieldType::I32, 4), 75 | 5 => (RecordFieldType::I48, 6), 76 | 6 => (RecordFieldType::I64, 8), 77 | 7 => (RecordFieldType::Float, 8), 78 | 8 => (RecordFieldType::Zero, 0), 79 | 9 => (RecordFieldType::One, 0), 80 | n if n >= 12 && n % 2 == 0 => { 81 | let size = ((n - 12) / 2) as usize; 82 | (RecordFieldType::Blob(size), size) 83 | } 84 | n if n >= 13 && n % 2 == 1 => { 85 | let size = ((n - 13) / 2) as usize; 86 | (RecordFieldType::String(size), size) 87 | } 88 | n => anyhow::bail!("unsupported field type: {}", n), 89 | }; 90 | 91 | fields.push(RecordField { 92 | offset: current_offset, 93 | field_type, 94 | }); 95 | 96 | current_offset += field_size; 97 | } 98 | 99 | Ok(RecordHeader { fields }) 100 | } 101 | 102 | #[derive(Debug)] 103 | pub struct Cursor { 104 | header: RecordHeader, 105 | payload: Vec, 106 | pager: Pager, 107 | next_overflow_page: Option, 108 | } 109 | 110 | impl Cursor { 111 | pub fn owned_field(&mut self, n: usize) -> anyhow::Result> { 112 | Ok(self.field(n)?.map(Into::into)) 113 | } 114 | 115 | pub fn field(&mut self, n: usize) -> anyhow::Result> { 116 | let Some(record_field) = self.header.fields.get(n) else { 117 | return Ok(None); 118 | }; 119 | 120 | let end_offset = record_field.end_offset(); 121 | 122 | if end_offset > (self.payload.len() - 1) 123 | && let Some(overflow_page) = self.next_overflow_page 124 | { 125 | let overflow_size = end_offset.saturating_sub(self.payload.len()); 126 | let (next_overflow, overflow_data) = OverflowScanner::new(self.pager.clone()) 127 | .read(overflow_page, overflow_size) 128 | .context("read overflow page")?; 129 | self.next_overflow_page = next_overflow; 130 | self.payload.extend_from_slice(&overflow_data); 131 | } 132 | 133 | let value = match record_field.field_type { 134 | RecordFieldType::Null => Some(Value::Null), 135 | RecordFieldType::I8 => 
Some(Value::Int(read_i8_at(&self.payload, record_field.offset))), 136 | RecordFieldType::I16 => { 137 | Some(Value::Int(read_i16_at(&self.payload, record_field.offset))) 138 | } 139 | RecordFieldType::I24 => { 140 | Some(Value::Int(read_i24_at(&self.payload, record_field.offset))) 141 | } 142 | RecordFieldType::I32 => { 143 | Some(Value::Int(read_i32_at(&self.payload, record_field.offset))) 144 | } 145 | RecordFieldType::I48 => { 146 | Some(Value::Int(read_i48_at(&self.payload, record_field.offset))) 147 | } 148 | RecordFieldType::I64 => { 149 | Some(Value::Int(read_i64_at(&self.payload, record_field.offset))) 150 | } 151 | RecordFieldType::Float => Some(Value::Float(read_f64_at( 152 | &self.payload, 153 | record_field.offset, 154 | ))), 155 | RecordFieldType::String(length) => { 156 | let value = std::str::from_utf8( 157 | &self.payload[record_field.offset..record_field.offset + length], 158 | ) 159 | .expect("invalid utf8"); 160 | Some(Value::String(Cow::Borrowed(value))) 161 | } 162 | RecordFieldType::Blob(length) => { 163 | let value = &self.payload[record_field.offset..record_field.offset + length]; 164 | Some(Value::Blob(Cow::Borrowed(value))) 165 | } 166 | RecordFieldType::One => Some(Value::Int(1)), 167 | RecordFieldType::Zero => Some(Value::Int(0)), 168 | }; 169 | 170 | Ok(value) 171 | } 172 | } 173 | 174 | // Reads a serial-type-1 value: a signed two's-complement 8-bit integer. 175 | fn read_i8_at(input: &[u8], offset: usize) -> i64 { 176 | input[offset] as i8 as i64 // cast through i8 so negative values sign-extend 177 | } 178 | 179 | fn read_i16_at(input: &[u8], offset: usize) -> i64 { 180 | i16::from_be_bytes(input[offset..offset + 2].try_into().unwrap()) as i64 181 | } 182 | 183 | // Reads a serial-type-3 value: a signed big-endian 24-bit integer. A 3-byte slice 184 | // cannot try_into a [u8; 4] (from_be_bytes would panic), so assemble the value 185 | // manually, then sign-extend bit 23 via the << 8 >> 8 arithmetic-shift pair. 186 | fn read_i24_at(input: &[u8], offset: usize) -> i64 { 187 | ((((input[offset] as i32) << 16) | ((input[offset + 1] as i32) << 8) | (input[offset + 2] as i32)) << 8 >> 8) as i64 188 | } 189 | 190 | fn read_i32_at(input: &[u8], offset: usize) -> i64 { 191 | i32::from_be_bytes(input[offset..offset + 4].try_into().unwrap()) as i64 192 | } 193 | 194 | fn read_i48_at(input: &[u8], offset: usize) -> i64 { 195 |
i64::from_be_bytes(input[offset..offset + 6].try_into().unwrap()) & 0x0000FFFFFFFFFFFF 192 | } 193 | 194 | fn read_i64_at(input: &[u8], offset: usize) -> i64 { 195 | i64::from_be_bytes(input[offset..offset + 8].try_into().unwrap()) 196 | } 197 | 198 | fn read_f64_at(input: &[u8], offset: usize) -> f64 { 199 | f64::from_be_bytes(input[offset..offset + 8].try_into().unwrap()) 200 | } 201 | 202 | #[derive(Debug)] 203 | pub struct PositionedPage { 204 | pub page: Arc, 205 | pub cell: usize, 206 | } 207 | 208 | impl PositionedPage { 209 | pub fn next_cell(&mut self) -> Option<&Cell> { 210 | let cell = self.page.get(self.cell); 211 | self.cell += 1; 212 | cell 213 | } 214 | 215 | pub fn next_page(&mut self) -> Option { 216 | if self.page.header.page_type == PageType::TableInterior 217 | && self.cell == self.page.cells.len() 218 | { 219 | self.cell += 1; 220 | self.page.header.rightmost_pointer 221 | } else { 222 | None 223 | } 224 | } 225 | } 226 | 227 | #[derive(Debug)] 228 | pub struct Scanner { 229 | initial_page: usize, 230 | page_stack: Vec, 231 | pager: Pager, 232 | } 233 | 234 | impl Scanner { 235 | pub fn new(page: usize, pager: Pager) -> Scanner { 236 | Scanner { 237 | initial_page: page, 238 | page_stack: Vec::new(), 239 | pager, 240 | } 241 | } 242 | 243 | pub fn next_record(&mut self) -> anyhow::Result> { 244 | loop { 245 | match self.next_elem() { 246 | Ok(Some(ScannerElem::Cursor(cursor))) => return Ok(Some(cursor)), 247 | Ok(Some(ScannerElem::Page(page_num))) => { 248 | let new_page = self.pager.read_page(page_num as usize)?.clone(); 249 | self.page_stack.push(PositionedPage { 250 | page: new_page, 251 | cell: 0, 252 | }); 253 | } 254 | Ok(None) if self.page_stack.len() > 1 => { 255 | self.page_stack.pop(); 256 | } 257 | Ok(None) => return Ok(None), 258 | Err(e) => return Err(e), 259 | } 260 | } 261 | } 262 | 263 | fn next_elem(&mut self) -> anyhow::Result> { 264 | let pager = self.pager.clone(); 265 | 266 | let Some(page) = self.current_page()? 
else { 267 | return Ok(None); 268 | }; 269 | 270 | if let Some(page) = page.next_page() { 271 | return Ok(Some(ScannerElem::Page(page))); 272 | } 273 | 274 | let Some(cell) = page.next_cell() else { 275 | return Ok(None); 276 | }; 277 | 278 | match cell { 279 | Cell::TableLeaf(cell) => { 280 | let header = parse_record_header(&cell.payload)?; 281 | Ok(Some(ScannerElem::Cursor(Cursor { 282 | header, 283 | payload: cell.payload.clone(), 284 | pager, 285 | next_overflow_page: cell.first_overflow, 286 | }))) 287 | } 288 | Cell::TableInterior(cell) => Ok(Some(ScannerElem::Page(cell.left_child_page))), 289 | } 290 | } 291 | 292 | fn current_page(&mut self) -> anyhow::Result> { 293 | if self.page_stack.is_empty() { 294 | let page = match self.pager.read_page(self.initial_page) { 295 | Ok(page) => page.clone(), 296 | Err(e) => return Err(e), 297 | }; 298 | 299 | self.page_stack.push(PositionedPage { page, cell: 0 }); 300 | } 301 | 302 | Ok(self.page_stack.last_mut()) 303 | } 304 | } 305 | 306 | #[derive(Debug)] 307 | enum ScannerElem { 308 | Page(u32), 309 | Cursor(Cursor), 310 | } 311 | 312 | #[derive(Debug)] 313 | struct OverflowScanner { 314 | pager: Pager, 315 | } 316 | 317 | impl OverflowScanner { 318 | pub fn new(pager: Pager) -> Self { 319 | Self { pager } 320 | } 321 | 322 | pub fn read(&self, first_page: usize, size: usize) -> anyhow::Result<(Option, Vec)> { 323 | let mut next_page = Some(first_page); 324 | let mut buffer = Vec::with_capacity(size); 325 | 326 | while buffer.len() < size 327 | && let Some(next) = next_page 328 | { 329 | let overflow = self.pager.read_overflow(next)?; 330 | next_page = overflow.next; 331 | buffer.extend_from_slice(&overflow.payload); 332 | } 333 | 334 | Ok((next_page, buffer)) 335 | } 336 | } 337 | -------------------------------------------------------------------------------- /blog/part4.md: -------------------------------------------------------------------------------- 1 | ### Build your own SQLite, Part 4: reading tables 
metadata 2 | 3 | As we saw in the [opening post](/build-your-own-sqlite-part-1-listing-tables), 4 | SQLite stores metadata about tables in a special "schema table" starting on page 1. 5 | We've been reading records from this table to list the tables in the current database, 6 | but before we can start evaluating SQL queries against user-defined tables, we need to 7 | extract more information from the schema table. 8 | 9 | For each table, we need to know: 10 | 11 | * the table name 12 | * the root page 13 | * the name and type of each column 14 | 15 | The first two are very easy to extract, as they are directly stored in fields 1 and 3 16 | of the schema table's records. But column names and types will be a bit trickier, as they are 17 | not neatly separated into record fields, but are stored in a single field in the 18 | form of a `CREATE TABLE` statement that we'll need to parse. 19 | 20 | The complete source code is available 21 | on [GitHub](https://github.com/geoffreycopin/rqlite/tree/4e098ca03b814448eb1a2650d64cda12227e9300). 22 | 23 | ## Parsing `CREATE TABLE` statements 24 | 25 | The first step in extending our SQL parser to support `CREATE TABLE` statements it to 26 | add the necessary token types to the tokenizer. We'll support `CREATE TABLE` statements 27 | of the following form: 28 | 29 | ```sql 30 | CREATE TABLE table_name 31 | ( 32 | column1_name column1_type, 33 | column2_name column2_type, . 34 | . 35 | . 36 | ) 37 | ``` 38 | 39 | The following tokens are new and need to be added to the `Token` enum: `CREATE`, `TABLE`, `(`, `)`. 40 | 41 | ```diff 42 | // sql/tokenizer.rs 43 | 44 | #[derive(Debug, Eq, PartialEq)] 45 | pub enum Token { 46 | + Create, 47 | + Table, 48 | Select, 49 | As, 50 | From, 51 | + LPar, 52 | + RPar, 53 | Star, 54 | Comma, 55 | SemiColon, 56 | Identifier(String), 57 | } 58 | 59 | //[...] 
60 | 61 | pub fn tokenize(input: &str) -> anyhow::Result> { 62 | let mut tokens = Vec::new(); 63 | let mut chars = input.chars().peekable(); 64 | 65 | while let Some(c) = chars.next() { 66 | match c { 67 | + '(' => tokens.push(Token::LPar), 68 | + ')' => tokens.push(Token::RPar), 69 | '*' => tokens.push(Token::Star), 70 | ',' => tokens.push(Token::Comma), 71 | ';' => tokens.push(Token::SemiColon), 72 | c if c.is_whitespace() => continue, 73 | c if c.is_alphabetic() => { 74 | let mut ident = c.to_string().to_lowercase(); 75 | while let Some(cc) = chars.next_if(|&cc| cc.is_alphanumeric() || cc == '_') { 76 | ident.extend(cc.to_lowercase()); 77 | } 78 | 79 | match ident.as_str() { 80 | + "create" => tokens.push(Token::Create), 81 | + "table" => tokens.push(Token::Table), 82 | "select" => tokens.push(Token::Select), 83 | "as" => tokens.push(Token::As), 84 | "from" => tokens.push(Token::From), 85 | _ => tokens.push(Token::Identifier(ident)), 86 | } 87 | } 88 | _ => bail!("unexpected character: {}", c), 89 | } 90 | } 91 | 92 | Ok(tokens) 93 | } 94 | ``` 95 | 96 | Next, we need to extend our AST to represent the new statement type. 97 | Our representation will be based on the [SQLite documentation](https://www.sqlite.org/lang_createtable.html). 98 | 99 | ```diff 100 | // sql/ast.rs 101 | 102 | //[...] 103 | 104 | #[derive(Debug, Clone, Eq, PartialEq)] 105 | pub enum Statement { 106 | Select(SelectStatement), 107 | + CreateTable(CreateTableStatement), 108 | } 109 | + 110 | +#[derive(Debug, Clone, Eq, PartialEq)] 111 | +pub struct CreateTableStatement { 112 | + pub name: String, 113 | + pub columns: Vec, 114 | +} 115 | + 116 | +#[derive(Debug, Clone, Eq, PartialEq)] 117 | +pub struct ColumnDef { 118 | + pub name: String, 119 | + pub col_type: Type, 120 | +} 121 | + 122 | +#[derive(Debug, Clone, Eq, PartialEq)] 123 | +pub enum Type { 124 | + Integer, 125 | + Real, 126 | + Text, 127 | + Blob, 128 | +} 129 | 130 | //[...] 
131 | ``` 132 | 133 | Parsing types is straightforward: we can simply match the incoming identifier 134 | token with a predefined set of types. For now, we'll restrict ourselves to 135 | `INTEGER`, `REAL`, `TEXT`, `STRING`, and `BLOB`. 136 | Once our `parse_type` method is implemented, constructing `ColumnDef` nodes 137 | is trivial. 138 | 139 | ```rust 140 | // sql/parser.rs 141 | 142 | //[...] 143 | impl ParserState { 144 | // [...] 145 | fn parse_column_def(&mut self) -> anyhow::Result { 146 | Ok(ColumnDef { 147 | name: self.expect_identifier()?.to_string(), 148 | col_type: self.parse_type()?, 149 | }) 150 | } 151 | 152 | fn parse_type(&mut self) -> anyhow::Result { 153 | let type_name = self.expect_identifier()?; 154 | let t = match type_name.to_lowercase().as_str() { 155 | "integer" => Type::Integer, 156 | "real" => Type::Real, 157 | "blob" => Type::Blob, 158 | "text" | "string" => Type::Text, 159 | _ => bail!("unsupported type: {type_name}"), 160 | }; 161 | Ok(t) 162 | } 163 | // [...] 164 | } 165 | 166 | //[...] 167 | ``` 168 | 169 | In our implementation if the `parse_create_table` method, we'll parse column definitions 170 | using the same pattern as in the `parse_result_colums` method: 171 | 172 | ```rust 173 | // sql/parser.rs 174 | 175 | //[...] 176 | impl ParserState { 177 | // [...] 178 | fn parse_create_table(&mut self) -> anyhow::Result { 179 | self.expect_eq(Token::Create)?; 180 | self.expect_eq(Token::Table)?; 181 | let name = self.expect_identifier()?.to_string(); 182 | self.expect_eq(Token::LPar)?; 183 | let mut columns = vec![self.parse_column_def()?]; 184 | while self.next_token_is(Token::Comma) { 185 | self.advance(); 186 | columns.push(self.parse_column_def()?); 187 | } 188 | self.expect_eq(Token::RPar)?; 189 | Ok(CreateTableStatement { name, columns }) 190 | } 191 | // [...] 192 | } 193 | //[...] 194 | ``` 195 | 196 | Finally, we need to update the `parse_statement` method to handle the new statement type. 
197 | We'll also update the `parse_statement` utility function to make the semicolon terminator 198 | optional, as the `CREATE TABLE` statements stored in the schema table lack a trailing semicolon. 199 | 200 | ```diff 201 | // sql/parser.rs 202 | 203 | //[...] 204 | 205 | impl ParserState { 206 | // [...] 207 | 208 | fn parse_statement(&mut self) -> anyhow::Result { 209 | - Ok(ast::Statement::Select(self.parse_select()?)) 210 | + match self.peak_next_token().context("unexpected end of input")? { 211 | + Token::Select => self.parse_select().map(Statement::Select), 212 | + Token::Create => self.parse_create_table().map(Statement::CreateTable), 213 | + token => bail!("unexpected token: {token:?}"), 214 | + } 215 | } 216 | 217 | // [...] 218 | } 219 | 220 | // [...] 221 | 222 | -pub fn parse_statement(input: &str) -> anyhow::Result { 223 | +pub fn parse_statement(input: &str, trailing_semicolon: bool) -> anyhow::Result { 224 | let tokens = tokenizer::tokenize(input)?; 225 | let mut state = ParserState::new(tokens); 226 | let statement = state.parse_statement()?; 227 | + if trailing_semicolon { 228 | state.expect_eq(Token::SemiColon)?; 229 | + } 230 | Ok(statement) 231 | } 232 | 233 | +pub fn parse_create_statement( 234 | + input: &str, 235 | +) -> anyhow::Result { 236 | + match parse_statement(input, false)? { 237 | + Statement::CreateTable(c) => Ok(c), 238 | + Statement::Select(_) => bail!("expected a create statement"), 239 | + } 240 | +} 241 | ``` 242 | 243 | ## Reading metadata 244 | 245 | Now that we have the necessary building blocks to read table metadata, 246 | we can extend our `Database` struct to store this information. 247 | The `TableMetadata::from_cursor` method builds a `TableMetadata` struct 248 | from a `Cursor` object, which represents a record in the schema table. 249 | The create statement and first page are extracted from fields 4 and 3, respectively. 
250 | 251 | As records from the schema table contain informations about other kinds 252 | of objects, such as triggers, we check the `type` field at index 0 to ensure 253 | we're dealing with a table. 254 | 255 | Finally, in `Db::collect_metadata`, we iterate over all the records in the schema table, 256 | collecting table metadata for each table record we encounter. 257 | 258 | ```diff 259 | // db.rs 260 | 261 | +#[derive(Debug, Clone)] 262 | +pub struct TableMetadata { 263 | + pub name: String, 264 | + pub columns: Vec, 265 | + pub first_page: usize, 266 | +} 267 | 268 | +impl TableMetadata { 269 | + fn from_cursor(cursor: Cursor) -> anyhow::Result> { 270 | + let type_value = cursor 271 | + .field(0) 272 | + .context("missing type field") 273 | + .context("invalid type field")?; 274 | 275 | + if type_value.as_str() != Some("table") { 276 | + return Ok(None); 277 | + } 278 | 279 | + let create_stmt = cursor 280 | + .field(4) 281 | + .context("missing create statement") 282 | + .context("invalid create statement")? 283 | + .as_str() 284 | + .context("table create statement should be a string")? 285 | + .to_owned(); 286 | 287 | + let create = sql::parse_create_statement(&create_stmt)?; 288 | 289 | + let first_page = cursor 290 | + .field(3) 291 | + .context("missing table first page")? 292 | + .as_int() 293 | + .context("table first page should be an integer")? 
as usize; 294 | 295 | + Ok(Some(TableMetadata { 296 | + name: create.name, 297 | + columns: create.columns, 298 | + first_page, 299 | + })) 300 | + } 301 | +} 302 | 303 | pub struct Db { 304 | pub header: DbHeader, 305 | + pub tables_metadata: Vec, 306 | pager: Pager, 307 | } 308 | 309 | impl Db { 310 | pub fn from_file(filename: impl AsRef) -> anyhow::Result { 311 | let mut file = std::fs::File::open(filename.as_ref()).context("open db file")?; 312 | 313 | let mut header_buffer = [0; pager::HEADER_SIZE]; 314 | file.read_exact(&mut header_buffer) 315 | .context("read db header")?; 316 | 317 | let header = pager::parse_header(&header_buffer).context("parse db header")?; 318 | 319 | + let tables_metadata = Self::collect_tables_metadata(&mut Pager::new( 320 | + file.try_clone()?, 321 | + header.page_size as usize, 322 | + ))?; 323 | 324 | let pager = Pager::new(file, header.page_size as usize); 325 | 326 | Ok(Db { 327 | header, 328 | pager, 329 | + tables_metadata, 330 | }) 331 | } 332 | 333 | + fn collect_tables_metadata(pager: &mut Pager) -> anyhow::Result> { 334 | + let mut metadata = Vec::new(); 335 | + let mut scanner = Scanner::new(pager, 1); 336 | 337 | + while let Some(record) = scanner.next_record()? { 338 | + if let Some(m) = TableMetadata::from_cursor(record)? { 339 | + metadata.push(m); 340 | + } 341 | + } 342 | 343 | + Ok(metadata) 344 | + } 345 | 346 | // [...] 347 | } 348 | ``` 349 | 350 | Our initial implementation of the `.table` command can be updated to use the new metadata: 351 | 352 | ```diff 353 | // main.rs 354 | 355 | fn display_tables(db: &mut db::Db) -> anyhow::Result<()> { 356 | - let mut scanner = db.scanner(1); 357 | - 358 | - while let Some(mut record) = scanner.next_record()? 
{ 359 | - let type_value = record 360 | - .field(0) 361 | - .context("missing type field") 362 | - .context("invalid type field")?; 363 | 364 | - if type_value.as_str() == Some("table") { 365 | - let name_value = record 366 | - .field(1) 367 | - .context("missing name field") 368 | - .context("invalid name field")?; 369 | 370 | - print!("{} ", name_value.as_str().unwrap()); 371 | - } 372 | - } 373 | + for table in &db.tables_metadata { 374 | + print!("{} ", &table.name) 375 | + } 376 | 377 | Ok(()) 378 | } 379 | ``` 380 | 381 | ## Conclusion 382 | 383 | We've extended our SQL parser to support `CREATE TABLE` statements and used it to 384 | extract metadata from the schema table. By parsing the schema, we now have a 385 | way to understand the structure of tables in our database. 386 | 387 | In the next post, we'll leverage this metadata to build a query evaluator 388 | that can execute simple `SELECT` queries against user-defined tables, 389 | bringing us one step closer to a fully functional database engine. 390 | -------------------------------------------------------------------------------- /blog/part5.md: -------------------------------------------------------------------------------- 1 | ### Build your own SQLite, Part 5: Evaluating queries 2 | 3 | In the previous posts, we've explored the 4 | [SQLite file format](/build-your-own-sqlite-part-1-listing-tables) and built a 5 | simple [SQL parser](/build-your-own-sqlite-part-3-sql-parsing-101). It's time 6 | to put these pieces together and implement a query evaluator! 7 | In this post, we'll lay the groundwork for evaluating SQL queries and build a 8 | query evaluator that can handle basic SELECT statements. While our initial implementation 9 | won't support filtering, sorting, grouping, or joins yet, it will give us the 10 | foundation to add these features in future posts. 
11 | 12 | As usual, the complete source code for this post is available 13 | on [GitHub](https://github.com/geoffreycopin/rqlite/commit/c7dfeeea6956e209ccbd50a727c2b9352c246082). 14 | 15 | ## Setting up our test database 16 | 17 | Before we can evaluate queries, we need a database to query. We'll start by 18 | creating a simple database with a single table, `table1`, with two columns, 19 | `id` and `value`: 20 | 21 | ```bash 22 | sqlite3 queries_test.db 23 | sqlite> create table table1(id integer, value text); 24 | sqlite> insert into table1(id, value) values 25 | ...> (1, '11'), 26 | ...> (2, '12'), 27 | ...> (3, '13'); 28 | sqlite> .exit 29 | ``` 30 | 31 | ⚠️ You might be tempted to use an existing SQLite database to test your queries, 32 | but keep in mind that our implementation does not support overflow pages yet, 33 | so it might not be able to read the data from your database file. 34 | 35 | ## Making the pager shareable 36 | 37 | --- 38 | This section is specific to the Rust implementation. If you're following along 39 | with another language, you can safely skip it! 40 | 41 | --- 42 | 43 | Currently, our pager can only be used through an exclusive mutable reference. 44 | This was fine for our initial use cases, but as we start building more complex 45 | features, maintaining this restriction will constrain our design. 46 | We'll make the pager shareable by wrapping its inner mutable fields in an 47 | `Arc>` and `Arc>`. This will allow us to effectively clone the pager and 48 | use it from multiple places without running into borrow checker issues. 49 | At this stage of the project we could have chosen to use a simple `Rc>`, 50 | but we'll eventually need to support concurrent access to the pager, so we'll 51 | use thread-safe counterparts from the start. 
52 | 53 | ```diff 54 | // src/pager.rs 55 | 56 | - #[derive(Debug, Clone)] 57 | + #[derive(Debug)] 58 | pub struct Pager { 59 | - input: I, 60 | + input: Arc> 61 | page_size: usize, 62 | - pages: HashMap, 63 | + pages: Arc>>>, 64 | } 65 | ``` 66 | 67 | The `read_page` and `load_page` methods need to be updated accordingly: 68 | 69 | ```rust 70 | impl Pager { 71 | // [...] 72 | pub fn read_page(&self, n: usize) -> anyhow::Result> { 73 | { 74 | let read_pages = self 75 | .pages 76 | .read() 77 | .map_err(|_| anyhow!("failed to acquire pager read lock"))?; 78 | 79 | if let Some(page) = read_pages.get(&n) { 80 | return Ok(page.clone()); 81 | } 82 | } 83 | 84 | let mut write_pages = self 85 | .pages 86 | .write() 87 | .map_err(|_| anyhow!("failed to acquire pager write lock"))?; 88 | 89 | if let Some(page) = write_pages.get(&n) { 90 | return Ok(page.clone()); 91 | } 92 | 93 | let page = self.load_page(n)?; 94 | write_pages.insert(n, page.clone()); 95 | Ok(page) 96 | } 97 | 98 | fn load_page(&self, n: usize) -> anyhow::Result> { 99 | let offset = n.saturating_sub(1) * self.page_size; 100 | 101 | let mut input_guard = self 102 | .input 103 | .lock() 104 | .map_err(|_| anyhow!("failed to lock pager mutex"))?; 105 | 106 | input_guard 107 | .seek(SeekFrom::Start(offset as u64)) 108 | .context("seek to page start")?; 109 | 110 | let mut buffer = vec![0; self.page_size]; 111 | input_guard.read_exact(&mut buffer).context("read page")?; 112 | 113 | Ok(Arc::new(parse_page(&buffer, n)?)) 114 | } 115 | } 116 | ``` 117 | 118 | Two things to note regarding the `read_page` method: 119 | 120 | - the initial attempt to read the page from the cache is nested in a block to 121 | limit the scope of the read lock, ensuring that it is released before we try 122 | to acquire the write lock 123 | - after acquiring the write lock, we check again if the page is already in the 124 | cache, in case it was inserted in between the two lock acquisitions 125 | 126 | Similarly, we'll define an owned 
version of our `Value` enum that we'll use 127 | in the query evaluator: 128 | 129 | ```rust 130 | // src/value.rs 131 | 132 | // [...] 133 | 134 | #[derive(Debug, Clone)] 135 | pub enum OwnedValue { 136 | Null, 137 | String(Rc), 138 | Blob(Rc>), 139 | Int(i64), 140 | Float(f64), 141 | } 142 | 143 | impl<'p> From> for OwnedValue { 144 | fn from(value: Value<'p>) -> Self { 145 | match value { 146 | Value::Null => Self::Null, 147 | Value::Int(i) => Self::Int(i), 148 | Value::Float(f) => Self::Float(f), 149 | Value::Blob(b) => Self::Blob(Rc::new(b.into_owned())), 150 | Value::String(s) => Self::String(Rc::new(s.into_owned())), 151 | } 152 | } 153 | } 154 | 155 | impl std::fmt::Display for OwnedValue { 156 | fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { 157 | match self { 158 | OwnedValue::Null => write!(f, "null"), 159 | OwnedValue::String(s) => s.fmt(f), 160 | OwnedValue::Blob(items) => { 161 | write!( 162 | f, 163 | "{}", 164 | items 165 | .iter() 166 | .filter_map(|&n| char::from_u32(n as u32).filter(char::is_ascii)) 167 | .collect::() 168 | ) 169 | } 170 | OwnedValue::Int(i) => i.fmt(f), 171 | OwnedValue::Float(x) => x.fmt(f), 172 | } 173 | } 174 | } 175 | ``` 176 | 177 | Finally, we'll enrich our `Cursor` struct with a method that returns the value 178 | of a field as an `OwnedValue`: 179 | 180 | ```rust 181 | // src/cursor.rs 182 | 183 | impl Cursor { 184 | // [...] 185 | pub fn owned_field(&self, n: usize) -> Option { 186 | self.field(n).map(Into::into) 187 | } 188 | // [...] 189 | } 190 | ``` 191 | 192 | ## Evaluating `SELECT` statements 193 | 194 | Our query engine will be composed of two main components: 195 | 196 | - an iterator-like `Operator` enum that represents nestable operations on the 197 | database, such as scanning a table or filtering rows. Our initial implementation 198 | will only contain a `SeqScan` operator that yields all rows from a table. 
199 | - a `Planner` struct that takes a parsed SQL query and produces an `Operator` that 200 | can be evaluated to produce the query result. 201 | 202 | Let's start by defining the `Operator` enum: 203 | 204 | ```rust 205 | // src/engine/operator.rs 206 | use anyhow::Context; 207 | 208 | use crate::{cursor::Scanner, value::OwnedValue}; 209 | 210 | #[derive(Debug)] 211 | pub enum Operator { 212 | SeqScan(SeqScan), 213 | } 214 | 215 | impl Operator { 216 | pub fn next_row(&mut self) -> anyhow::Result> { 217 | match self { 218 | Operator::SeqScan(s) => s.next_row(), 219 | } 220 | } 221 | } 222 | ``` 223 | 224 | The result of evaluating a query will be obtained by repeatedly calling the 225 | `next_row` method on the `Operator` until it returns `None`. Each value 226 | in the returned slice corresponds to a column in the query result. 227 | 228 | The `SeqScan` struct will be responsible for scanning a table and yielding 229 | its rows: 230 | 231 | ```rust 232 | // src/engine/operator.rs 233 | 234 | // [...] 235 | 236 | #[derive(Debug)] 237 | pub struct SeqScan { 238 | fields: Vec, 239 | scanner: Scanner, 240 | row_buffer: Vec, 241 | } 242 | 243 | impl SeqScan { 244 | pub fn new(fields: Vec, scanner: Scanner) -> Self { 245 | let row_buffer = vec![OwnedValue::Null; fields.len()]; 246 | 247 | Self { 248 | fields, 249 | scanner, 250 | row_buffer, 251 | } 252 | } 253 | 254 | fn next_row(&mut self) -> anyhow::Result> { 255 | let Some(record) = self.scanner.next_record()? else { 256 | return Ok(None); 257 | }; 258 | 259 | for (i, &n) in self.fields.iter().enumerate() { 260 | self.row_buffer[i] = record.owned_field(n).context("missing record field")?; 261 | } 262 | 263 | Ok(Some(&self.row_buffer)) 264 | } 265 | } 266 | ``` 267 | 268 | The `SeqScan` struct is initialized with a list of field indices to read from 269 | each record and a `Scanner` that will yield the records for every row in the 270 | table to be scanned. 
As the number of fields to read is identical for every row, 271 | we can preallocate a buffer to store the values of the selected fields. 272 | The next_row method retrieves the next record from the scanner, extracts 273 | the requested fields (specified by their indices), and stores them in our buffer. 274 | 275 | Now that we have an `Operator` to evaluate `SELECT` statements, let's move on 276 | to the `Planner` struct that will produce the `Operator` from a parsed SQL query: 277 | 278 | ```rust 279 | // src/engine/plan.rs 280 | 281 | use anyhow::{bail, Context, Ok}; 282 | 283 | use crate::{ 284 | db::Db, 285 | sql::ast::{self, SelectFrom}, 286 | }; 287 | 288 | use super::operator::{Operator, SeqScan}; 289 | 290 | pub struct Planner<'d> { 291 | db: &'d Db, 292 | } 293 | 294 | impl<'d> Planner<'d> { 295 | pub fn new(db: &'d Db) -> Self { 296 | Self { db } 297 | } 298 | 299 | pub fn compile(self, statement: &ast::Statement) -> anyhow::Result { 300 | match statement { 301 | ast::Statement::Select(s) => self.compile_select(s), 302 | stmt => bail!("unsupported statement: {stmt:?}"), 303 | } 304 | } 305 | } 306 | ``` 307 | 308 | The `Planner` struct is initialized with a reference to the database and 309 | provides a `compile` method that takes a parsed SQL statement and returns 310 | the corresponding `Operator`. 311 | The `compile` method dispatches to a specific method for each type of SQL statement. 312 | 313 | Let's see how to build an `Operator` for a `SELECT` statement: 314 | 315 | ```rust 316 | 317 | // src/engine/plan.rs 318 | 319 | impl<'d> Planner<'d> { 320 | // [...] 
Finally, we create a `SeqScan` operator that will scan the entire table and yield
the selected fields for each row.
380 | 381 | fn eval_query(db: &db::Db, query: &str) -> anyhow::Result<()> { 382 | let parsed_query = sql::parse_statement(query, false)?; 383 | let mut op = engine::plan::Planner::new(db).compile(&parsed_query)?; 384 | 385 | while let Some(values) = op.next_row()? { 386 | let formated = values 387 | .iter() 388 | .map(ToString::to_string) 389 | .collect::>() 390 | .join("|"); 391 | 392 | println!("{formated}"); 393 | } 394 | 395 | Ok(()) 396 | } 397 | ``` 398 | 399 | This function creates a pipeline: it parses the SQL query, builds an 400 | `Operator` with our Planner, and then repeatedly calls next_row() on the resulting operator 401 | to retrieve and display each row of the result. 402 | 403 | The final step is to use this function in the REPL loop: 404 | 405 | ```diff 406 | // src/main.rs 407 | 408 | // [...] 409 | 410 | fn cli(mut db: db::Db) -> anyhow::Result<()> { 411 | print_flushed("rqlite> ")?; 412 | 413 | let mut line_buffer = String::new(); 414 | 415 | while stdin().lock().read_line(&mut line_buffer).is_ok() { 416 | match line_buffer.trim() { 417 | ".exit" => break, 418 | ".tables" => display_tables(&mut db)?, 419 | + stmt => eval_query(&db, stmt)?, 420 | - stmt => match sql::parse_statement(stmt, true) { 421 | - Ok(stmt) => { 422 | - println!("{:?}", stmt); 423 | - } 424 | - Err(e) => { 425 | - println!("Error: {}", e); 426 | - } 427 | - }, 428 | } 429 | 430 | print_flushed("\nrqlite> ")?; 431 | 432 | line_buffer.clear(); 433 | } 434 | 435 | Ok(()) 436 | } 437 | ``` 438 | 439 | Now we can run the REPL and evaluate some simple `SELECT` statements: 440 | 441 | ```bash 442 | cargo run -- queries_test.db 443 | rqlite> select * from table1; 444 | ``` 445 | 446 | If everything went well, you should see the following output: 447 | 448 | ```bash 449 | 1|11 450 | 2|12 451 | 3|13 452 | ``` 453 | 454 | ## Conclusion 455 | 456 | Our small database engine is starting to take shape! We can now parse and evaluate 457 | simple `SELECT` queries. 
But there's still a lot to cover before we can call it 458 | a fully functional database engine. 459 | In the next posts, we'll discover how to filter rows, read indexes, and implement 460 | sorting and grouping. 461 | -------------------------------------------------------------------------------- /blog/part2.md: -------------------------------------------------------------------------------- 1 | ### Build your own SQLite, Part 2: Scanning large tables 2 | 3 | In the previous post, we discovered the SQLite file format and implemented a toy version 4 | of the `.tables` command, allowing us to display the list of tables in a database. 5 | But our implementation has a jarring limitation: it assumes that all the data fits into the first 6 | page of the file. In this post, we'll discover how SQLite represents tables that are too large to fit 7 | into a single page, this will make our `.tables` command more useful, but also lay the groundwork for 8 | our query engine. 9 | 10 | ## Erratum 11 | 12 | If you're one of the early readers of the first post, and you coded along, a small mistake might 13 | have slipped into your code: in the `load_page` method of the `Pager` struct, there is 14 | no need to add `HEADER_SIZE` to the `offset` parameter. Here is the beginning of the corrected version: 15 | 16 | ```diff 17 | // src/pager.rs 18 | 19 | fn load_page(&mut self, n: usize) -> anyhow::Result { 20 | - let offset = HEADER_SIZE + n.saturating_sub(1) * self.page_size; 21 | + let offset = n.saturating_sub(1) * self.page_size; 22 | // the rest of the method stays the same 23 | // [...] 
- leaf pages, that contain the actual records
- interior pages, that store information about which page contains the records for which table.

Interior pages have the same high-level structure as leaf pages, with two key differences:

- instead of storing records, they store a tuple `(child_page_number, key)` where `child_page_number` is
a 32-bit unsigned integer representing the page number of the "root" page of a subtree that contains
records with keys lower or equal to `key`.
Cells in interior pages are logically ordered by `key` in ascending order.
We'll also add a `byte_size`
method that returns the size of the header, depending on whether the `rightmost_pointer` field is set or not,
and add a new variant to our `PageType` enum to represent interior pages.
124 | + } else { 125 | + None 126 | + }; 127 | 128 | Ok(page::PageHeader { 129 | page_type, 130 | first_freeblock, 131 | cell_count, 132 | cell_content_offset, 133 | fragmented_bytes_count, 134 | + rightmost_pointer, 135 | }) 136 | } 137 | ``` 138 | 139 | We decide whether to parse the `rightmost_pointer` field depending on the value of the `page_type` 140 | byte (`0x0d` for leaf pages, `0x05` for interior pages). 141 | 142 | Next, we'll update the `Page` struct to reflect the fact that both leaf and interior pages 143 | share the same structure, with the only difference being the content of the cells: 144 | 145 | ```diff 146 | // src/page.rs 147 | 148 | #[derive(Debug, Clone)] 149 | - pub struct TableLeafPage { 150 | + pub struct Page { 151 | pub header: PageHeader, 152 | pub cell_pointers: Vec, 153 | - pub cells: Vec, 154 | + pub cells: Vec, 155 | } 156 | 157 | - #[derive(Debug, Clone)] 158 | - pub enum Page { 159 | - TableLeaf(TableLeafPage), 160 | - } 161 | 162 | + #[derive(Debug, Clone)] 163 | + pub enum Cell { 164 | + TableLeaf(TableLeafCell), 165 | + TableInterior(TableInteriorCell), 166 | + } 167 | 168 | + impl From for Cell { 169 | + fn from(cell: TableLeafCell) -> Self { 170 | + Cell::TableLeaf(cell) 171 | + } 172 | + } 173 | 174 | + impl From for Cell { 175 | + fn from(cell: TableInteriorCell) -> Self { 176 | + Cell::TableInterior(cell) 177 | + } 178 | + } 179 | 180 | + pub struct TableInteriorCell { 181 | + pub left_child_page: u32, 182 | + pub key: i64, 183 | + } 184 | ``` 185 | 186 | This change calls for a major update of our parsing functions, reproduced below: 187 | 188 | ```rust 189 | // src/pager.rs 190 | 191 | fn parse_page(buffer: &[u8], page_num: usize) -> anyhow::Result { 192 | let ptr_offset = if page_num == 1 { HEADER_SIZE as u16 } else { 0 }; 193 | let content_buffer = &buffer[ptr_offset as usize..]; 194 | let header = parse_page_header(content_buffer)?; 195 | let cell_pointers = parse_cell_pointers( 196 | 
&content_buffer[header.byte_size()..], 197 | header.cell_count as usize, 198 | ptr_offset, 199 | ); 200 | 201 | let cells_parsing_fn = match header.page_type { 202 | page::PageType::TableLeaf => parse_table_leaf_cell, 203 | page::PageType::TableInterior => parse_table_interior_cell, 204 | }; 205 | 206 | let cells = parse_cells(content_buffer, &cell_pointers, cells_parsing_fn)?; 207 | 208 | Ok(page::Page { 209 | header, 210 | cell_pointers, 211 | cells, 212 | }) 213 | } 214 | 215 | fn parse_cells( 216 | buffer: &[u8], 217 | cell_pointers: &[u16], 218 | parse_fn: impl Fn(&[u8]) -> anyhow::Result, 219 | ) -> anyhow::Result> { 220 | cell_pointers 221 | .iter() 222 | .map(|&ptr| parse_fn(&buffer[ptr as usize..])) 223 | .collect() 224 | } 225 | 226 | fn parse_table_leaf_cell(mut buffer: &[u8]) -> anyhow::Result { 227 | let (n, size) = read_varint_at(buffer, 0); 228 | buffer = &buffer[n as usize..]; 229 | 230 | let (n, row_id) = read_varint_at(buffer, 0); 231 | buffer = &buffer[n as usize..]; 232 | 233 | let payload = buffer[..size as usize].to_vec(); 234 | 235 | Ok(page::TableLeafCell { 236 | size, 237 | row_id, 238 | payload, 239 | } 240 | .into()) 241 | } 242 | 243 | fn parse_table_interior_cell(mut buffer: &[u8]) -> anyhow::Result { 244 | let left_child_page = read_be_double_at(buffer, 0); 245 | buffer = &buffer[4..]; 246 | 247 | let (_, key) = read_varint_at(buffer, 0); 248 | 249 | Ok(page::TableInteriorCell { 250 | left_child_page, 251 | key, 252 | } 253 | .into()) 254 | } 255 | ``` 256 | 257 | ## Scanning logic 258 | 259 | Our scanning logic will need to be updated to handle interior pages. We can no longer 260 | simply iterate over the cells of a page and call it a day. Instead, we'll need to 261 | implement a depth-first algorithm that recursively explores the tree, starting from 262 | the root page. 
The `next_cell` method returns the content of the current cell and increments the cell index,
so calling it repeatedly will yield the content of all the cells in the page.

The `next_page` method is a bit more complex: it returns the `rightmost_pointer` of the current
page if it's an interior page and we just visited the last cell, otherwise
it returns `None`.

We'll also update our `Cursor` so that it owns its payload instead of borrowing it through a `Pager`:

```diff
326 | At every step of the walk, there are a few cases to consider: 327 | 328 | - if the current page is a leaf page and we haven't visited all the cells yet, 329 | we'll just have to build a `Cursor` with the current cell's payload and return it. 330 | - if the current page is an interior page, we'll push the next page (either from the 331 | current cell or the rightmost pointer) to the stack and continue the walk. 332 | - if we've visited all the cells of the current page, we'll pop the stack and continue 333 | the walk from the parent page. 334 | 335 | This logic is implemented in the new `Scanner` struct: 336 | 337 | ```rust 338 | // src/pager.rs 339 | 340 | #[derive(Debug)] 341 | pub struct Scanner<'p> { 342 | pager: &'p mut Pager, 343 | initial_page: usize, 344 | page_stack: Vec, 345 | } 346 | 347 | impl<'p> Scanner<'p> { 348 | pub fn new(pager: &'p mut Pager, page: usize) -> Scanner<'p> { 349 | Scanner { 350 | pager, 351 | initial_page: page, 352 | page_stack: Vec::new(), 353 | } 354 | } 355 | 356 | pub fn next_record(&mut self) -> anyhow::Result> { 357 | loop { 358 | match self.next_elem() { 359 | Ok(Some(ScannerElem::Cursor(cursor))) => return Ok(Some(cursor)), 360 | Ok(Some(ScannerElem::Page(page_num))) => { 361 | let new_page = self.pager.read_page(page_num as usize)?.clone(); 362 | self.page_stack.push(PositionedPage { 363 | page: new_page, 364 | cell: 0, 365 | }); 366 | } 367 | Ok(None) if self.page_stack.len() > 1 => { 368 | self.page_stack.pop(); 369 | } 370 | Ok(None) => return Ok(None), 371 | Err(e) => return Err(e), 372 | } 373 | } 374 | } 375 | 376 | fn next_elem(&mut self) -> anyhow::Result> { 377 | let Some(page) = self.current_page()? 
else { 378 | return Ok(None); 379 | }; 380 | 381 | if let Some(page) = page.next_page() { 382 | return Ok(Some(ScannerElem::Page(page))); 383 | } 384 | 385 | let Some(cell) = page.next_cell() else { 386 | return Ok(None); 387 | }; 388 | 389 | match cell { 390 | Cell::TableLeaf(cell) => { 391 | let header = parse_record_header(&cell.payload)?; 392 | Ok(Some(ScannerElem::Cursor(Cursor { 393 | header, 394 | payload: cell.payload.clone(), 395 | }))) 396 | } 397 | Cell::TableInterior(cell) => Ok(Some(ScannerElem::Page(cell.left_child_page))), 398 | } 399 | } 400 | 401 | fn current_page(&mut self) -> anyhow::Result> { 402 | if self.page_stack.is_empty() { 403 | let page = match self.pager.read_page(self.initial_page) { 404 | Ok(page) => page.clone(), 405 | Err(e) => return Err(e), 406 | }; 407 | 408 | self.page_stack.push(PositionedPage { page, cell: 0 }); 409 | } 410 | 411 | Ok(self.page_stack.last_mut()) 412 | } 413 | } 414 | 415 | #[derive(Debug)] 416 | enum ScannerElem { 417 | Page(u32), 418 | Cursor(Cursor), 419 | } 420 | ``` 421 | 422 | ## Putting it all together 423 | 424 | The only change that remains to be made is to update the `display_tables` function 425 | to account for the change in `next_record` signature: 426 | 427 | ```diff 428 | // src/main.rs 429 | 430 | fn display_tables(db: &mut db::Db) -> anyhow::Result<()> { 431 | let mut scanner = db.scanner(1); 432 | 433 | - while let Some(Ok(mut record)) = scanner.next_record() { 434 | + while let Some(mut record) = scanner.next_record()? { 435 | let type_value = record 436 | .field(0) 437 | .context("missing type field") 438 | .context("invalid type field")?; 439 | 440 | if type_value.as_str() == Some("table") { 441 | let name_value = record 442 | .field(1) 443 | .context("missing name field") 444 | .context("invalid name field")?; 445 | 446 | print!("{} ", name_value.as_str().unwrap()); 447 | } 448 | } 449 | 450 | Ok(()) 451 | } 452 | ``` 453 | 454 | We can now display our (long!) 
As the SQL dialect supported by SQLite is quite large and complex, we'll initially limit ourselves to
a subset that comprises only the `select` statement, in a stripped-down form. Only expressions
of the form `select <columns> from <table>` will be supported, where `<columns>` is either `*` or a
comma-separated list of column names (with an optional `as` alias),
Our SQL parser will follow a conventional two-step process: lexical analysis (or tokenization)
and syntax analysis (or parsing).
42 | 43 | ```rust 44 | // sql/tokenizer.rs 45 | #[derive(Debug, Eq, PartialEq)] 46 | pub enum Token { 47 | Select, 48 | As, 49 | From, 50 | Star, 51 | Comma, 52 | SemiColon, 53 | Identifier(String), 54 | } 55 | 56 | impl Token { 57 | pub fn as_identifier(&self) -> Option<&str> { 58 | match self { 59 | Token::Identifier(ident) => Some(ident), 60 | _ => None, 61 | } 62 | } 63 | } 64 | ``` 65 | 66 | We also define a utility function `as_identifier` that will return the string value of 67 | a token if it is an `Identifier`, and `None` otherwise. 68 | 69 | The logic of the tokenize function is quite simple: we iterate over the input string's 70 | characters, and based on the current character we decide which token to emit: 71 | 72 | - if the character matches a single-character token, we emit it immediately 73 | - if the character is a whitespace, it is discarded 74 | - finally, if the character is a letter, we start a new identifier token and keep accumulating 75 | characters until we reach a character that is not a valid identifier character. At this point, 76 | if the accumulated string is a keyword, we emit the corresponding token, otherwise, we emit 77 | a raw `Identifier` token. 
78 | 79 | ```rust 80 | // sql/tokenizer.rs 81 | use anyhow::bail; 82 | 83 | pub fn tokenize(input: &str) -> anyhow::Result> { 84 | let mut tokens = Vec::new(); 85 | let mut chars = input.chars().peekable(); 86 | 87 | while let Some(c) = chars.next() { 88 | match c { 89 | '*' => tokens.push(Token::Star), 90 | ',' => tokens.push(Token::Comma), 91 | ';' => tokens.push(Token::SemiColon), 92 | c if c.is_whitespace() => continue, 93 | c if c.is_alphabetic() => { 94 | let mut ident = c.to_string().to_lowercase(); 95 | while let Some(cc) = chars.next_if(|&cc| cc.is_alphanumeric() || cc == '_') { 96 | ident.extend(cc.to_lowercase()); 97 | } 98 | 99 | match ident.as_str() { 100 | "select" => tokens.push(Token::Select), 101 | "as" => tokens.push(Token::As), 102 | "from" => tokens.push(Token::From), 103 | _ => tokens.push(Token::Identifier(ident)), 104 | } 105 | } 106 | _ => return Err(anyhow::anyhow!("unexpected character: {}", c)), 107 | } 108 | } 109 | 110 | Ok(tokens) 111 | } 112 | ``` 113 | 114 | Since SQL is case-insensitive, all identifiers are normalized to lower case. 115 | 116 | ## Representing SQL statements 117 | 118 | Before we dive into the implementation of the parser, we need to decide how to 119 | represent SQL statements in our code. We'll settle on a conventional representation, 120 | based on the description of the SQL syntax in the SQLite documentation, and write 121 | the corresponding Rust types in a new module `sql::ast`. 
122 | 123 | ```rust 124 | // sql/ast.rs 125 | 126 | #[derive(Debug, Clone, Eq, PartialEq)] 127 | pub enum Statement { 128 | Select(SelectStatement), 129 | } 130 | 131 | #[derive(Debug, Clone, Eq, PartialEq)] 132 | pub struct SelectStatement { 133 | pub core: SelectCore, 134 | } 135 | 136 | #[derive(Debug, Clone, Eq, PartialEq)] 137 | pub struct SelectCore { 138 | pub result_columns: Vec, 139 | pub from: SelectFrom, 140 | } 141 | 142 | #[derive(Debug, Clone, Eq, PartialEq)] 143 | pub enum ResultColumn { 144 | Star, 145 | Expr(ExprResultColumn), 146 | } 147 | 148 | #[derive(Debug, Clone, Eq, PartialEq)] 149 | pub struct ExprResultColumn { 150 | pub expr: Expr, 151 | pub alias: Option, 152 | } 153 | 154 | #[derive(Debug, Clone, Eq, PartialEq)] 155 | pub enum Expr { 156 | Column(Column), 157 | } 158 | 159 | #[derive(Debug, Clone, Eq, PartialEq)] 160 | pub struct Column { 161 | pub name: String, 162 | } 163 | 164 | #[derive(Debug, Clone, Eq, PartialEq)] 165 | pub enum SelectFrom { 166 | Table(String), 167 | } 168 | ``` 169 | 170 | The following query: 171 | 172 | ```sql 173 | select col1 as first, col2 174 | from table 175 | ``` 176 | 177 | Will be parsed into the following rust structure: 178 | 179 | ```rust 180 | Statement::Select(SelectStatement { 181 | core: SelectCore { 182 | result_columns: vec![ 183 | ResultColumn::Expr(ExprResultColumn { 184 | expr: Expr::Column(Column { 185 | name: "col1".to_string() 186 | }), 187 | alias: Some("first".to_string()) 188 | }), 189 | ResultColumn::Expr(ExprResultColumn { 190 | expr: Expr::Column(Column { 191 | name: "col2".to_string() 192 | }), 193 | alias: None 194 | }), 195 | ], 196 | from: SelectFrom::Table("table".to_string()), 197 | }, 198 | }) 199 | ``` 200 | 201 | You may notice a few redundancies in this representation, such as the `Expr` enum 202 | that comprises a single variant. 
Parsing algorithms come in all shapes and sizes, and a full discussion of the topic
is beyond the scope of this article. The one we'll use here is called recursive descent
and is reasonably simple to understand and implement:

- for every node type, we'll define a function that tries to build the node from the current input
tokens, and fails if it is not possible. For example, we'll define a method that builds a `Column` node
by consuming an `Identifier` token, and fails if the current token is not an `Identifier` token.
- complex "nested" nodes are built by delegating the parsing of their child nodes to other functions.
For example, `ExprResultColumn` is built by parsing an `Expr` node and an optional `as` token followed
by an `Identifier` token.
- `next_token_is` checks if the current token is equal to the expected token
the next token without consuming it, 292 | and fails if there are no more tokens 293 | - `next_token` returns the current token and advances the parser's position 294 | - `advance` increments the parser's position 295 | 296 | Armed with these primitives, we can write our simplest parser function: `parse_expr`! 297 | As the only expressions that we support for now are identifiers, the parsing function 298 | only has to check that the current token is an `Identifier` token and build a `Expr` node 299 | from its value. 300 | 301 | ```rust 302 | // sql/parser.rs 303 | 304 | impl ParserState { 305 | //... 306 | fn parse_expr(&mut self) -> anyhow::Result { 307 | Ok(Expr::Column(Column { 308 | name: self.expect_identifier()?.to_string(), 309 | })) 310 | } 311 | //... 312 | } 313 | ``` 314 | 315 | A bit more involved, the `parse_expr_result_column` function parses terms of 316 | the form `columnName` or `columnName as alias`. It starts by parsing the 317 | initial `Expr` node (`columnName`, in our examples), then if the next 318 | token is `as`, it consumes it and parses the `Identifier` token that follows. 319 | 320 | ```rust 321 | // sql/parser.rs 322 | 323 | impl ParserState { 324 | //... 325 | fn parse_expr_result_column(&mut self) -> anyhow::Result { 326 | let expr = self.parse_expr()?; 327 | let alias = if self.next_token_is(Token::As) { 328 | self.advance(); 329 | Some(self.expect_identifier()?.to_string()) 330 | } else { 331 | None 332 | }; 333 | Ok(ExprResultColumn { expr, alias }) 334 | } 335 | //... 336 | } 337 | ``` 338 | 339 | `ResultColumn` can represent terms of the form described above, or `*` to represent 340 | all columns of a table. The `parse_result_column` function checks if the current token 341 | is `*`, and returns a `Star` node if it is. Otherwise, it delegates the parsing of the 342 | `ExprResultColumn` node to the `parse_expr_result_column` function. 343 | 344 | ```rust 345 | // sql/parser.rs 346 | 347 | impl ParserState { 348 | //... 
349 | fn parse_result_column(&mut self) -> anyhow::Result { 350 | if self.peak_next_token()? == &Token::Star { 351 | self.advance(); 352 | return Ok(ResultColumn::Star); 353 | } 354 | 355 | Ok(ResultColumn::Expr(self.parse_expr_result_column()?)) 356 | } 357 | //... 358 | } 359 | ``` 360 | 361 | Another interesting example is the `parse_result_colums` function, which parses 362 | a list of columns separated by commas. It starts by parsing the first column, 363 | then iterates over the following tokens as long as the token following 364 | a result column is a comma, accumulating the parsed columns in a vector. 365 | 366 | ```rust 367 | // sql/parser.rs 368 | 369 | impl ParserState { 370 | //... 371 | fn parse_result_columns(&mut self) -> anyhow::Result> { 372 | let mut result_coluns = vec![self.parse_result_column()?]; 373 | while self.next_token_is(Token::Comma) { 374 | self.advance(); 375 | result_coluns.push(self.parse_result_column()?); 376 | } 377 | Ok(result_coluns) 378 | } 379 | //... 380 | } 381 | ``` 382 | 383 | As you are probably getting the hang of it, implementing the remaining parsing 384 | functions can be a fun exercise. In any case, here is my implementation 385 | for reference: 386 | 387 | ```rust 388 | // sql/parser.rs 389 | 390 | impl ParserState { 391 | //... 392 | fn parse_statement(&mut self) -> anyhow::Result { 393 | Ok(Statement::Select(self.parse_select()?)) 394 | } 395 | 396 | fn parse_select(&mut self) -> anyhow::Result { 397 | self.expect_eq(Token::Select)?; 398 | let result_columns = self.parse_result_columns()?; 399 | self.expect_eq(Token::From)?; 400 | let from = self.parse_select_from()?; 401 | Ok(SelectStatement { 402 | core: SelectCore { 403 | result_columns, 404 | from, 405 | }, 406 | }) 407 | } 408 | 409 | fn parse_select_from(&mut self) -> anyhow::Result { 410 | let table = self.expect_identifier()?; 411 | Ok(SelectFrom::Table(table.to_string())) 412 | } 413 | //... 
414 | } 415 | ``` 416 | 417 | The final piece of the puzzle is a function that ties everything together, 418 | taking an input SQL string, tokenizing it, and parsing it into an AST: 419 | 420 | ```rust 421 | // sql/parser.rs 422 | 423 | //... 424 | 425 | pub fn parse_statement(input: &str) -> anyhow::Result { 426 | let tokens = tokenizer::tokenize(input)?; 427 | let mut state = ParserState::new(tokens); 428 | let statement = state.parse_statement()?; 429 | state.expect_eq(Token::SemiColon)?; 430 | Ok(statement) 431 | } 432 | ``` 433 | 434 | ## Putting it all together 435 | 436 | We've covered a lot of ground! Now is the time to test our parser on 437 | some actual SQL queries. To that end, let's alter our REPL loop 438 | to parse the input as an SQL statement if it does not match a known command, and 439 | print it. 440 | 441 | ```diff 442 | // src/main.rs 443 | 444 | + mod sql; 445 | 446 | //... 447 | 448 | fn cli(mut db: db::Db) -> anyhow::Result<()> { 449 | print_flushed("rqlite> ")?; 450 | 451 | let mut line_buffer = String::new(); 452 | 453 | while stdin().lock().read_line(&mut line_buffer).is_ok() { 454 | match line_buffer.trim() { 455 | ".exit" => break, 456 | ".tables" => display_tables(&mut db)?, 457 | + stmt => match sql::parse_statement(stmt) { 458 | + Ok(stmt) => { 459 | + println!("{:?}", stmt); 460 | + } 461 | + Err(e) => { 462 | + println!("Error: {}", e); 463 | + } 464 | + }, 465 | - _ => { 466 | - println!("Unrecognized command '{}'", line_buffer.trim()); 467 | - } 468 | } 469 | 470 | print_flushed("\nrqlite> ")?; 471 | 472 | line_buffer.clear(); 473 | } 474 | 475 | Ok(()) 476 | } 477 | 478 | //... 479 | ``` 480 | 481 | ## Conclusion 482 | 483 | Our database can read data and parse very simple SQL statements. 484 | In the next part of this series, we'll bridge the gap between these two functionalities 485 | and build a small query engine that compiles SQL queries into execution plans and 486 | executes these plans against the persisted data. 
487 | -------------------------------------------------------------------------------- /blog/part1.md: -------------------------------------------------------------------------------- 1 | ### Build your own SQLite, Part 1: Listing tables 2 | 3 | As developers, we use databases all the time. But how do they work? 4 | In this series, we'll try to answer that question by building our own 5 | SQLite-compatible database from scratch. 6 | 7 | Source code examples will be provided in Rust, but you are encouraged to 8 | follow along using your language of choice, as we won't be relying 9 | on many language-specific features or libraries. 10 | 11 | As an introduction, we'll implement the simplest version of the `tables` command, 12 | which lists the names of all the tables in a database. While this looks simple, we'll 13 | see that it requires us to make our first deep dive into the SQLite file format. 14 | 15 | ## Building the test database 16 | 17 | To keep things as simple as possible, let's build a minimalistic 18 | test database: 19 | 20 | ```bash 21 | sqlite3 minimal_test.db 22 | sqlite> create table table1(id integer); 23 | sqlite> create table table2(id integer); 24 | sqlite> .exit 25 | ``` 26 | 27 | This creates a database with two tables, `table1` and `table2`, each with a single 28 | column, `id`. We can verify this by running the `tables` command in the SQLite shell: 29 | 30 | ```bash 31 | sqlite3 minimal_test.db 32 | sqlite> .tables 33 | table1 table2 34 | sqlite> .exit 35 | ``` 36 | 37 | ## Bootstrapping the project 38 | 39 | Let's start by creating a new Rust project. 
We'll use the `cargo add` to add our only dependency 40 | for now, `anyhow`: 41 | 42 | ```bash 43 | cargo new rsqlite 44 | cd rsqlite 45 | cargo add anyhow 46 | ``` 47 | 48 | ## The SQLite file format 49 | 50 | ![](https://cdn.hashnode.com/res/hashnode/image/upload/v1721572171598/5c4195b6-5472-4ba1-826d-d8f5b6660527.png align=" 51 | center") 52 | 53 | SQLite databases are stored in a single file, the format of which is 54 | documented in the [SQLite File Format Specification](https://www.sqlite.org/fileformat.html). 55 | The file is divided into pages, with each page having the same size: a power of 2, between 56 | 512 and 65536 bytes. 57 | The first 100 bytes of the first page contain the database header, which includes 58 | information such as the page size and the file format version. In this first part, we'll only 59 | be interested in the page size. 60 | Pages can be of different types, but for this first article, we'll only be interested in 61 | `table btree leaf` pages, which store the actual table data. 62 | 63 | Our first task will be to implement a `Pager` struct that reads and caches pages from the 64 | database file. But before we do, we'll have to read the page size from the database header. 65 | Let's start by defining our `Header` struct: 66 | 67 | ```rust 68 | // src/page.rs 69 | #[derive(Debug, Copy, Clone)] 70 | pub struct DbHeader { 71 | pub page_size: u32, 72 | } 73 | ``` 74 | 75 | The header starts with the magic string `SQLite format 3\0`, followed by the page size 76 | encoded as a big-endian 2-byte integer at offset 16. 
With this information, we can 77 | implement a function that reads the header from a buffer: 78 | 79 | ```rust 80 | // src/pager.rs 81 | pub const HEADER_SIZE: usize = 100; 82 | const HEADER_PREFIX: &[u8] = b"SQLite format 3\0"; 83 | const HEADER_PAGE_SIZE_OFFSET: usize = 16; 84 | 85 | const PAGE_MAX_SIZE: u32 = 65536; 86 | 87 | pub fn parse_header(buffer: &[u8]) -> anyhow::Result { 88 | if !buffer.starts_with(HEADER_PREFIX) { 89 | let prefix = String::from_utf8_lossy(&buffer[..HEADER_PREFIX.len()]); 90 | anyhow::bail!("invalid header prefix: {prefix}"); 91 | } 92 | 93 | let page_size_raw = read_be_word_at(buffer, HEADER_PAGE_SIZE_OFFSET); 94 | let page_size = match page_size_raw { 95 | 1 => PAGE_MAX_SIZE, 96 | n if ((n & (n - 1)) == 0) && n != 0 => n as u32, 97 | _ => anyhow::bail!("page size is not a power of 2: {}", page_size_raw), 98 | }; 99 | 100 | Ok(page::Header { page_size }) 101 | } 102 | 103 | fn read_be_word_at(input: &[u8], offset: usize) -> u16 { 104 | u16::from_be_bytes(input[offset..offset + 2].try_into().unwrap()) 105 | } 106 | ``` 107 | 108 | Two things to note here: 109 | 110 | - As the maximum page size cannot be represented as a 2-byte integer, a page size of 1 is used to represent the maximum 111 | page size. 112 | - We use a somewhat convoluted expression to check if the page size is a power of 2. 113 | The expression `n & (n - 1) == 0` is true if and only if `n` is a power of 2, except for `n = 0`. 114 | 115 | ## Decoding Table B-tree leaf pages 116 | 117 | ![](https://cdn.hashnode.com/res/hashnode/image/upload/v1721571943115/f84ad91d-d3a3-462e-8f2b-1b1975badb1a.png align=" 118 | center") 119 | 120 | Now that we have the minimum information we need to read pages from the disk, let's explore 121 | the content of a `table btree-leaf` page. 122 | `table btree-leaf` pages start with an 8-byte header, followed by a sequence of "cell pointers" 123 | containing the offset of every cell in the page. 
The cells contain the table data, and we 124 | can think of them as key-value pairs, where the key is a 64-bits integer encoded as 125 | a [varint](https://carlmastrangelo.com/blog/lets-make-a-varint) 126 | (the `rowid`) and the value is an arbitrary sequence of bytes representing the row data. 127 | The header contains the following fields: 128 | 129 | - `page_type`: byte representing the page type. For `table btree-leaf` pages, this is 0x0D. 130 | - `first_freeblock`: 2-byte integer representing the offset of the first free block in the page, or zero if there is no 131 | freeblock. 132 | - `cell_count`: 2-byte integer representing the number of cells in the page. 133 | - `cell_content_offset`: 2-byte integer representing the offset of the first cell. 134 | - `fragmented_bytes_count`: 1-byte integer representing the number of fragmented free bytes in the page (we won't make 135 | use of it for now). 136 | 137 | We'll start by defining a `Page` enum representing a parsed page, along with 138 | the necessary structs to represent the page header and the cell pointers: 139 | 140 | ```rust 141 | #[derive(Debug, Clone)] 142 | pub enum Page { 143 | TableLeaf(TableLeafPage), 144 | } 145 | 146 | #[derive(Debug, Clone)] 147 | pub struct TableLeafPage { 148 | pub header: PageHeader, 149 | pub cell_pointers: Vec, 150 | pub cells: Vec, 151 | } 152 | 153 | #[derive(Debug, Copy, Clone)] 154 | pub struct PageHeader { 155 | pub page_type: PageType, 156 | pub first_freeblock: u16, 157 | pub cell_count: u16, 158 | pub cell_content_offset: u32, 159 | pub fragmented_bytes_count: u8, 160 | } 161 | 162 | #[derive(Debug, Copy, Clone)] 163 | pub enum PageType { 164 | TableLeaf, 165 | } 166 | 167 | #[derive(Debug, Clone)] 168 | pub struct TableLeafCell { 169 | pub size: i64, 170 | pub row_id: i64, 171 | pub payload: Vec, 172 | } 173 | ``` 174 | 175 | The corresponding parsing functions are quite straightforward. 
Note the offset handling 176 | in `parse_page`: since the first page contains the database header, we start parsing 177 | the page at offset 100. 178 | 179 | ```rust 180 | /// pager.rs 181 | const PAGE_LEAF_HEADER_SIZE: usize = 8; 182 | const PAGE_FIRST_FREEBLOCK_OFFSET: usize = 1; 183 | const PAGE_CELL_COUNT_OFFSET: usize = 3; 184 | const PAGE_CELL_CONTENT_OFFSET: usize = 5; 185 | const PAGE_FRAGMENTED_BYTES_COUNT_OFFSET: usize = 7; 186 | 187 | fn parse_page(buffer: &[u8], page_num: usize) -> anyhow::Result { 188 | let ptr_offset = if page_num == 1 { HEADER_SIZE as u16 } else { 0 }; 189 | 190 | match buffer[0] { 191 | PAGE_LEAF_TABLE_ID => parse_table_leaf_page(buffer, ptr_offset), 192 | _ => Err(anyhow::anyhow!("unknown page type: {}", buffer[0])), 193 | } 194 | } 195 | 196 | fn parse_table_leaf_page(buffer: &[u8], ptr_offset: u16) -> anyhow::Result { 197 | let header = parse_page_header(buffer)?; 198 | 199 | let content_buffer = &buffer[PAGE_LEAF_HEADER_SIZE..]; 200 | let cell_pointers = parse_cell_pointers(content_buffer, header.cell_count as usize, ptr_offset); 201 | 202 | let cells = cell_pointers 203 | .iter() 204 | .map(|&ptr| parse_table_leaf_cell(&buffer[ptr as usize..])) 205 | .collect::>>()?; 206 | 207 | Ok(page::Page::TableLeaf(page::TableLeafPage { 208 | header, 209 | cell_pointers, 210 | cells, 211 | })) 212 | } 213 | 214 | 215 | fn parse_page_header(buffer: &[u8]) -> anyhow::Result { 216 | let page_type = match buffer[0] { 217 | 0x0d => page::PageType::TableLeaf, 218 | _ => anyhow::bail!("unknown page type: {}", buffer[0]), 219 | }; 220 | 221 | let first_freeblock = read_be_word_at(buffer, PAGE_FIRST_FREEBLOCK_OFFSET); 222 | let cell_count = read_be_word_at(buffer, PAGE_CELL_COUNT_OFFSET); 223 | let cell_content_offset = match read_be_word_at(buffer, PAGE_CELL_CONTENT_OFFSET) { 224 | 0 => 65536, 225 | n => n as u32, 226 | }; 227 | let fragmented_bytes_count = buffer[PAGE_FRAGMENTED_BYTES_COUNT_OFFSET]; 228 | 229 | Ok(page::PageHeader { 230 | 
page_type, 231 | first_freeblock, 232 | cell_count, 233 | cell_content_offset, 234 | fragmented_bytes_count, 235 | }) 236 | } 237 | 238 | 239 | fn parse_cell_pointers(buffer: &[u8], n: usize, ptr_offset: u16) -> Vec { 240 | let mut pointers = Vec::with_capacity(n); 241 | for i in 0..n { 242 | pointers.push(read_be_word_at(buffer, 2 * i) - ptr_offset); 243 | } 244 | pointers 245 | } 246 | 247 | fn parse_table_leaf_cell(mut buffer: &[u8]) -> anyhow::Result { 248 | let (n, size) = read_varint_at(buffer, 0); 249 | buffer = &buffer[n as usize..]; 250 | 251 | let (n, row_id) = read_varint_at(buffer, 0); 252 | buffer = &buffer[n as usize..]; 253 | 254 | let payload = buffer[..size as usize].to_vec(); 255 | 256 | Ok(page::TableLeafCell { 257 | size, 258 | row_id, 259 | payload, 260 | }) 261 | } 262 | 263 | pub fn read_varint_at(buffer: &[u8], mut offset: usize) -> (u8, i64) { 264 | let mut size = 0; 265 | let mut result = 0; 266 | 267 | while size < 9 { 268 | let current_byte = buffer[offset] as i64; 269 | if size == 8 { 270 | result = (result << 8) | current_byte; 271 | } else { 272 | result = (result << 7) | (current_byte & 0b0111_1111); 273 | } 274 | 275 | offset += 1; 276 | size += 1; 277 | 278 | if current_byte & 0b1000_0000 == 0 { 279 | break; 280 | } 281 | } 282 | 283 | (size, result) 284 | } 285 | ``` 286 | 287 | To read a varint, we copy the 7 least significant bits of each byte to the result, as long as the most significant bit is set. As the maximum length of a varint is 9 bytes, keep track of 288 | the number of bytes visited and stop after a maximum of 9 bytes. Note that to 289 | complete a 64 bits value, we need the first 7 bits of the first 8 bytes 290 | and all the bits of the last byte. That's why we test the current size 291 | of the varint at each iteration and add a special case for the last byte (when `size == 8`). 292 | 293 | 294 | We can finally implement the pager itself. 
For now, it only loads and caches pages without 295 | any eviction policy: 296 | 297 | ```rust 298 | // pager.rs 299 | #[derive(Debug, Clone)] 300 | pub struct Pager { 301 | input: I, 302 | page_size: usize, 303 | pages: HashMap, 304 | } 305 | 306 | impl Pager { 307 | pub fn new(input: I, page_size: usize) -> Self { 308 | Self { 309 | input, 310 | page_size, 311 | pages: HashMap::new(), 312 | } 313 | } 314 | 315 | pub fn read_page(&mut self, n: usize) -> anyhow::Result<&page::Page> { 316 | if self.pages.contains_key(&n) { 317 | return Ok(self.pages.get(&n).unwrap()); 318 | } 319 | 320 | let page = self.load_page(n)?; 321 | self.pages.insert(n, page); 322 | Ok(self.pages.get(&n).unwrap()) 323 | } 324 | 325 | fn load_page(&mut self, n: usize) -> anyhow::Result { 326 | let offset = n.saturating_sub(1) * self.page_size; 327 | 328 | self.input 329 | .seek(SeekFrom::Start(offset as u64)) 330 | .context("seek to page start")?; 331 | 332 | let mut buffer = vec![0; self.page_size]; 333 | self.input.read_exact(&mut buffer).context("read page")?; 334 | 335 | parse_page(&buffer, n) 336 | } 337 | } 338 | ``` 339 | 340 | ## Records 341 | 342 | We now have a way to read pages, and to access the pages cells. But how to decode the values of the cells? 343 | Each cell contains the value of a row in the table, encoded using 344 | the [SQLite record format](https://www.sqlite.org/fileformat2.html#record_format). 345 | The record format is quite simple: a record consists of a header, followed by a sequence of field values. 
346 | The header starts with a varint representing the byte size of the header, followed by a sequence 347 | of varints -one per column- determining the type of each column according to the following table: 348 | 349 | - 0: NULL 350 | - 1: 8-bits signed integer 351 | - 2: 16-bits signed integer 352 | - 3: 24-bits signed integer 353 | - 4: 32-bits signed integer 354 | - 5: 48-bits signed integer 355 | - 6: 64-bits signed integer 356 | - 7: 64-bits IEEE floating point number 357 | - 8: value is the integer 0 358 | - 9: value is the integer 1 359 | - 10 & 11: reserved for internal use 360 | - n with n even and n > 12: BLOB of size (n - 12) / 2 361 | - n with n odd and n > 13: text of size (n - 13) / 2 362 | 363 | We now have all the information we need to parse and represent record's headers: 364 | 365 | ```rust 366 | // src/cursor.rs 367 | #[derive(Debug, Copy, Clone)] 368 | pub enum RecordFieldType { 369 | Null, 370 | I8, 371 | I16, 372 | I24, 373 | I32, 374 | I48, 375 | I64, 376 | Float, 377 | Zero, 378 | One, 379 | String, 380 | Blob, 381 | } 382 | 383 | #[derive(Debug, Clone)] 384 | pub struct RecordField { 385 | pub offset: usize, 386 | pub field_type: RecordFieldType, 387 | } 388 | 389 | #[derive(Debug, Clone)] 390 | pub struct RecordHeader { 391 | pub fields: Vec, 392 | } 393 | 394 | fn parse_record_header(mut buffer: &[u8]) -> anyhow::Result { 395 | let (varint_size, header_length) = crate::pager::read_varint_at(buffer, 0); 396 | buffer = &buffer[varint_size as usize..header_length as usize]; 397 | 398 | let mut fields = Vec::new(); 399 | let mut current_offset = header_length as usize; 400 | 401 | while !buffer.is_empty() { 402 | let (discriminant_size, discriminant) = crate::pager::read_varint_at(buffer, 0); 403 | buffer = &buffer[discriminant_size as usize..]; 404 | 405 | let (field_type, field_size) = match discriminant { 406 | 0 => (RecordFieldType::Null, 0), 407 | 1 => (RecordFieldType::I8, 1), 408 | 2 => (RecordFieldType::I16, 2), 409 | 3 => 
(RecordFieldType::I24, 3), 410 | 4 => (RecordFieldType::I32, 4), 411 | 5 => (RecordFieldType::I48, 6), 412 | 6 => (RecordFieldType::I64, 8), 413 | 7 => (RecordFieldType::Float, 8), 414 | 8 => (RecordFieldType::Zero, 0), 415 | 9 => (RecordFieldType::One, 0), 416 | n if n >= 12 && n % 2 == 0 => { 417 | let size = ((n - 12) / 2) as usize; 418 | (RecordFieldType::Blob(size), size) 419 | } 420 | n if n >= 13 && n % 2 == 1 => { 421 | let size = ((n - 13) / 2) as usize; 422 | (RecordFieldType::String(size), size) 423 | } 424 | n => anyhow::bail!("unsupported field type: {}", n), 425 | }; 426 | 427 | fields.push(RecordField { 428 | offset: current_offset, 429 | field_type, 430 | }); 431 | 432 | current_offset += field_size; 433 | } 434 | 435 | Ok(RecordHeader { fields }) 436 | } 437 | ``` 438 | 439 | To make it easier to work with records, we'll define a `Value` type, representing field values 440 | and a `Cursor` struct that uniquely identifies a record within a database file. The `Cursor` 441 | will expose a `field` method, returning the value of the record's n-th field: 442 | 443 | ```rust 444 | // src/value.rs 445 | use std::borrow::Cow; 446 | 447 | #[derive(Debug, Clone)] 448 | pub enum Value<'p> { 449 | Null, 450 | String(Cow<'p, str>), 451 | Blob(Cow<'p, [u8]>), 452 | Int(i64), 453 | Float(f64), 454 | } 455 | 456 | impl<'p> Value<'p> { 457 | pub fn as_str(&self) -> Option<&str> { 458 | if let Value::String(s) = self { 459 | Some(s.as_ref()) 460 | } else { 461 | None 462 | } 463 | } 464 | } 465 | 466 | ``` 467 | 468 | ```rust 469 | // src/cursor.rs 470 | #[derive(Debug)] 471 | pub struct Cursor<'p> { 472 | header: RecordHeader, 473 | pager: &'p mut Pager, 474 | page_index: usize, 475 | page_cell: usize, 476 | } 477 | 478 | impl<'p> Cursor<'p> { 479 | pub fn field(&mut self, n: usize) -> Option { 480 | let record_field = self.header.fields.get(n)?; 481 | 482 | let payload = match self.pager.read_page(self.page_index) { 483 | Ok(Page::TableLeaf(leaf)) => 
&leaf.cells[self.page_cell].payload, 484 | _ => return None, 485 | }; 486 | 487 | match record_field.field_type { 488 | RecordFieldType::Null => Some(Value::Null), 489 | RecordFieldType::I8 => Some(Value::Int(read_i8_at(payload, record_field.offset))), 490 | RecordFieldType::I16 => Some(Value::Int(read_i16_at(payload, record_field.offset))), 491 | RecordFieldType::I24 => Some(Value::Int(read_i24_at(payload, record_field.offset))), 492 | RecordFieldType::I32 => Some(Value::Int(read_i32_at(payload, record_field.offset))), 493 | RecordFieldType::I48 => Some(Value::Int(read_i48_at(payload, record_field.offset))), 494 | RecordFieldType::I64 => Some(Value::Int(read_i64_at(payload, record_field.offset))), 495 | RecordFieldType::Float => Some(Value::Float(read_f64_at(payload, record_field.offset))), 496 | RecordFieldType::String(length) => { 497 | let value = std::str::from_utf8( 498 | &payload[record_field.offset..record_field.offset + length], 499 | ).expect("invalid utf8"); 500 | Some(Value::String(Cow::Borrowed(value))) 501 | } 502 | RecordFieldType::Blob(length) => { 503 | let value = &payload[record_field.offset..record_field.offset + length]; 504 | Some(Value::Blob(Cow::Borrowed(value))) 505 | } 506 | _ => panic!("unimplemented"), 507 | } 508 | } 509 | } 510 | 511 | fn read_i8_at(input: &[u8], offset: usize) -> i64 { 512 | input[offset] as i64 513 | } 514 | 515 | fn read_i16_at(input: &[u8], offset: usize) -> i64 { 516 | i16::from_be_bytes(input[offset..offset + 2].try_into().unwrap()) as i64 517 | } 518 | 519 | fn read_i24_at(input: &[u8], offset: usize) -> i64 { 520 | (i32::from_be_bytes(input[offset..offset + 3].try_into().unwrap()) & 0x00FFFFFF) as i64 521 | } 522 | 523 | fn read_i32_at(input: &[u8], offset: usize) -> i64 { 524 | i32::from_be_bytes(input[offset..offset + 4].try_into().unwrap()) as i64 525 | } 526 | 527 | fn read_i48_at(input: &[u8], offset: usize) -> i64 { 528 | i64::from_be_bytes(input[offset..offset + 6].try_into().unwrap()) & 
0x0000FFFFFFFFFFFF 529 | } 530 | 531 | fn read_i64_at(input: &[u8], offset: usize) -> i64 { 532 | i64::from_be_bytes(input[offset..offset + 8].try_into().unwrap()) 533 | } 534 | 535 | fn read_f64_at(input: &[u8], offset: usize) -> f64 { 536 | f64::from_be_bytes(input[offset..offset + 8].try_into().unwrap()) 537 | } 538 | ``` 539 | 540 | To simplify iteration over a page's records, we'll also implement a `Scanner` struct that 541 | wraps a page and allows us to get a `Cursor` for each record: 542 | 543 | ```rust 544 | // src/cursor.rs 545 | #[derive(Debug)] 546 | pub struct Scanner<'p> { 547 | pager: &'p mut Pager, 548 | page: usize, 549 | cell: usize, 550 | } 551 | 552 | impl<'p> Scanner<'p> { 553 | pub fn new(pager: &'p mut Pager, page: usize) -> Scanner<'p> { 554 | Scanner { 555 | pager, 556 | page, 557 | cell: 0, 558 | } 559 | } 560 | pub fn next_record(&mut self) -> Option> { 561 | let page = match self.pager.read_page(self.page) { 562 | Ok(page) => page, 563 | Err(e) => return Some(Err(e)), 564 | }; 565 | 566 | match page { 567 | Page::TableLeaf(leaf) => { 568 | let cell = leaf.cells.get(self.cell)?; 569 | 570 | let header = match parse_record_header(&cell.payload) { 571 | Ok(header) => header, 572 | Err(e) => return Some(Err(e)), 573 | }; 574 | 575 | let record = Cursor { 576 | header, 577 | pager: self.pager, 578 | page_index: self.page, 579 | page_cell: self.cell, 580 | }; 581 | 582 | self.cell += 1; 583 | 584 | Some(Ok(record)) 585 | } 586 | } 587 | } 588 | } 589 | ``` 590 | 591 | ## Table descriptions 592 | 593 | With most of the leg work out of the way, we can get back to our original goal: listing tables. 594 | SQLite stores the schema of a database in a special table called `sqlite_master`. 
595 | The schema for the `sqlite_master` table is as follows: 596 | 597 | ```sql 598 | CREATE TABLE sqlite_schema( 599 | type text, 600 | name text, 601 | tbl_name text, 602 | rootpage integer, 603 | sql text 604 | ); 605 | ``` 606 | 607 | These columns are used as follows: 608 | 609 | - `type`: the type of the schema object. For tables, this will always be `table`. 610 | - `name`: the name of the schema object. 611 | - `tbl_name`: the name of the table the schema object is associated with. In the case of tables, this will be the same 612 | as `name`. 613 | - `rootpage`: root page of the table, we'll use it later to read the table's content. 614 | - `sql`: the SQL statement used to create the table. 615 | 616 | Since our simple database only handles basic schemas for now, we can assume that the entire 617 | schema fits in the first page of our database file. 618 | In order to list the tables in the database, we'll need to: 619 | 620 | - initialize the pager with the database file 621 | - create a `Scanner` for the first page 622 | - iterate over the records, and print the value of the `name` field (at index 1) for each record. 
623 | 624 | First, we'll define a `Db` struct to hold our global state: 625 | 626 | ```rust 627 | // src/db.rs 628 | use std::{io::Read, path::Path}; 629 | 630 | use anyhow::Context; 631 | 632 | use crate::{cursor::Scanner, page::DbHeader, pager, pager::Pager}; 633 | 634 | pub struct Db { 635 | pub header: DbHeader, 636 | pager: Pager, 637 | } 638 | 639 | impl Db { 640 | pub fn from_file(filename: impl AsRef) -> anyhow::Result { 641 | let mut file = std::fs::File::open(filename.as_ref()).context("open db file")?; 642 | 643 | let mut header_buffer = [0; pager::HEADER_SIZE]; 644 | file.read_exact(&mut header_buffer) 645 | .context("read db header")?; 646 | 647 | let header = pager::parse_header(&header_buffer).context("parse db header")?; 648 | 649 | let pager = Pager::new(file, header.page_size as usize); 650 | 651 | Ok(Db { header, pager }) 652 | } 653 | 654 | pub fn scanner(&mut self, page: usize) -> Scanner { 655 | Scanner::new(&mut self.pager, page) 656 | } 657 | } 658 | ``` 659 | 660 | The implementation of a basic REPL supporting the `tables` and `tables` commands is straightforward: 661 | 662 | ```rust 663 | use std::io::{stdin, BufRead, Write}; 664 | 665 | use anyhow::Context; 666 | 667 | mod cursor; 668 | mod db; 669 | mod page; 670 | mod pager; 671 | mod value; 672 | 673 | fn main() -> anyhow::Result<()> { 674 | let database = db::Db::from_file(std::env::args().nth(1).context("missing db file")?)?; 675 | cli(database) 676 | } 677 | 678 | fn cli(mut db: db::Db) -> anyhow::Result<()> { 679 | print_flushed("rqlite> ")?; 680 | 681 | let mut line_buffer = String::new(); 682 | 683 | while stdin().lock().read_line(&mut line_buffer).is_ok() { 684 | match line_buffer.trim() { 685 | ".exit" => break, 686 | ".tables" => display_tables(&mut db)?, 687 | _ => { 688 | println!("Unrecognized command '{}'", line_buffer.trim()); 689 | } 690 | } 691 | 692 | print_flushed("\nrqlite> ")?; 693 | 694 | line_buffer.clear(); 695 | } 696 | 697 | Ok(()) 698 | } 699 | 700 | fn 
display_tables(db: &mut db::Db) -> anyhow::Result<()> { 701 | let mut scanner = db.scanner(1); 702 | 703 | while let Some(Ok(mut record)) = scanner.next_record() { 704 | let type_value = record 705 | .field(0) 706 | .context("missing type field") 707 | .context("invalid type field")?; 708 | 709 | if type_value.as_str() == Some("table") { 710 | let name_value = record 711 | .field(1) 712 | .context("missing name field") 713 | .context("invalid name field")?; 714 | 715 | print!("{} ", name_value.as_str().unwrap()); 716 | } 717 | } 718 | 719 | Ok(()) 720 | } 721 | 722 | fn print_flushed(s: &str) -> anyhow::Result<()> { 723 | print!("{}", s); 724 | std::io::stdout().flush().context("flush stdout") 725 | } 726 | ``` 727 | 728 | ## Conclusion 729 | 730 | The first part of our SQLite-compatible database is now complete. We can read the database header, 731 | parse table btree-leaf pages and decode records, but we still have a long way to go before we can 732 | support rich queries. In the next part, we'll learn how to parse the SQL language and make 733 | our first strides towards implementing the `SELECT` statement! 734 | --------------------------------------------------------------------------------