├── .github └── workflows │ └── run-ci.yml ├── LICENSE ├── Makefile ├── README.md ├── docs ├── architecture.md ├── btree-structural-ops.txt ├── btree-variable-length-cells.txt ├── datatypes.md ├── eng-journal.txt ├── file-format.txt ├── file-header.txt ├── functions.txt ├── future-work.md ├── leardb_architecture.png ├── learndb-sql-notes.txt ├── reference.md ├── scoping-and-name-resolution.txt ├── select-clause-evaluation.txt ├── sql-lang.md ├── storing-diff-datatypes.txt ├── stress-tests.txt ├── to_multiple_dynamic_tables.txt ├── tutorial.md └── why.md ├── learndb ├── __init__.py ├── btree.py ├── constants.py ├── cursor.py ├── dataexchange.py ├── datatypes.py ├── expression_interpreter.py ├── functions.py ├── interface.py ├── lang_parser │ ├── __init__.py │ ├── grammar.py │ ├── sqlhandler.py │ ├── symbols.py │ ├── utils.py │ └── visitor.py ├── name_registry.py ├── pager.py ├── pipe.py ├── record_utils.py ├── schema.py ├── semantic_analysis.py ├── serde.py ├── statemanager.py ├── stress.py ├── value_generators.py ├── virtual_machine.py └── vm_utils.py ├── requirements.txt ├── run_learndb.py ├── setup.py ├── sqls ├── employees.sql └── employees2.sql ├── tasks.md └── tests ├── __init__.py ├── btree_tests.py ├── context.py ├── e2e_suite1_tests.py ├── e2e_suite2_tests.py ├── lang_tests.py ├── pager_tests.py ├── serde_tests.py └── test_constants.py /.github/workflows/run-ci.yml: -------------------------------------------------------------------------------- 1 | # This workflow will install Python dependencies, run tests and lint with a single version of Python 2 | # For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions 3 | 4 | name: Learndb build 5 | 6 | on: 7 | push: 8 | branches: [ main ] 9 | pull_request: 10 | branches: [ main ] 11 | 12 | jobs: 13 | build: 14 | runs-on: ubuntu-latest 15 | steps: 16 | - uses: actions/checkout@v2 17 | - name: Set up Python 3.11 18 | uses: actions/setup-python@v2 19 | with: 20 | python-version: 3.11 21 | - name: Install dependencies 22 | run: | 23 | python -m pip install --upgrade pip 24 | python -m pip install -r requirements.txt 25 | python3 -m pip install -e . 26 | pip install flake8 pytest # pytest-cov 27 | pip install pytest 28 | if [ -f requirements.txt ]; then pip install -r requirements.txt; fi 29 | - name: Flake8 30 | run: | 31 | flake8 learndb --ignore E501,E203,W503 32 | - name: All Tests 33 | run: | 34 | python -m pytest ./tests/*.py --cache-clear -v -x 35 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Begin license text. 2 | Copyright (c) [2023] [Spandan Bemby] 3 | 4 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 5 | 6 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 7 | 8 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 9 | 10 | End license text. 11 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: black 2 | black: 3 | black learndb 4 | 5 | .PHONY: lint 6 | lint: 7 | flake8 ./learndb --ignore E501,E203,W503 8 | 9 | .PHONY: repl 10 | repl: 11 | python run_learndb.py repl 12 | 13 | .PHONY: tests 14 | tests: 15 | python -m pytest tests/*.py 16 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # LearnDB 2 | 3 | > What I Cannot Create, I Do Not Understand -Richard Feynman 4 | 5 | In the spirit of Feynman's immortal words, the goal of this project is to better understand the internals of databases by 6 | implementing a relational database management system (RDBMS) (sqlite clone) from scratch. 7 | 8 | This project was motivated by a desire to: 1) understand databases more deeply and 2) work on a fun project. These dual 9 | goals led to a: 10 | - relatively simple code base 11 | - relatively complete RDBMS implementation 12 | - written in pure python 13 | - No build step 14 | - zero configuration 15 | - configuration can be overriden 16 | 17 | This makes the learndb codebase great for tinkering with. But the product has some key limitations that means it 18 | shouldn't be used as an actual storage solution. 19 | 20 | ### Features 21 | 22 | Learndb supports the following: 23 | 24 | - it has a rich sql (learndb-sql) with support for `select, from, where, group by, having, limit, order by` 25 | - custom lexer and parser built using [`lark`](https://github.com/lark-parser/lark) 26 | - at a high-level, there is an engine that can accept some SQL statements. These statements expresses operations on a 27 | database (a collection of tables which contain data) 28 | - allows users/agents to connect to RDBMS in multiple ways: 29 | - REPL 30 | - importing python module 31 | - passing a file of commands to the engine 32 | - on-disk btree implementation as backing data structure 33 | 34 | ### Limitations 35 | 36 | - Very simplified [^1] implementation of floating point number arithmetic, e.g. compared to 37 | [IEEE754](https://en.wikipedia.org/wiki/IEEE_754)). 38 | - No support for common utility features, like wildcard column expansion, e.g. `select * ...` 39 | - More [limitations](./docs/tutorial.md) 40 | 41 | ## Getting Started: Tinkering and Beyond 42 | 43 | - To get started with `learndb` first start with [`tutorial.md`](docs/tutorial.md). 44 | - Then to understand the system at a deeper technical level read [`reference.md`](docs/reference.md). 45 | This is essentially a complete reference manual directed at a user of the system. This outlines the operations and 46 | capabilities of the system. It also describes what is (un)supported and undefined behavior. 47 | - `Architecture.md`` - this provides a component level breakdown of the repo and the system 48 | 49 | ## Hacking 50 | 51 | ### Install 52 | - System requirements 53 | - requires a linux/macos system, since it uses `fcntl` to get exclusive read access on database file 54 | - python >= 3.9 55 | - To install for development, i.e. 
src can be edited from without having to reinstall: 56 | - `cd ` 57 | - create virtualenv: `python3 -m venv venv ` 58 | - activate venv: `source venv/bin/activate` 59 | - install requirements: `python -m pip install -r requirements.txt` 60 | - install `Learndb` in edit mode: `python3 -m pip install -e .` 61 | 62 | ### Run REPL 63 | 64 | ``` 65 | source venv/bin/activate 66 | python run_learndb.py repl 67 | ``` 68 | 69 | ### Run Tests 70 | 71 | - Run all tests: 72 | - `python -m pytest tests/*.py` 73 | 74 | - Run btree tests: 75 | -`python -m pytest -s tests/btree_tests.py` # stdout 76 | - `python -m pytest tests/btree_tests.py` # suppressed out 77 | 78 | - Run end-to-end tests: 79 | `python -m pytest -s tests/e2e_tests.py` 80 | 81 | - Run end-to-end tests (employees): 82 | `python -m pytest -s tests/e2e_tests_employees.py` 83 | 84 | `python -m pytest -s tests/e2e_tests_employees.py -k test_equality_select` 85 | 86 | - Run serde tests: 87 | `... serde_tests.py` 88 | 89 | - Run language parser tests: 90 | `... lang_tests.py` 91 | 92 | - Run specific test: 93 | `python -m pytest tests.py -k test_name` 94 | 95 | - Clear pytest cache 96 | `python -m pytest --cache-clear` 97 | 98 | 99 | ## References consulted 100 | 101 | - I started this project by following cstack's awesome [tutorial](https://cstack.github.io/db_tutorial/) 102 | - Later I was primarily referencing: [SQLite Database System: Design and Implementation (1st ed)](https://books.google.com/books?id=9Z6IQQnX1JEC&source=gbs_similarbooks) 103 | - Sqlite file format: [docs](https://www.sqlite.org/fileformat2.html) 104 | - Postgres for how certain SQL statements are implemented and how their [documentation](https://www.postgresql.org/docs/11/index.html) is organized 105 | 106 | ## Project Management 107 | - immanent work/issues are tracked in `tasks.md` 108 | - long-term ideas are tracked in `docs/future-work.md` 109 | 110 | [^1]: When evaluating the difference between two floats, e.g. `3.2 > 4.2`, I consider the condition True if the 111 | difference between the two is some fixed delta. The accepted epsilon should scale with the magnitude of the number 112 | -------------------------------------------------------------------------------- /docs/architecture.md: -------------------------------------------------------------------------------- 1 | # Architecture of LearnDB 2 | 3 | The goal of this document is to give a breakdown of the different components of `Learndb`. 4 | 5 | We can consider Learndb an RDBMS (relational database management system)- a system for managing the storage of structured data. 6 | 7 | ## Data Flow 8 | 9 | ![](./leardb_architecture.png) 10 | 11 | To better understand the architecture, let's consider how the user interacts with the system in general. 12 | 1. The user interacts via one of the many interfaces 13 | 2. Interface receives either: 1) meta command (administrative tasks) or 2) sql program (operate on database) 14 | 3. If the input in 2 was SQL, this is parsed into an AST by the SQL Parser module. 15 | 4. ^ 16 | 5. AST is executed by Virtual Machine 17 | 6. 
Which operates on a stack of abstractions of storage, which ground out on a single file on the local file system 18 | - State Manager 19 | - B-Tree 20 | - Pager 21 | - File 22 | 23 | ## Component Breakdown 24 | 25 | Learndb can be decomposed into the four logical areas for: 26 | - storing data 27 | - interfacing with user 28 | - parsing user input (SQL) 29 | - computing user queries over stored data 30 | 31 | ### Storage 32 | 33 | #### Filesystem 34 | 35 | - File system provides access to create database file 36 | - lowest layer of storage hierarchy 37 | - a single db corresponds to a single file 38 | - state of the database is persisted in a single file. But for the execution 39 | of some statement, the state is held across memory and disk. Only when the system is closed, is the state of the database 40 | persisted to disk. 41 | 42 | #### Pager 43 | - manages IO to database file 44 | - expose db file as a set of pages (fixed size blocks) 45 | - pages are referenced by their page_number 46 | - a page (with page number page_num) are the bytes in the file from byte offset `page_num * PAGE_SIZE` to `(page_num + 1) * PAGE_SIZE]` 47 | 48 | #### B-tree 49 | - represents an ordered set of key-value pairs 50 | - on-disk data structure, optimized for efficient insertion and retrieval of ordered data (O(lgn)) 51 | - one table corresponds to one b-tree 52 | - a b-tree consists of multiple nodes organized in a tree structure 53 | - a node contains many key-value pairs. 54 | - key is the unique, orderable primary key associated with the table; value is the structure/mapping of the other column names and values. 55 | - b-tree interfaces with pager to get pages. 56 | - each b-tree node corresponds 1:1 with one page 57 | - each row (in a table) is encoded such that the key is the primary key of the table, and the value is an encoding of the rest of the columns/fields in the row 58 | 59 | #### State Manager 60 | - provides a higher level abstraction over database 61 | - understands that a database has many tables; each with a different schema and b-tree 62 | - provides lookup to schema and b-tree, by table name, such that virtual machine can operate on them 63 | - Understands how to read the catalog 64 | - catalog is a special table with a hardcorded location (page_number), which contains the definition of other tables and their locations 65 | - location refers to the page number 66 | - Two levels of access: metadata (resolving table tree from name) and data (operate on a sequence of rows, that allow fast (lgn) search along certain dimensions 67 | 68 | 69 | ### User Interface 70 | 71 | - REPL 72 | - file 73 | 74 | ### SQL Parser 75 | 76 | ### Parser 77 | - converts user specified sql into an AST. 78 | - The AST is the representation that the VM operates on. 79 | 80 | ### Compute 81 | 82 | #### Virtual Machine (VM) 83 | - VM executes user sql on database state 84 | - The VM takes an AST (instructions), and a database 85 | (represented by a file) and runs the instructions over the database, in the process evolving the database. 86 | 87 | 88 | ## Flows 89 | Next, we will consider some typical flow, to highlight how different components interact 90 | - define a database 91 | - define a table 92 | - insert some records into table 93 | - delete some records 94 | - read contents of a table 95 | 96 | ### Creating a Database 97 | Currently, a database is associated with a single db file. So a database file implicitly corresponds to one database. 
98 | The database file has a header; when the header is set- database is initialized 99 | - a file has pages 100 | - a page is a fixed size contiguous chunk of the file. 101 | 102 | ### Defining a Table 103 | There is a hardcoded table called catalog. Hardcoded means that it has a fixed root page number for the tree. 104 | When the user requests a new table be created, the Virtual Machine creates a b-tree corresponding to this table; i.e. the root node of the tree is allocated and page number of the root is the location of the table. 105 | The location along with the schema are stored in the catalog. 106 | 107 | ### Inserting Record 108 | 109 | Virtual Machine (VM) looks up the schema of the target tables. And checks the input for schema compatibility, e.g. primary key is unique. 110 | The key (the primary key of the table) and value (struct/dictionary of all other column name-value pairs) are serialized. 111 | The VM then invokes the insert method on the b-tree. 112 | 113 | ### Reading Records 114 | Virtual Machine (VM) looks up the b-tree of the requested tables. It iterates over the b-tree using a cursor and fills 115 | a buffer with record objects. It then does a second pass over the records to create new record objects with only the requested fields. 116 | 117 | ### Deleting Records 118 | Virtual Machine (VM) looks up the b-tree of the requested tables. It then invokes delete on the b-tree. 119 | -------------------------------------------------------------------------------- /docs/btree-structural-ops.txt: -------------------------------------------------------------------------------- 1 | The goal of this doc is to explain how the btree structure changes when keys are inserted 2 | and deleted. 3 | 4 | The ops in questions are insert, split, delete, and compact operations, and create and delete root 5 | operations on leaf and internal node 6 | 7 | # Insert And Split 8 | 9 | Given a new cell (and an included key), we find the node and cell_num on the tree 10 | where the new cell can be placed. 11 | 12 | Note, free space is maintained in 2 places: allocation block, and the free (linked) list. 13 | 14 | For the new cell, we first try to provision it on the free list 15 | To do so, we iterate over free list and find the first block of equal or greater size. 16 | If we find a block, we fragment it (being mindful of minimum block size), update the free 17 | list, and total_space_on_free_list 18 | 19 | If this does not satisfy, we check if the allocation block has enough space. If it does, we allocate 20 | the cell there. Then, update the alloc_ptr 21 | 22 | If this does not satisfy, we check if total free space (i.e. sum of all space in allocation block + free list) will 23 | satisfy the cell. If so, we compact the node, and insert the cell. 24 | 25 | If, the total free space on a cell does not satisfy. Then we must "split" the node. The split operation will create 26 | 2, 3 new nodes and recycle the old node. 27 | 28 | The children nodes must be passed to parent insert method, i.e. internal_node_insert_node will have to be updated to handle 29 | 2 or 3 children. 30 | 31 | # todo: complete me -------------------------------------------------------------------------------- /docs/btree-variable-length-cells.txt: -------------------------------------------------------------------------------- 1 | Currently, the btree supports fixed length cells. 
2 | To support variable length cells, the pages will be organized like: 3 | - (low address) header, cell pointer array, unallocated space, cells (high address) 4 | 5 | The current leaf node header layout is: 6 | 7 | nodetype .. is_root .. parent_pointer 8 | num_keys .. key 0 .. val 0 .. key N-1 val N-1 9 | 10 | --- 11 | 12 | To support variable length cells, the leaf must be 13 | formatted like: 14 | 15 | nodetype .. is_root .. parent_pointer 16 | num_cells .. alloc_ptr .. free_list_head_ptr .. total_bytes_in_free_list 17 | ... 18 | cellptr_0, cellptr_1,... cellptr_N 19 | ... 20 | unallocated-space 21 | ... 22 | cells 23 | 24 | --- 25 | Header field description 26 | 27 | NOTE: free space is managed in two ways: 28 | - first is the allocation block. On an empty page, this starts at the high address and grows towards low addresses, 29 | until it reaches the cell ptrs. 30 | - second is converting the fixed single block into multiple fixed-sized free lists. 31 | - When records are deleted, there will be free spaces between allocated cells 32 | - these are managed through a free-list 33 | - each node needs to store: 1) size of current block, 2) offset to next block 34 | - I'm sticking with a whole word (for ease) for both of these, i.e. 8Bytes. 35 | - That means blocks below 8B cannot be allocated 36 | 37 | - we will need to defragement the space 38 | - either periodically or lazily as needed 39 | 40 | alloc_ptr 41 | - this is the beginning of unallocated space 42 | - the free space grows towards lower addresses 43 | - the free space ends (implicit) at the end of cell ptr area 44 | 45 | free_block_ptr 46 | - offset to first free block, i.e. head of free list 47 | - free blocks are randomly interspersed between cells 48 | 49 | cell_ptr 50 | - absolute (?) offset to cell location 51 | 52 | 53 | NOTE: sqlite uses an "offset to first byte of cell content area". This seems to be only needed if cellptr is not big enough 54 | to provide a direct/absolute offset. For a page of size 4096, I need 12 bits to index anywhere. A word sized (32 bits) ptr 55 | can index into a page of size 4B. One reason sqlite may do so is because they use variable-length integer encoding, 56 | which would save much space. 57 | 58 | 59 | - here the cell ptr is a fixed size value representing an offset on page 60 | 61 | 62 | - how should free space be managed? 63 | - one approach is to keep a massive blob of free space, and then allocate chunks as needed 64 | - this requires periodic defragmentation 65 | - second approach is to keep multiple lists of different fix sizes. Then space allocation is just about looking 66 | in the closest sized list 67 | - this leads to faster allocation, but wasted space 68 | - another challenge here is that I need to have a sense of what sizes list to create 69 | - there will always be a space/time tradeoffs between these and countless variations: 70 | - e.g. I can store more control info, e.g. keep more granular size lists but this has 71 | a direct space cost, but leads to faster allocation 72 | -------------------------------------------------------------------------------- /docs/datatypes.md: -------------------------------------------------------------------------------- 1 | This documents all the datatypes. 2 | 3 | integer: 4 | - integral number 5 | - lower_bound: ??? 6 | - upper_bound: ??? 
7 | 8 | 9 | real: 10 | - floating point number 11 | - lower_bound 12 | - upper_bound 13 | - notes: real numbers within real_epsilon of each other 14 | are handled as equal 15 | 16 | -------------------------------------------------------------------------------- /docs/file-format.txt: -------------------------------------------------------------------------------- 1 | Overview 2 | -------- 3 | I want to explicate the file format details for learndb. It's largely inspired by sqlite- with many simplifications. 4 | NOTE: the details of the file format may be contained in other docs; but this doc is standalone and authoritative. 5 | 6 | Internal Node 7 | ------------- 8 | The current internal nodes will largely be unchanged i.e. 9 | 10 | nodetype .. is_root .. parent_pointer 11 | num_keys .. right-child-ptr 12 | ptr 0 .. key 0 .. ptr N-1 key N-1 13 | 14 | Internal nodes require keys to be word-sized integers 15 | 16 | 17 | Leaf Node Layout 18 | ----------------- 19 | Leaf nodes will be formatted like: 20 | 21 | - (low address) header, cell pointer array, unallocated space, cells (high address) 22 | - header: 23 | nodetype .. is_root .. parent_pointer 24 | num_cells .. alloc_ptr .. free_list_head_ptr .. total_free_list_space 25 | ... 26 | cellptr_0, cellptr_1,... cellptr_N 27 | ... 28 | unallocated-space 29 | ... 30 | cells 31 | 32 | Leaf Node Cell Ptrs 33 | ------------------- 34 | - cell ptrs are sorted by key (word size); 35 | - contain absolute page offset to cell 36 | 37 | Leaf Node Cell 38 | -------------- 39 | - cell -> [key-size, data-size, key-payload, data-payload)] 40 | -- data payload -> [header, body] 41 | -- data header -> [size of header, serial types (size of variable length value)] 42 | -- data body -> concatenated bytes of serialized values (in definition order) 43 | -- all data must fit in a cell, i.e. no overflow- this limits the max content size to what can fit in a single cell 44 | -- NOTE: the key size is fixed to word size for now 45 | -- encoding key size will make supporting variable length keys easier 46 | -- variable sized keys would also require updating internal node logic 47 | 48 | 49 | Serial Types 50 | ------------ 51 | sqlite for inspiration (https://www.sqlite.org/fileformat2.html#record_format) 52 | 53 | serial-type byte-length datatype 54 | 0 0 Null 55 | 1 4 Integer 56 | 2 4 Real 57 | 3 var Text 58 | 4 var Blob 59 | 60 | Types with a fixed-value, e.g. null will not be encoded in the data payload. 61 | 62 | 63 | Free Space on Leaf Nodes 64 | ------------------------ 65 | Free space is managed via: 66 | 1) Allocation block - 67 | - bounded by the right most (maximal address) of a cell ptr 68 | - bounded by first allocation cell 69 | - Allocation ptr (alloc_ptr) moves towards smaller addresses 70 | - alloc_ptr points past the first allocatable byte, i.e. on page of size 4096 it is initialized to 4096. 71 | - the last indexable byte is 4095 72 | - to insert a blob of size N, e.g. 8, we write to address: [alloc_ptr - N: alloc_ptr] 73 | 74 | 2) Free list - singly-linked list 75 | - each free block stores: 1) size of current block, 2) location (abs offset) of next block 76 | i.e. block-header -> block-size, next-block-ptr 77 | - also store total number of free bytes (in header) 78 | - NOTE: cells are never fragmented 79 | 80 | 81 | Storage 82 | ------- 83 | Most fields will be a word length, unless otherwise noted. 
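
Example: Packing a Leaf Cell (illustrative sketch)
---------------------------------------------------
To make the cell layout concrete, below is a minimal Python sketch of how a leaf cell could be
packed per the format above. This is NOT learndb's actual serde code; the function name, the
4-byte little-endian word size, and the choice to count the size field itself inside
"size of header" are assumptions made purely for illustration.

```
import struct

# assumed serial types, per the table above: Null and fixed-value types add no body bytes
SERIAL_NULL, SERIAL_INT, SERIAL_REAL, SERIAL_TEXT = 0, 1, 2, 3


def serialize_cell(key: int, values: list) -> bytes:
    """Illustrative sketch: cell -> [key-size, data-size, key-payload, data-payload]."""
    serial_types = b""
    body = b""
    for val in values:  # values in column-definition order
        if val is None:
            # fixed-value types (e.g. Null) are not encoded in the data payload
            serial_types += struct.pack("<i", SERIAL_NULL)
        elif isinstance(val, int):
            serial_types += struct.pack("<i", SERIAL_INT)
            body += struct.pack("<i", val)
        elif isinstance(val, float):
            serial_types += struct.pack("<i", SERIAL_REAL)
            body += struct.pack("<f", val)
        else:  # text: serial type is followed by the byte-length of the value
            encoded = str(val).encode("utf-8")
            serial_types += struct.pack("<ii", SERIAL_TEXT, len(encoded))
            body += encoded
    # data header -> [size of header, serial types]; assume size counts the size field itself
    data_header = struct.pack("<i", 4 + len(serial_types)) + serial_types
    data_payload = data_header + body
    # key size is fixed to one word for now, per the note above
    return struct.pack("<iii", 4, len(data_payload), key) + data_payload
```

See learndb/serde.py for the actual encoding logic used by the engine.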
-------------------------------------------------------------------------------- /docs/file-header.txt: -------------------------------------------------------------------------------- 1 | This outlines the file header for the learndb file. 2 | 3 | Motivation 4 | ---------- 5 | The file header is motivated by how free pages are managed. When a node is removed from a btree, the underlying page 6 | is returned to the pager to be recycled. This list of free pages is kept in an in-memory list. While running, 7 | the pager can re-provision these pages. However, when the virtual machine and pager are shutdown: 8 | - the db file is truncated to remove any free pages at the end of the file 9 | - the page number information of intermediate free pages is lost, i.e. the space in regions of the file cannot be allocated. 10 | (* Unless we walk through the file and determine which pages are in use and which can be recycled) 11 | 12 | Solution 13 | --------- 14 | There are three broad ways to address this: 15 | - 1. track free pages on-disk 16 | - e.g. via an on-disk linked list. 17 | - 2. compact database file on closing, so that there are no intermediate free pages. 18 | - this has performance and timing concerns 19 | - 3. reindex all blocks on the database file and compact database file to remove unused pages 20 | - could be done offline 21 | 22 | I will implement option 1 for it's overall simplicity and runtime cost. 23 | 24 | Specifically, we will maintain a linked list. Logically the linked list has a nullable head, and nodes with 25 | payload, i.e. the free page number, and a reference to the next free page. 26 | 27 | The head of this linked list is set in the header. 28 | Specifically, 29 | - one field in the header encodes whether the linked is non-empty, and 30 | - another field encodes the page number of the head node 31 | 32 | The header points to a free page (logically a node in the linked list). Which points to the next free page. 33 | Pages use the first two bytes to encode: 34 | 1. whether the next pointer was set 35 | 2. location of next node 36 | - i.e. page number of next free node, (as opposed to byte offset) 37 | 38 | File Header 39 | -------------- 40 | 41 | The file header, will also store a version number. The version is forward-thinking and will be useful 42 | for distinguishing different incompatible file formats and what version a given engine can operate on- albeit isn't 43 | directly needed now. 44 | 45 | File Header Fields: 46 | file header -> version_string next_free_page padding 47 | version_string -> "learndb v" 48 | next_free_page -> int, next page_num 49 | has_free_page_list -> bool, whether free_page has contents 50 | 51 | VersionNum start at 1 and increments by 1 after every incompatible change. 52 | 53 | The file header will be padded with empty bytes such that the total length of the file header is 100 bytes. 54 | This gives us the ability to add new fields into the header. 55 | 56 | A null value in the linked list is specified with an offset of 0. -------------------------------------------------------------------------------- /docs/functions.txt: -------------------------------------------------------------------------------- 1 | This document elaborates on what is a function in the scope of learndb, how they are modelled, defined, and evaluated 2 | by the vm. 3 | 4 | Function Arguments 5 | ------------------ 6 | functions can accept: 7 | - 0 args, e.g. current_time 8 | - scalar args, e.g. square(2) 9 | - or recordset args, e.g. any aggregation functions, e.g. 
max 10 | 11 | 12 | Function Model 13 | -------------- 14 | What is the interface of a function object? 15 | - determine the return type, e.g. when generating output schema 16 | - determine params, and validate args 17 | - evaluate function for a given set of args 18 | 19 | How are functions represented? 20 | Depends on function. There are two types of functions: 21 | 1) native, i.e. those implemented in Python 22 | 2) non-native, i.e. those implemented in learndb-sql. 23 | 24 | 25 | We represent a function with the following object: 26 | 27 | ``` 28 | class FunctionDefinition: 29 | def __init__(self, 30 | func_name: str, 31 | pos_params: List[Union[Type[DataType], List[Type[DataType]]]], 32 | named_params: Dict[str, Type[DataType]], 33 | func_body: Callable, 34 | return_type: Type[DataType]): 35 | ... 36 | ``` 37 | 38 | Every function 39 | 40 | -------------------------------------------------------------------------------- /docs/future-work.md: -------------------------------------------------------------------------------- 1 | # Future Work 2 | The below is some ideas for future work. 3 | 4 | - Indexing 5 | — default primary key 6 | -— rowid can be alias for primary key; if no pkey- rowid has autoincr behavior. Catalog can store max-row-id 7 | — multiple column key 8 | — Secondary indices- store rowid /pkey + indexed column into primary tree 9 | —- secondary index can use btree class. Key will be column indexed; data will be rowid of row in primary idx. Will require support for var len keys eg if creating an index on a text field 10 | 11 | - More complete lang support 12 | — create db cmd 13 | —- db identified via filepath 14 | — select + where, joins, group by 15 | — joins will be nested for loop 16 | — nested subquery 17 | — update columns; 18 | 19 | - Query execution- name resolution, query optimization 20 | — QO: use primary index, and-clause can use secondary index if needed; else full table scan 21 | 22 | - Transactions 23 | — needed for: 24 | —- multiple table/index updates, eg if one table fails (atomicity) 25 | —- Multi-user reads/writes (consistency + isolation) 26 | —- protect against inconsistent state due to failure (durability) 27 | —- Sqlite implement trans via pager controlling access to page 28 | —- simple trans could be impl with WAL for durability, read and write locks. Locks can be implemented as native FS locks. There can be many read locks. One write lock. 29 | 30 | 31 | - Temp tables for large result sets- otherwise OOM. 32 | - Pager should autoflush pages - ie page cache should be bounded -------------------------------------------------------------------------------- /docs/leardb_architecture.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/spandanb/learndb-py/242884e2418a09f480f17eac65e9c88e518bab1a/docs/leardb_architecture.png -------------------------------------------------------------------------------- /docs/learndb-sql-notes.txt: -------------------------------------------------------------------------------- 1 | This documents aspects of learndb-sql that may lead to gotchas. 2 | 3 | * Simple select with alias 4 | If an alias is used in a simple select, e.g. select * from foo f where f.cola > 4, 5 | then all column references must be fully scoped, i.e. f.. 
Thus, the following would be invalid: 6 | select * from foo f where f.cola > 4 7 | 8 | 9 | -------------------------------------------------------------------------------- /docs/reference.md: -------------------------------------------------------------------------------- 1 | # Reference 2 | 3 | The goal of this document is intended to provide a complete reference to learndb from the perspective of a user of 4 | the system. 5 | 6 | ## Preface 7 | 8 | ### Overview 9 | 10 | _Learndb_ is a RDBMS (relational database management system). 11 | 12 | Let's unpack this, _relational_ means it can be used to express relations between different entities, e.g. between 13 | `transactions` and `users` involved in them. In some databases this is expressed through a foreign key constraint, 14 | which constrains the behavior/evolution of one table based on another table(s). 15 | This is not supported/yet-implemented in learndb [^1]. 16 | 17 | _Database_ is a collection of tables, each of which has a schema, and zero or more rows of records. The 18 | schema defines: 19 | - the names of columns/fields 20 | - what types of data are supported in each field 21 | - any constraint, e.g. if field data can be null or if the field is primary (i.e. must be unique and not null). 22 | 23 | The _state_ of a single database (i.e. the schema of the tables in it, and the data within the tables) is persisted 24 | in a single file on the host filesystem. 25 | 26 | The _management system_ manages multiple databases, i.e. multiple isolated collections of tables. The system exposes 27 | interface(s) for: 28 | - creating and deleting databases 29 | - creating, modifying, and deleting tables in a database 30 | - adding, and removing data from tables 31 | 32 | 33 | ### Setup 34 | 35 | Learndb can only be setup from the source repo (i.e. no installation from package repository, e.g. PyPI). The 36 | instructions are outlined in [README](../README. md) section `Hacking -> Install` 37 | 38 | ## Interacting with the Database 39 | 40 | Learndb is an _embedded database_. This means there is no standalone server process. The user/agent connects to the 41 | RDMBS via: 42 | - REPL 43 | - python language library 44 | - passing a file of commands to the engine 45 | 46 | Fundamentally, the system takes as input a set of statements and creates and modifies a database based on the system. 47 | 48 | ### REPL 49 | 50 | The _REPL_ (read-evaluate-print loop) provides an interactive interface to provide statements the system can execute. 51 | The user can provide: 1) SQL statements (spec below) or 2) meta commands. SQL statements operate on 52 | 53 | #### Meta Commands 54 | Meta commands are special commands that are processed by core engine. These include, commands like `.quit` which exits 55 | the terminal. 56 | 57 | But these commands more broadly expose non-standard commands, i.e. not part of sql spec - parser. Why some commands 58 | are meta commands, rather than part of the sql, e.g. `.nuke` which deletes the content of a database, is a 59 | peculiarity of how this codebase evolved. 60 | 61 | #### Output 62 | 63 | Output is printed to console. 64 | 65 | ### Python Language Library 66 | 67 | `interface.py` defines the `Learndb` class entity which can be imported. 68 | 69 | TODO: generate code docs, and link interface.py::Learndb, Pipe here 70 | 71 | Two important entities needed to programmatically interact with the database are `Learndb`, i.e. 
the class that 72 | represents a handle to the database, and `Pipe` 73 | 74 | ``` 75 | Learndb 76 | - 77 | 78 | Pipe 79 | - 80 | ``` 81 | 82 | ``` 83 | # create handler instance 84 | db = LearnDB(db_filepath) 85 | 86 | # submit statement 87 | resp = db.handle_input("select col_a from foo") 88 | assert resp.success 89 | 90 | # below are only needed to read results of statements that produce output 91 | # get output pipe 92 | pipe = db.get_pipe() 93 | 94 | # print rows 95 | while pipe.has_msgs(): 96 | print(pipe.read()) 97 | 98 | # close handle - flushes any in-memory state 99 | db.close() 100 | ``` 101 | 102 | #### Output 103 | 104 | `Pipe` contains all records. 105 | 106 | ### Filesystem Storage 107 | 108 | The state of entire DB is stored on a single file. The database can be thought of as a logical entity, that is 109 | stored in some physical medium. 110 | 111 | There is a 1 to 1 correspondence between a file and its database. Hence, we can consider the implied database, when 112 | discussing a database file, and vice versa. Within the context of a single file, there is a single, global, unnamed 113 | database. 114 | 115 | This means the language only has 1 part names for tables, i.e. no schema, no namespacing. 116 | 117 | Further, deleting the `db.file` effectively equals dropping the entire database. 118 | 119 | ### ACID compliance 120 | 121 | Atomic - not atomic. No transactions. Also, no guarantee database isn't left in an inconsistent state due to 122 | partial statement execution. 123 | 124 | Consistent - strong consistency; storage layer updated synchronously 125 | 126 | Isolated - guaranteed by database file being opened in exclusive read/write mode, and hence only a single connection to 127 | database exists. 128 | 129 | Durable - As durable as files on underlying filesystem. 130 | 131 | ## The SQL Language (learndb-sql) 132 | 133 | The learndb-sql grammar can be found at: `/learndb/lang_parser/grammar.py`. 134 | 135 | ### Learndb-sql grammar specification 136 | 137 | The grammar for learndb-sql is written using [lark](https://github.com/lark-parser/lark). Lark is a parsing library 138 | that allows defining a custom grammar, and parsers for text based on the grammar into an 139 | [AST](https://en.wikipedia.org/wiki/Abstract_syntax_tree). We'll go over Lark basics because statements in learndb-sql 140 | are specified in lark [grammar language](https://lark-parser.readthedocs.io/en/latest/grammar.html). 141 | 142 | - Grammar rules are specified in a form similar to [EBNR notation](https://en.wikipedia.org/wiki/Extended_Backus%E2%80%93Naur_form). 143 | - the grammar is made up of terminals and rules. 144 | - terminals are named with an uppercase name, and are defined with a literal or regex expression 145 | - e.g. `IDENTIFIER : ("_" | ("a".."z") | ("A".."Z"))* ("_" | ("a".."z") | ("A".."Z") | ("0".."9"))+` 146 | - these define value literals, and keywords of the language 147 | - grammar rules consist of `left-hand-side : right-hand-side`, where the left side has the name of the terminal or 148 | rule, and the right side has one or more matching definition expressions 149 | - rules are named with a lowercase name, and are patterns of literals and symbols (terminals and rules) 150 | - e.g. ```create_stmnt : "create"i "table"i table_name "(" column_def_list ")" ``` 151 | - Here `"create"i`, `"("`, and `")"` are literals that matche `create`, `(`, an`)`, respectively. 
152 | - `table_name` and `column_def_list` are other rules with their own definitions 153 | 154 | 155 | ### Data Definition 156 | 157 | #### Constraints 158 | 159 | Tables can have the following constraints: 160 | 161 | - `Not Null` - value cannot be null 162 | - `Primary Key` - value cannot be not and must be unique 163 | 164 | #### Data Types 165 | 166 | Table columns can have the following types: 167 | 168 | - `Integer` 169 | - 32 bit integer 170 | - `Real` 171 | - single precision floating point number 172 | - `Text` 173 | - unlimited length character string 174 | - `Boolean` 175 | - `Null` 176 | 177 | Note, how `Real` typed data is handled is different from how floats are typically 178 | handled (i.e. [IEEE754]( https://en.wikipedia.org/wiki/IEEE_754)). 179 | 180 | #### Create Table Statement 181 | 182 | ``` 183 | create_stmnt : "create"i "table"i table_name "(" column_def_list ")" 184 | 185 | ?column_def_list : (column_def ",")* column_def 186 | ?column_def : column_name datatype primary_key? not_null? 187 | datatype : INTEGER | TEXT | BOOL | NULL | REAL 188 | primary_key : "primary"i "key"i 189 | not_null : "not"i "null"i 190 | table_name : SCOPED_IDENTIFIER 191 | IDENTIFIER : ("_" | ("a".."z") | ("A".."Z"))* ("_" | ("a".."z") | ("A".."Z") | ("0".."9"))+ 192 | SCOPED_IDENTIFIER : (IDENTIFIER ".")* IDENTIFIER 193 | ``` 194 | An example is 195 | ``` 196 | Create table fruits (id integer primary key, name text, avg_weight real) 197 | ``` 198 | 199 | > NOTE: an integer primary key must be declared, i.e. it's declaration and datatype are mandatory 200 | 201 | #### Drop Table Statement 202 | 203 | ``` 204 | drop_stmnt : "drop"i "table"i table_name 205 | ``` 206 | An example is 207 | ``` 208 | Drop table fruits 209 | ``` 210 | 211 | ### Data Manipulation 212 | 213 | #### Data Insertion 214 | 215 | ``` 216 | insert_stmnt : "insert"i "into"i table_name "(" column_name_list ")" "values"i "(" value_list ")" 217 | 218 | column_name_list : (column_name ",")* column_name 219 | value_list : (literal ",")* literal 220 | column_name : SCOPED_IDENTIFIER 221 | literal : INTEGER_NUMBER | REAL_NUMBER | STRING | TRUE | FALSE | NULL 222 | ``` 223 | 224 | An example is: 225 | 226 | ``` 227 | insert into fruits (id, name, avg_weight) values (1, 'apple', 4.2); 228 | ``` 229 | 230 | 231 | #### Data Deletion 232 | 233 | ``` 234 | delete_stmnt : "delete"i "from"i table_name where_clause? 235 | 236 | where_clause : "where"i condition 237 | condition : or_clause 238 | or_clause : and_clause 239 | | or_clause "or"i and_clause 240 | and_clause : predicate 241 | | and_clause "and"i predicate 242 | predicate : comparison 243 | | predicate ( EQUAL | NOT_EQUAL ) comparison 244 | comparison : term 245 | | comparison ( LESS_EQUAL | GREATER_EQUAL | LESS | GREATER ) term 246 | term : factor 247 | | term ( MINUS | PLUS ) factor 248 | factor : unary 249 | | factor ( SLASH | STAR ) unary 250 | unary : primary 251 | | ( BANG | MINUS ) unary 252 | 253 | primary : literal 254 | | nested_select 255 | | column_name 256 | | func_call 257 | ``` 258 | 259 | An example is: 260 | 261 | ``` 262 | delete from fruits where id = 1; 263 | ``` 264 | 265 | ### Queries 266 | 267 | Let's consider how we can query tables. 268 | 269 | ``` 270 | select_stmnt : select_clause from_clause? 271 | select_clause : "select"i selectable ("," selectable)* 272 | selectable : expr 273 | 274 | from_clause : "from"i source where_clause? group_by_clause? having_clause? order_by_clause? limit_clause? 
275 | where_clause : "where"i condition 276 | group_by_clause : "group"i "by"i column_name ("," column_name)* 277 | having_clause : "having"i condition 278 | order_by_clause : "order"i "by"i (column_name ("asc"i|"desc"i)?)* 279 | limit_clause : "limit"i INTEGER_NUMBER ("offset"i INTEGER_NUMBER)? 280 | 281 | source : single_source 282 | | joining 283 | 284 | single_source : table_name table_alias? 285 | 286 | //split conditioned and unconditioned (cross) join as cross join does not have an on-clause 287 | ?joining : unconditioned_join | conditioned_join 288 | conditioned_join : source join_modifier? "join"i single_source "on"i condition 289 | unconditioned_join : source "cross"i "join"i single_source 290 | 291 | join_modifier : inner | left_outer | right_outer | full_outer 292 | 293 | inner : "inner"i 294 | left_outer : "left"i ["outer"i] 295 | right_outer : "right"i ["outer"i] 296 | full_outer : "full"i ["outer"i] 297 | cross : "cross"i 298 | 299 | // `expr` is the de-facto root of the expression hierarchy 300 | expr : condition 301 | ``` 302 | 303 | #### Simple Queries 304 | 305 | A select statement can contain `from`, `where`, `group by`, `having`, `limit` and `offset` clauses. 306 | 307 | The simplest select statement has no `from` clause. This effectively, evaluates any expression. e.g. 308 | ```select 1+1``` 309 | 310 | The simplest select statement over a datasource is a `select ... from ... ` without a where clause, e.g. 311 | ```select name from fruits``` 312 | 313 | This will return all rows from the datasource. 314 | 315 | #### Query with Conditions 316 | 317 | Consider a query with a simple condition 318 | 319 | ```select name from fruits where id = 1``` 320 | 321 | Consider a query with a simple condition 322 | 323 | ```select name from fruits where avg_weight > 2.0 and avg_weight < 5.0 ``` 324 | 325 | Note, the condition can be composed of arbitrary logical operations, e.g. 326 | 327 | ```select name from fruits where avg_weight > 2.0 and avg_weight < 5.0 or name = 'apple' ``` 328 | 329 | #### Scoping 330 | 331 | There is a global, assumed scope. All table names live in this global scope. 332 | 333 | Further, aliases for tables in the context of a query, are defined for the duration of the query. 334 | 335 | ### Functions 336 | 337 | #### User-Defined Functions 338 | 339 | Theoretically, a user can define functions in one of two ways: 340 | - in learndb-sql (non-native); however, this is not yet implemented 341 | - in the implementation language, i.e. Python (native). For more details see [./functions.txt](./functions.txt) 342 | 343 | ## Internals 344 | 345 | ### Storage Layer 346 | 347 | The storage layer consists of an on-disk btree. The btree is accessed through the below API. Any other backing data structure, 348 | that implements the above API could easily replace the current implementation. 349 | 350 | #### Storage API 351 | 352 | The Storage API, is the implicit (not formally required by virtual machine) API exposed by the storage layer data structure. 353 | The API consists of: 354 | - insert(key, value) 355 | - get(key) 356 | - delete(key) 357 | 358 | 359 | #### Btree implementation notes 360 | - Many constants that control the layout of the btree are set in `constants.py` 361 | - `LEAF_NODE_MAX_CELLS`, `INTERNAL_NODE_MAX_CELLS` control how many max children, leaf and internal nodes can have, respectively 362 | 363 | 364 | 365 | ## Unsupported Features 366 | - at a single time, only a writer, per db; i.e. 
no multi writer 367 | - no authentication 368 | - floats implemented very crudely; expression eval uses a fixed epsilon 369 | 370 | ## Footnotes 371 | 372 | [^1]: Arguably, a system can't be called _relational_ without foreign key constraints. But relations can still be 373 | modelled and foreign keys can still be used- just that the integrity of the constraints can't be enforced. So for 374 | simplicity, I will call this system an RDBMS. -------------------------------------------------------------------------------- /docs/scoping-and-name-resolution.txt: -------------------------------------------------------------------------------- 1 | This document is intended to scoping and name resolution works from user interaction perspective, 2 | and architecturally. 3 | 4 | # Scoping Logic 5 | 6 | ## Scope 7 | ----- 8 | A `scope` is a logical namespace/environment which contains different names. A name could be a function name, 9 | a table_name, and intermediate materialized dataset (e.g. a joining of two tables) or a column_name. 10 | Note, these are the current types of names in learndb, but there could be other types in the future. 11 | 12 | ## Global Scope 13 | ------------ 14 | There will be a global scope, that always exists. 15 | Functions will live in a global scope. As well tables. 16 | 17 | ## Non-global Scope 18 | ----------------- 19 | All statements will define a local scopes, which will contain 20 | 21 | ## Table Definition 22 | ----------------- 23 | When a table is defined, it will be added to a global scope. 24 | 25 | 26 | # Architecture 27 | 28 | Name Registry - responsible for registering name, and resolving names 29 | 30 | StateManager, and NameRegistry are both doing name lookups. 31 | StateManager already has more of that code. So move all scoping and related ops under StateManager. 32 | This includes recordsets, group_recordsets, etc. 33 | 34 | NameRegistry can be kept as is for now; but later it may make sense to combine 35 | with StateManager 36 | 37 | Scope management: 38 | - There will be a scope created for each statement. 39 | - And the end of a statement, the scope should be popped, and all objects within it should be recycled. 40 | - If statements are nested, this will lead to nested scopes. 41 | - Name resolution of names proceeds from inner most scope to outermost scope. 42 | - All long living objects/names, will belong to a logical global scope 43 | -------------------------------------------------------------------------------- /docs/select-clause-evaluation.txt: -------------------------------------------------------------------------------- 1 | This documents the details for how a select-clause is evaluated. 2 | 3 | First we distinguish between select over grouped from select over ungrouped recordset. 4 | Consider a ungrouped recordset, e.g. 5 | 6 | select upper(u.first_name), u.last_name 7 | from users u 8 | 9 | The select would get evaluated after the recordset is built. Once it's built, we iterate over each record in the recordset 10 | and convert it to an output recordset. 11 | 12 | The output recordset has one column for each column/expr in the select clause. 13 | 14 | Each column of the output recordset has a corresponding generator. 15 | 16 | The generator will take zero or more columns from the input record, and map 17 | 18 | When the select expr is parsed, we construct a mapping from the function params to column reference, e.g. 
19 | since upper takes a single positional arg, our mapping looks like: 20 | - pos_args: [u.first_name] 21 | - named_args: {} 22 | 23 | Let's consider how the evaluation will work with pseudo code: 24 | 25 | - scalar case: 26 | # generate output schema 27 | value_generators = [] 28 | for selectable in selectables: 29 | valgen = make_value_gen(selectable) 30 | value_generators.append(valgen) 31 | 32 | # generate output records 33 | output_records = [] 34 | for rec in input_records: 35 | # one val_gen for each output column 36 | column_vals = [] 37 | for val_gen in value_generators: 38 | column_vals.append(val_gen.generate(rec)) 39 | out_record = RecordGen.from_columns(column_vals) 40 | output_records.append(out_record) 41 | 42 | 43 | - vector case: 44 | what does the vector case look like? 45 | - in both vector and scalar cases, we want the function to operate on formal params, i.e. in the scalar case 46 | the func operates on arg, and not record. 47 | - in the vector case, we want the function to operate on array[values], or iterableOnce[values], but not on recordset. 48 | 49 | pseudo code: 50 | 51 | for group_key, group_recordset for input_records.groups: 52 | 53 | ---- 54 | 55 | Modelling ValueGenerators 56 | ---------------------- 57 | ValueGenerators are generated when parsing the select expr. 58 | - There is one generator per output columns 59 | - Each generator tracks the formal references to column names, and non-column values, e.g. select sha2(customer_id, bits=512) from customers 60 | here the generators would track sha2(pos_args: customer.customer_id, named_args: 512) 61 | 62 | What is the interface between recordset and valuemapper? 63 | 64 | ScalarValueMapper is constructed before iteration; tracks func, column name to func arg mapping, 65 | Eg select upper(col_name); mapper tracks func: upper, arg-col-map: {0: col_name} 66 | Where: 67 | Def upper(arg): arg.upper() 68 | 69 | -------------------------------------------------------------------------------- /docs/sql-lang.md: -------------------------------------------------------------------------------- 1 | This outlines the input language (~subset of ANSI sql) that is supported. 2 | 3 | The grammar can be found in the /lang_parser/grammar.py file 4 | 5 | Some key callouts, 6 | 7 | - all columns must be explicitly selected, i.e. no support for `select *` 8 | 9 | 10 | 11 | -------------------------------------------------------------------------------- /docs/storing-diff-datatypes.txt: -------------------------------------------------------------------------------- 1 | how does a record get stored? 2 | If I want to support nulls and variable length strings, seems like I'll need to store something like 3 | the schema in the cells themselves. 4 | 5 | sqlite handles this via Record Format. 6 | Basically, the record, i.e. body of a leaf entry, has a header and a body. 7 | the header is like: 8 | - size of header (4b) 9 | - one int per column (called serial types)- 10 | - the serial type encodes the datatype 11 | e.g. 12 | serial-type byte-length datatype 13 | 0 0 Null 14 | 1 4 Integer 15 | 2 4 Real 16 | 2 var Text 17 | 3 var Blob 18 | 19 | I'll also need to encode length for variable types. This could be a set of ints after the serial types, one for 20 | each variable length value, e.g. 21 | 22 | [size of header, serial types, integer length of each value that is variable encoded] 23 | 24 | Thus, since we have to store record details- I should also rethink what's the best place to handle serde. 
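
To make the record-format idea concrete, here is a minimal, hypothetical Python sketch of decoding
such a record body. The serial-type numbering (0=Null, 1=Integer, 2=Real, 3=Text), the 4-byte
little-endian field widths, and placing each variable-length value's byte length immediately after
its serial type are all assumptions made for illustration- this is not learndb's actual serde code.

```
import struct


def deserialize_record(data: bytes) -> list:
    """Illustrative sketch: decode [size of header, serial types (+ lengths), body]."""
    values = []
    (header_size,) = struct.unpack_from("<i", data, 0)
    offset = 4            # cursor into the header (serial types)
    body = header_size    # cursor into the body (concatenated serialized values)
    while offset < header_size:
        (stype,) = struct.unpack_from("<i", data, offset)
        offset += 4
        if stype == 0:          # Null: nothing stored in the body
            values.append(None)
        elif stype == 1:        # Integer: 4 bytes
            values.append(struct.unpack_from("<i", data, body)[0])
            body += 4
        elif stype == 2:        # Real: 4 bytes
            values.append(struct.unpack_from("<f", data, body)[0])
            body += 4
        elif stype == 3:        # Text: header also stores the value's byte length
            (length,) = struct.unpack_from("<i", data, offset)
            offset += 4
            values.append(data[body:body + length].decode("utf-8"))
            body += length
    return values
```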
25 | -------------------------------------------------------------------------------- /docs/stress-tests.txt: -------------------------------------------------------------------------------- 1 | This document outlines stress testing. 2 | 3 | Unit tests provide a sanity check on the code. But there are often many code paths, logical states that 4 | are unexplored. To explore these, we could use some kind of randomized search. Specifically, first define a 5 | directed graph, where by nodes represent a precedence relationship in terms of which ops are valid on a database, e.g. 6 | 1) create database 7 | 2) create table 8 | 3) insert a record 9 | 4) delete a record 10 | 11 | Consider, specifically, we can only create a table, once we have created a database. 12 | 13 | The graph could be modelled like a Markov process, whereby, e.g. state 1 (database created), randomly transitions 14 | to a valid op, e.g. create table, with some probability. The probabilities could be changed to incentive certain path 15 | explorations. 16 | 17 | In addition, each node, i.e. each op will in some case generate random output, e.g. insert key {rand_int}, here rand_int 18 | being a random int. 19 | 20 | This should allow exploration of paths that explicit tests don't explore 21 | 22 | This exploration would be bound be time or number of steps. 23 | Validations can be applied at various frequencies. Validations can be applied at each time step, at the end of the entire stress op, 24 | or perhaps somewhat randomly, like a sample. 25 | 26 | Validations could extend the current btree validations. Specifically, we would also want to consider changes 27 | to the catalog- and other non-btree objects. -------------------------------------------------------------------------------- /docs/to_multiple_dynamic_tables.txt: -------------------------------------------------------------------------------- 1 | # overview: single, static table -> multiple, dynamic tables 2 | 3 | Currently, learndb can support a single global table with a fixed schema (integer-key, fixed length body "hello database"). 4 | Next, I want to support: multiple tables, with a dynamic schema. The following is an exposition of what's needed to 5 | enable that. 6 | 7 | - SQL language frontend, i.e. tokenize and parse minimal DDL, DML, DQL (done) 8 | - the front end outputs a parsed representation that can be executed by the vm (done) 9 | - handle DDL (create table) 10 | -- validate 11 | --- check if primary key is defined 12 | --- data types are valid 13 | -- create entry (tabledef) in metadata catalog 14 | - handle insert 15 | -- validate 16 | --- find table in catalog 17 | --- check if col names match schema 18 | --- check values are valid, type checking 19 | - call tree to insert 20 | - delete will be similar to insert 21 | 22 | --- 23 | 24 | # metadata catalog 25 | 26 | metadata catalog could be one for all objects, e.g. tables. 27 | the metadata catalog is a special table e.g. 
28 | create table sqllite_master ( 29 | type text, 30 | name text, 31 | tbl_name text, 32 | rootpage integer, 33 | sql text 34 | ) 35 | 36 | --- 37 | 38 | # DDL validations 39 | 40 | validation(primary-key): initially, each table must have a single-column primary key 41 | - sqlite only supports integer key (if another column is set as primary key it creates a sister integer key (rowid)) 42 | later I can support more than one or no columns (in which case all the colums in table 43 | definition order form the key) 44 | 45 | validation(column-names): column names must match definition 46 | 47 | --- 48 | 49 | # data types/ typesystem 50 | 51 | ## data type encoding 52 | 53 | datatypes can be fixed or variable length. I will support variable for text type 54 | sqllite also use variable length (huffman) encoding for ints- but for now focus on fixed length 55 | encoding. 56 | 57 | ## data types 58 | 59 | name type 60 | ---- ---- 61 | integer 4bytes 62 | float 4bytes 63 | text variable (upto reasonable upper bound) 64 | null 65 | blob ? 66 | --- 67 | 68 | # btree changes 69 | 70 | currently, a page can correspond to internal or leaf nodes. leaf nodes are organized like: 71 | header, 72 | key0 73 | value0 74 | key1 75 | value1 76 | ... 77 | keyN 78 | valueN 79 | ... where value0 is the byte array corresponding to serialized row 80 | 81 | to support dynamic schemas, the data length has to also be encoded. 82 | consider how sqlite 83 | 84 | sqlite stores leaf nodes like (6.4): 85 | - (low address) header, cell pointer array, unallocated space, cells (high address) 86 | - cell ptrs are sorted by key (2 bytes); contain page offset to cell 87 | - cell -> [data_size, key_size, payload (key, data), overflow ptr] 88 | -- data can be divided into header and body 89 | -- data header -> [size of header, serial types, integer length of each value that is variable encoded] 90 | -- alt. data header -> [size of header, serial types (size of variable length value)? 91 | -- data body -> concatenated bytes of serialized values (in definition order) 92 | 93 | - cells are arbitrary ordered, with freespace in between (due to deletes) 94 | - the freespace block is stored in a linked list 95 | -- a free block needs 2b (location of next free block) + 2b (size of this free block) 96 | 97 | - support datatype, i.e. string length upto what can fit in max length 98 | 99 | --- 100 | 101 | # vm - btree interface 102 | 103 | tree operates on byte strings 104 | vm is responsible for any internal structure of keys, values 105 | 106 | vm only interacts with tree (and state more broadly) via cursors 107 | - so create table -> is a an entry inserted into catalog directly by vm 108 | - I might want to make this abstracted via statemanager or catalog 109 | - the difference is I'm working on an ast, vs. sqllite is working on bytecode 110 | --- 111 | 112 | Right now I have on the front end, a parsed representation, and on the backend - an content agnostic data structures/indexes that 113 | operate on sortable byte arrays. The VM sits in the middle, and must map parsed datatype names, into underlying types. 114 | Boom, we must implement a type system. Another very exciting area. 115 | 116 | 117 | --- 118 | 119 | Currently, I've been largely doing direct byte offset manipulation-- hmm is this true? 
120 | But does it make sense to have some more abstractions with updating the cells 121 | 122 | --- 123 | 124 | -------------------------------------------------------------------------------- /docs/tutorial.md: -------------------------------------------------------------------------------- 1 | # How to use learndb 2 | 3 | This tutorial walks through the basic capabilities of learndb. 4 | It assumes reader has familiarity with (some dialect of) SQL. 5 | 6 | Note: Commands below are shown in pairs of boxes- where the first box is the command to run, 7 | and the second box is the expected output. The output is omitted where unnecessary. 8 | 9 | 10 | ### Preamble 11 | 12 | > Ensure learndb is [installed](../README.md) 13 | 14 | 15 | ### Start the REPL 16 | 17 | ``` 18 | python run_learndb.py repl 19 | ``` 20 | ``` 21 | db > 22 | ``` 23 | 24 | ### Create Table and Load Data 25 | 26 | Create a table: 27 | 28 | ``` 29 | db > create table fruits (id integer primary key, name text, avg_weight real) 30 | ``` 31 | ``` 32 | Execution of command 'create table fruits (id integer primary key, name text, avg_weight real)' succeeded 33 | ``` 34 | Insert records: 35 | ``` 36 | db > insert into fruits (id, name, avg_weight) values (1, 'apple', 4.2); 37 | ``` 38 | ``` 39 | Execution of command 'insert into fruits (id, name, avg_weight) values (1, 'apple', 4.2);' succeeded 40 | ``` 41 | 42 | > Note: There is no auto incrementing key, and each table requires a primary integer key. Hence, we must specify the id. 43 | 44 | Insert more records: 45 | ``` 46 | db > insert into fruits (id, name, avg_weight) values (2, 'mangoes', 3.5); 47 | ... 48 | db > insert into fruits (id, name, avg_weight) values (3, 'carrots', 3.3); 49 | ... 50 | ``` 51 | 52 | ### Query records 53 | Note, there is no support wildcard column expansion, i.e. `select * ...` 54 | ``` 55 | db > select id, name, avg_weight from fruits 56 | ``` 57 | ``` 58 | Execution of command 'select id, name, avg_weight from fruits' succeeded 59 | Record(id: 1, name: apple, avg_weight: 4.199999809265137) 60 | Record(id: 2, name: mangoes, avg_weight: 3.5) 61 | Record(id: 3, name: carrots, avg_weight: 3.299999952316284) 62 | ``` 63 | ### Query Catalog 64 | 65 | Learndb maintains a table `catalog` which keeps track of all user defined tables and objects. 66 | We can check what tables exist by querying `catalog` directly. 67 | 68 | ``` 69 | db > select sql_text from catalog 70 | ``` 71 | ``` 72 | Execution of command 'select sql_text from catalog' succeeded 73 | Record(sql_text: CREATE TABLE fruits ( id Integer PRIMARY KEY, name Text , avg_weight Real )) 74 | ``` 75 | 76 | ### Filtering results 77 | 78 | We can specify conditions of equality or inequality (less-or-equal, less, greater, greater-or-equal) 79 | 80 | ``` 81 | db > select name, avg_weight from fruits where avg_weight >= 3.5 82 | ``` 83 | ``` 84 | Execution of command 'select name, avg_weight from fruits where avg_weight >= 3.5' succeeded 85 | Record(name: apple, avg_weight: 4.199999809265137) 86 | Record(name: mangoes, avg_weight: 3.5) 87 | ``` 88 | These conditions consist of a simple predicate where one side has a column reference, and the other side a value. 89 | Learndb expects the two sides to be expressions, and this means they can consist of arbitrary algebraic operations. 
90 | For example, the previous condition could have been equivalently written as `avg_weight + 1 >= 4.5` 91 | 92 | Further simple predicates can be combined into complex conditions using boolean operators, example: 93 | ``` 94 | db > select name, avg_weight from fruits where (avg_weight >= 3.6 and avg_weight <= 10.0) or name = 'mango' 95 | ``` 96 | 97 | ### Joining Tables 98 | 99 | For this we'll introduce the `employees` schema 100 | ``` 101 | db > CREATE TABLE employees (id INTEGER PRIMARY KEY, name TEXT, salary INTEGER, depid INTEGER); 102 | db > INSERT INTO employees(id, name, salary, depid) VALUES (1, 'John', 100, 1); 103 | db > INSERT INTO employees(id, name, salary, depid) VALUES (2, 'Anita', 200, 1); 104 | db > INSERT INTO employees(id, name, salary, depid) VALUES (3, 'Gab', 100, 2); 105 | 106 | db > CREATE TABLE department (depid INTEGER PRIMARY KEY, name TEXT); 107 | 108 | db > INSERT INTO department(depid, name) VALUES (1, 'accounting'); 109 | db > INSERT INTO department(depid, name) VALUES (2, 'sales'); 110 | db > INSERT INTO department(depid, name) VALUES (3, 'engineering'); 111 | ``` 112 | 113 | Next, we can do join the two tables: 114 | Note: the explicit use of "inner" when specifying the join 115 | ``` 116 | db > select e.name, d.name from employees e inner join department d on e.depid = d.depid 117 | ``` 118 | ``` 119 | Execution of command 'select e.name, d.name from employees e inner join department d on e.depid = d.depid' succeeded 120 | Record(e.name: John, d.name: accounting) 121 | Record(e.name: Anita, d.name: accounting) 122 | Record(e.name: Gab, d.name: sales) 123 | ... 124 | ``` 125 | 126 | ### Group by clause 127 | 128 | ``` 129 | db > select count(e.name), d.name from employees e inner join department d on e.depid = d.depid group by d.name 130 | ``` 131 | 132 | ``` 133 | Execution of command 'select count(e.name), d.name from employees e inner join department d on e.depid = d.depid group by d.name' succeeded 134 | Record(expr(expr=funccall(name=token('identifier', 'count'), args=[expr(expr=columnname(name=token('scoped_identifier', 'e.name')))])): 2, d.name: accounting) 135 | Record(expr(expr=funccall(name=token('identifier', 'count'), args=[expr(expr=columnname(name=token('scoped_identifier', 'e.name')))])): 1, d.name: sales) 136 | ``` 137 | Note, this only has the rows for departments with at least one employee. 138 | In order to display departments with no employees we need to do left or right join, e.g. 
139 | ```sql 140 | db > select count(e.name), d.name from department d left join employees e on e.depid = d.depid group by d.name 141 | ``` 142 | ``` 143 | Execution of command 'select count(e.name), d.name from department d left join employees e on e.depid = d.depid group by d.name;' succeeded 144 | Record(expr(expr=funccall(name=token('identifier', 'count'), args=[expr(expr=columnname(name=token('scoped_identifier', 'e.name')))])): 2, d.name: accounting) 145 | Record(expr(expr=funccall(name=token('identifier', 'count'), args=[expr(expr=columnname(name=token('scoped_identifier', 'e.name')))])): 1, d.name: sales) 146 | Record(expr(expr=funccall(name=token('identifier', 'count'), args=[expr(expr=columnname(name=token('scoped_identifier', 'e.name')))])): 0, d.name: engineering) 147 | ``` 148 | Or equivalently 149 | ```sql 150 | db > select count(e.name), d.name from employees e right join department d on e.depid = d.depid group by d.name 151 | ``` 152 | 153 | ### Having clause 154 | We may use a `having` clause to filter groups based on the output of an aggregate function. 155 | For example, we may want to list all "small" departments; where we define "small" as having strictly less than 2 employees. 156 | We can use a having clause like: 157 | 158 | ``` 159 | db > select count(e.name), d.name from employees e inner join department d on e.depid = d.depid group by d.name having count(e.name) < 2 160 | ``` 161 | 162 | ``` 163 | Execution of command 'select count(e.name), d.name from employees e inner join department d on e.depid = d.depid group by d.name having count(e.name) < 2' succeeded 164 | Record(expr(expr=funccall(name=token('identifier', 'count'), args=[expr(expr=columnname(name=token('scoped_identifier', 'e.name')))])): 1, d.name: sales)``` 165 | ``` 166 | 167 | ### Order By Clause 168 | Now consider that we may want to order the results based on the value of one or more columns. 169 | We can achieve this using an `order by` clause. 170 | 171 | ``` 172 | db > select name, salary from employees order by salary desc 173 | ``` 174 | 175 | Order by multiple columns 176 | ``` 177 | db > select name, salary from employees order by salary desc, name asc 178 | ``` 179 | 180 | Note, however, the columns referred to in order by clause must be in the select clause. To illustrate this point, consider: 181 | 182 | ``` 183 | db > CREATE TABLE fruits (id INTEGER PRIMARY KEY, name TEXT, avg_weight INTEGER); 184 | ... 185 | db > INSERT INTO fruits (id, name, avg_weight) values (1, 'apple', 200); 186 | ... 187 | db > INSERT INTO fruits (id, name, avg_weight) values (2, 'orange', 140); 188 | ... 189 | ``` 190 | 191 | Specifically, the below will fail: 192 | ``` 193 | select name from fruits order by id 194 | ``` 195 | However, by including the `id` in the select clause, this issue can be overcome, i.e. 196 | ``` 197 | select name, id from fruits order by id 198 | ``` 199 | 200 | ### Limit Clause 201 | ``` 202 | db > select name, salary from employees order by salary desc, name asc limit 10 203 | ``` 204 | 205 | ## Supported meta-commands: 206 | quit REPl 207 | > .quit 208 | 209 | print btree 210 | > .btree 211 | 212 | performs internal consistency checks on tree 213 | > .validate 214 | 215 | 216 | ## Hacking/Development.md 217 | - Instructions here to how to start developing, i.e. how to setup an ide, and step through code and tests 218 | 219 | ## Current Limitations 220 | - repl can only accept a single line, i.e. command can not be split, over multiple lines. 221 | - No support for select star, i.e. 
`select * from foo` 222 | - Input sql can contain column names in mixed case. However, internally names are stored and accessed with the lower case version of the name. 223 | - join type must be explicit, i.e. for inner join, "inner" is required 224 | - any column used in order by clause, must appear in select -------------------------------------------------------------------------------- /docs/why.md: -------------------------------------------------------------------------------- 1 | # Why 2 | The motivation was to build a complete enough DBMS system, that was minimally complex. -------------------------------------------------------------------------------- /learndb/__init__.py: -------------------------------------------------------------------------------- 1 | # these are exposed to user 2 | from .interface import LearnDB, repl, devloop, parse_args_and_start # noqa: F401 3 | -------------------------------------------------------------------------------- /learndb/constants.py: -------------------------------------------------------------------------------- 1 | # operational constants 2 | EXIT_SUCCESS = 0 3 | EXIT_FAILURE = 1 4 | 5 | DB_FILE = "db.file" 6 | # TODO: nuke here 7 | # TEST_DB_FILE = 'testdb.file' 8 | 9 | # storage constants 10 | # NOTE: storage and btree constants that affect how the 11 | # db file is written, should not be changed once a db file is created. 12 | PAGE_SIZE = 4096 13 | WORD = 4 14 | 15 | # file header constants 16 | FILE_HEADER_OFFSET = 0 17 | FILE_HEADER_SIZE = 100 18 | FILE_PAGE_AREA_OFFSET = FILE_HEADER_SIZE 19 | FILE_HEADER_VERSION_FIELD_OFFSET = 0 20 | FILE_HEADER_VERSION_FIELD_SIZE = 16 21 | # NOTE: The diff between size and len(FILE_HEADER_VERSION_VALUE) should be padding 22 | FILE_HEADER_VERSION_VALUE = b"learndb v1" 23 | # pointer to next node in free list 24 | FILE_HEADER_NEXT_FREE_PAGE_HEAD_OFFSET = ( 25 | FILE_HEADER_VERSION_FIELD_OFFSET + FILE_HEADER_VERSION_FIELD_SIZE 26 | ) 27 | FILE_HEADER_NEXT_FREE_PAGE_HEAD_SIZE = WORD 28 | # whether there is a free list 29 | FILE_HEADER_HAS_FREE_PAGE_LIST_OFFSET = ( 30 | FILE_HEADER_NEXT_FREE_PAGE_HEAD_OFFSET + FILE_HEADER_NEXT_FREE_PAGE_HEAD_SIZE 31 | ) 32 | FILE_HEADER_HAS_FREE_PAGE_LIST_SIZE = WORD 33 | FILE_HEADER_PADDING = ( 34 | FILE_HEADER_SIZE 35 | - FILE_HEADER_VERSION_FIELD_SIZE 36 | - FILE_HEADER_NEXT_FREE_PAGE_HEAD_SIZE 37 | ) 38 | assert FILE_HEADER_PADDING >= 0, "file header overflow" 39 | # pager constants 40 | FREE_PAGE_HAS_NEXT_FREE_PAGE_HEAD_OFFSET = 0 41 | FREE_PAGE_HAS_NEXT_FREE_PAGE_HEAD_SIZE = WORD 42 | FREE_PAGE_NEXT_FREE_PAGE_HEAD_OFFSET = 0 43 | FREE_PAGE_NEXT_FREE_PAGE_HEAD_SIZE = WORD 44 | 45 | # btree constants 46 | TABLE_MAX_PAGES = 100 47 | 48 | # represents a null value in header 49 | NULLPTR = 0 50 | 51 | # serialized data layout (tree nodes) 52 | # common node header layout 53 | NODE_TYPE_SIZE = WORD 54 | NODE_TYPE_OFFSET = 0 55 | IS_ROOT_SIZE = WORD 56 | IS_ROOT_OFFSET = NODE_TYPE_SIZE 57 | PARENT_POINTER_SIZE = WORD 58 | PARENT_POINTER_OFFSET = NODE_TYPE_SIZE + IS_ROOT_SIZE 59 | COMMON_NODE_HEADER_SIZE = NODE_TYPE_SIZE + IS_ROOT_SIZE + PARENT_POINTER_SIZE 60 | 61 | # Internal node body layout 62 | # layout: 63 | # nodetype .. is_root .. parent_pointer 64 | # num_keys .. right-child-ptr 65 | # ptr 0 .. key 0 .. 
ptr N-1 key N-1 66 | INTERNAL_NODE_NUM_KEYS_SIZE = WORD 67 | INTERNAL_NODE_NUM_KEYS_OFFSET = COMMON_NODE_HEADER_SIZE 68 | INTERNAL_NODE_RIGHT_CHILD_SIZE = WORD 69 | INTERNAL_NODE_RIGHT_CHILD_OFFSET = ( 70 | INTERNAL_NODE_NUM_KEYS_OFFSET + INTERNAL_NODE_NUM_KEYS_SIZE 71 | ) 72 | INTERNAL_NODE_HAS_RIGHT_CHILD_SIZE = WORD 73 | INTERNAL_NODE_HAS_RIGHT_CHILD_OFFSET = ( 74 | INTERNAL_NODE_RIGHT_CHILD_OFFSET + INTERNAL_NODE_HAS_RIGHT_CHILD_SIZE 75 | ) 76 | INTERNAL_NODE_HEADER_SIZE = ( 77 | COMMON_NODE_HEADER_SIZE 78 | + INTERNAL_NODE_NUM_KEYS_SIZE 79 | + INTERNAL_NODE_RIGHT_CHILD_SIZE 80 | + INTERNAL_NODE_HAS_RIGHT_CHILD_SIZE 81 | ) 82 | 83 | INTERNAL_NODE_KEY_SIZE = WORD 84 | # Ptr to child re 85 | INTERNAL_NODE_CHILD_SIZE = WORD 86 | INTERNAL_NODE_CELL_SIZE = INTERNAL_NODE_CHILD_SIZE + INTERNAL_NODE_KEY_SIZE 87 | INTERNAL_NODE_SPACE_FOR_CELLS = PAGE_SIZE - INTERNAL_NODE_HEADER_SIZE 88 | # INTERNAL_NODE_MAX_CELLS = INTERNAL_NODE_SPACE_FOR_CELLS / INTERNAL_NODE_CELL_SIZE 89 | 90 | # NOTE: this is limited for debugging/dev 91 | # NOTE: this should not dip below 3 due to the constraint of unary trees 92 | # cells, i.e. key, child ptr in the body 93 | INTERNAL_NODE_MAX_CELLS = 3 94 | # the +1 is for the right child 95 | INTERNAL_NODE_MAX_CHILDREN = INTERNAL_NODE_MAX_CELLS + 1 96 | INTERNAL_NODE_RIGHT_SPLIT_CHILD_COUNT = (INTERNAL_NODE_MAX_CHILDREN + 1) // 2 97 | INTERNAL_NODE_LEFT_SPLIT_CHILD_COUNT = ( 98 | INTERNAL_NODE_MAX_CHILDREN + 1 99 | ) - INTERNAL_NODE_RIGHT_SPLIT_CHILD_COUNT 100 | 101 | # leaf node header layout 102 | # old layout: 103 | # nodetype .. is_root .. parent_pointer 104 | # num_keys .. key 0 .. val 0 .. key N-1 val N-1 105 | # key 0 .. val 0 .. key N-1 val N-1 106 | 107 | # new layout 108 | # nodetype .. is_root .. parent_pointer 109 | # num_cells .. alloc_ptr .. free_list_head_ptr .. total_free_list_space 110 | # cellptr_0 .. cellptr_1 ... cellptr_N-1 111 | LEAF_NODE_NUM_CELLS_SIZE = WORD 112 | LEAF_NODE_NUM_CELLS_OFFSET = COMMON_NODE_HEADER_SIZE 113 | LEAF_NODE_ALLOC_POINTER_SIZE = WORD 114 | LEAF_NODE_ALLOC_POINTER_OFFSET = ( 115 | LEAF_NODE_NUM_CELLS_OFFSET + LEAF_NODE_ALLOC_POINTER_SIZE 116 | ) 117 | LEAF_NODE_FREE_LIST_HEAD_POINTER_SIZE = WORD 118 | LEAF_NODE_FREE_LIST_HEAD_POINTER_OFFSET = ( 119 | LEAF_NODE_ALLOC_POINTER_OFFSET + LEAF_NODE_FREE_LIST_HEAD_POINTER_SIZE 120 | ) 121 | LEAF_NODE_TOTAL_FREE_LIST_SPACE_SIZE = WORD 122 | LEAF_NODE_TOTAL_FREE_LIST_SPACE_OFFSET = ( 123 | LEAF_NODE_FREE_LIST_HEAD_POINTER_OFFSET + LEAF_NODE_TOTAL_FREE_LIST_SPACE_SIZE 124 | ) 125 | 126 | LEAF_NODE_HEADER_SIZE = ( 127 | COMMON_NODE_HEADER_SIZE 128 | + LEAF_NODE_NUM_CELLS_SIZE 129 | + LEAF_NODE_ALLOC_POINTER_SIZE 130 | + LEAF_NODE_FREE_LIST_HEAD_POINTER_SIZE 131 | + LEAF_NODE_TOTAL_FREE_LIST_SPACE_SIZE 132 | ) 133 | 134 | # location where cell point start 135 | LEAF_NODE_CELL_POINTER_START = LEAF_NODE_HEADER_SIZE 136 | LEAF_NODE_CELL_POINTER_SIZE = WORD 137 | 138 | # cell constants 139 | 140 | # NOTE: this is intended to support APIs that expect fixed size key 141 | # this is the older api; likely can be removed once btree older api is pruned 142 | LEAF_NODE_KEY_SIZE = WORD 143 | # NOTE: these are relative to beginning of cell 144 | CELL_KEY_SIZE_OFFSET = 0 145 | # the size of the key-size field 146 | CELL_KEY_SIZE_SIZE = WORD 147 | CELL_DATA_SIZE_OFFSET = CELL_KEY_SIZE_OFFSET + CELL_KEY_SIZE_SIZE 148 | CELL_DATA_SIZE_SIZE = WORD 149 | CELL_KEY_PAYLOAD_OFFSET = CELL_DATA_SIZE_OFFSET + CELL_DATA_SIZE_SIZE 150 | # space excluding headers, i.e. 
only space for cells and cellptr 151 | LEAF_NODE_NON_HEADER_SPACE = PAGE_SIZE - LEAF_NODE_HEADER_SIZE 152 | # max cell that can fit on page is non-header space and 1 cell ptr 153 | LEAF_NODE_MAX_CELL_SIZE = LEAF_NODE_NON_HEADER_SPACE - LEAF_NODE_CELL_POINTER_SIZE 154 | 155 | # free-block constants 156 | # NOTE: these are relative to start of a free block 157 | FREE_BLOCK_SIZE_SIZE = WORD 158 | FREE_BLOCK_SIZE_OFFSET = 0 159 | FREE_BLOCK_NEXT_BLOCK_SIZE = WORD 160 | FREE_BLOCK_NEXT_BLOCK_OFFSET = FREE_BLOCK_SIZE_OFFSET + FREE_BLOCK_SIZE_SIZE 161 | FREE_BLOCK_HEADER_SIZE = FREE_BLOCK_SIZE_SIZE + FREE_BLOCK_NEXT_BLOCK_SIZE 162 | 163 | # NOTE: this is limited for debugging/dev 164 | LEAF_NODE_MAX_CELLS = 3 165 | 166 | 167 | # serde constants 168 | # length of encoded bytes 169 | INTEGER_SIZE = WORD 170 | REAL_SIZE = WORD 171 | # when real numbers are stored there is some rounding error 172 | # hence two real numbers where the abs difference is less than `REAL_EPSILON`, are considered equal 173 | # NOTE: I just ballparked this epsilon; in actuality the diff will likely depend on the absolute 174 | # value of the real number 175 | REAL_EPSILON = 0.00001 176 | 177 | # Higher-level constanst 178 | # name of catalog 179 | CATALOG = "catalog" 180 | CATALOG_ROOT_PAGE_NUM = 0 181 | 182 | USAGE = """ 183 | Supported meta-commands: 184 | ------------------------ 185 | print usage 186 | .help 187 | 188 | quit REPl 189 | > .quit 190 | 191 | print btree for table 192 | > .btree 193 | 194 | performs internal consistency checks on table 195 | > .validate 196 | 197 | Supported commands: 198 | ------------------- 199 | The following lists supported commands, and an example. For a complete grammar see docs/sql-lang.txt 200 | 201 | Create table 202 | > create table customers ( cust_id integer primary key, cust_name text, cust_height float) 203 | 204 | Insert records 205 | > insert into customers ( cust_id, cust_name, cust_height) values (1, 'Bob Maharaj', 162.5 ) 206 | 207 | Select some rows, only supports equality predicate 208 | > select cust_name, cust_height from customers 209 | 210 | Delete (only single equality predicate supported) 211 | > delete from customers where cust_name = "Bob Maharaj" 212 | """ 213 | -------------------------------------------------------------------------------- /learndb/cursor.py: -------------------------------------------------------------------------------- 1 | from .constants import INTERNAL_NODE_MAX_CELLS 2 | from .btree import Tree, NodeType 3 | from .pager import Pager 4 | 5 | 6 | class Cursor: 7 | """ 8 | Represents a cursor. A cursor understands how to navigate 9 | a database (on-disk) page, i.e. reading and understanding header values. 10 | A cursor exposes an interface to read, insert and delete rows. 
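    For illustration, a full-table scan using this interface could look roughly like
    (a rough usage sketch, based on the methods below):

        cursor = Cursor(pager, tree)
        while not cursor.end_of_table:
            cell = cursor.get_cell()
            # ... deserialize / process the cell ...
            cursor.advance()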
11 | """ 12 | 13 | def __init__(self, pager: Pager, tree: Tree): 14 | self.tree = tree 15 | self.pager = pager 16 | self.page_num = tree.root_page_num 17 | self.cell_num = 0 18 | self.end_of_table = False 19 | self.first_leaf() 20 | 21 | def first_leaf(self): 22 | """ 23 | set cursor location to left-most/first leaf 24 | """ 25 | # start with root and descend until we hit left most leaf 26 | node = self.pager.get_page(self.page_num) 27 | while Tree.get_node_type(node) == NodeType.NodeInternal: 28 | assert Tree.internal_node_has_right_child( 29 | node 30 | ), "invalid tree with no right child" 31 | if Tree.internal_node_num_keys(node) == 0: 32 | # get right child- unary tree 33 | child_page_num = Tree.internal_node_right_child(node) 34 | else: 35 | child_page_num = Tree.internal_node_child(node, 0) 36 | self.page_num = child_page_num 37 | node = self.pager.get_page(child_page_num) 38 | 39 | self.cell_num = 0 40 | # node must be leaf node 41 | self.end_of_table = Tree.leaf_node_num_cells(node) == 0 42 | 43 | def get_cell(self) -> bytes: 44 | """ 45 | return cell pointed by cursor 46 | :return: 47 | """ 48 | node = self.pager.get_page(self.page_num) 49 | cell = Tree.leaf_node_cell(node, self.cell_num) 50 | return cell 51 | 52 | def next_leaf(self): 53 | """ 54 | move self.page_num and self.cell_num to next leaf and next cell 55 | this method requires the self.page_num start at a leaf node. 56 | 57 | NOTE: if starting from an internal node, to get to a leaf use `first_leaf` method 58 | :return: 59 | """ 60 | # starting point 61 | node = self.pager.get_page(self.page_num) 62 | if Tree.is_node_root(node) is True: 63 | # there is nothing 64 | self.end_of_table = True 65 | return 66 | 67 | node_max_value = self.tree.get_node_max_key(node) 68 | assert node_max_value is not None 69 | 70 | parent_page_num = Tree.get_parent_page_num(node) 71 | # check if current page, i.e. self.page_num is right most child of it's parent 72 | parent = self.pager.get_page(parent_page_num) 73 | child_num = self.tree.internal_node_find(parent_page_num, node_max_value) 74 | if child_num == INTERNAL_NODE_MAX_CELLS: 75 | # this is the right child; thus all children have been consumed 76 | # go up another level 77 | self.page_num = parent_page_num 78 | self.next_leaf() 79 | else: 80 | # there is at least one child to be consumed 81 | # find the next child 82 | if child_num == Tree.internal_node_num_keys(parent) - 1: 83 | # next child is the right child 84 | next_child = Tree.internal_node_right_child(parent) 85 | else: 86 | next_child = Tree.internal_node_child(parent, child_num + 1) 87 | self.page_num = next_child 88 | # now find first leaf in next child 89 | self.first_leaf() 90 | 91 | def advance(self): 92 | """ 93 | advance the cursor 94 | 1) from left most leaf node to right most leaf node 95 | 2) from leftmost cell to right most cell 96 | :return: 97 | """ 98 | # advance always start at leaf node and ends at a leaf node; 99 | # starting at or ending at an internal node means the cursor is inconsistent 100 | node = self.pager.get_page(self.page_num) 101 | # we are currently on the last cell in the node 102 | # go to the next node if it exists 103 | if self.cell_num >= Tree.leaf_node_num_cells(node) - 1: 104 | self.next_leaf() 105 | else: 106 | self.cell_num += 1 107 | -------------------------------------------------------------------------------- /learndb/dataexchange.py: -------------------------------------------------------------------------------- 1 | """ 2 | Contains classes used for data exchange, i.e. 
3 | do not have any "compute" methods. 4 | """ 5 | from typing import Any, TypeVar, Generic 6 | from dataclasses import dataclass 7 | from enum import Enum, auto 8 | 9 | # This is used to parameterize Response type as per: https://stackoverflow.com/a/42989302 10 | T = TypeVar("T") 11 | 12 | 13 | # section result enums 14 | 15 | 16 | # NOTE: now that I'm returning Response objects 17 | # I don't need a Success enums - this was also 18 | # when I was following 19 | class MetaCommandResult(Enum): 20 | Success = auto() 21 | UnrecognizedCommand = auto() 22 | InvalidArgument = auto() 23 | 24 | 25 | class StatementType(Enum): 26 | Uninitialized = auto() 27 | Insert = auto() 28 | Select = auto() 29 | Delete = auto() 30 | 31 | 32 | @dataclass 33 | class Response(Generic[T]): 34 | """ 35 | Use as a generic class to encapsulate a response and a body 36 | """ 37 | 38 | # is success 39 | success: bool 40 | # if fail, why 41 | error_message: str = None 42 | # an enum encoding state 43 | status: Any = None 44 | # output of operation 45 | body: T = None 46 | 47 | def __str__(self): 48 | if self.error_message: 49 | return f"Response(fail, {self.error_message})" 50 | else: 51 | return f"Response(success, {str(self.body)})" 52 | 53 | def __repr__(self): 54 | return self.__str__() 55 | -------------------------------------------------------------------------------- /learndb/datatypes.py: -------------------------------------------------------------------------------- 1 | """ 2 | Database, i.e. storage layer datatypes, as distinct from: 1) parsed AST datatype, 2) execution datatype (i.e. impl 3 | language datatype) 4 | """ 5 | import sys 6 | import struct 7 | from abc import ABCMeta 8 | from typing import Any, Type 9 | 10 | from .constants import INTEGER_SIZE, REAL_SIZE 11 | 12 | 13 | class DataType: 14 | """ 15 | This is a datatype of a value in the database. 16 | 17 | This provides an interface to provide serde of implemented type 18 | and details of underlying encoding. 19 | 20 | Note: There can be multiple underlying (physical) types for a given 21 | datatype. Datatype is logical and can be implemented in multiple 22 | ways (i.e. details of the serde, encoding length) 23 | 24 | """ 25 | 26 | __metaclass__ = ABCMeta 27 | # non-serializable types are fixed value types, e.g. 
null, true, false 28 | # which are in fact only encoded in the header, and not in the data payload 29 | is_serializable = False 30 | is_fixed_length = False 31 | fixed_length = 0 32 | typename = "Untyped" 33 | 34 | @staticmethod 35 | def serialize(value) -> bytes: 36 | """ 37 | serialize argument `value` to byte string 38 | :param value: 39 | :return: 40 | """ 41 | raise NotImplementedError 42 | 43 | @staticmethod 44 | def deserialize(bstring) -> Any: 45 | """ 46 | deserialize argument byte string to value (of given type) 47 | :param bstring: 48 | :return: 49 | """ 50 | raise NotImplementedError 51 | 52 | @staticmethod 53 | def is_valid_term(term) -> bool: 54 | """ 55 | return True if term can be converted 56 | to datatype 57 | :param term: 58 | :return: 59 | """ 60 | raise NotImplementedError 61 | 62 | 63 | class Integer(DataType): 64 | """ 65 | Represents a fixed-size integer 66 | """ 67 | 68 | is_fixed_length = True 69 | fixed_length = INTEGER_SIZE 70 | is_serializable = True 71 | typename = "Integer" 72 | 73 | @staticmethod 74 | def serialize(value: int) -> bytes: 75 | # print("In integer::serialize") 76 | return value.to_bytes(INTEGER_SIZE, sys.byteorder) 77 | 78 | @staticmethod 79 | def deserialize(bstring: bytes) -> int: 80 | return int.from_bytes(bstring, sys.byteorder) 81 | 82 | @staticmethod 83 | def is_valid_term(term) -> bool: 84 | return isinstance(term, int) 85 | 86 | 87 | class Real(DataType): 88 | """ 89 | Represents a fixed-size floating point number. 90 | Note: The usual concerns around finite-precision and 91 | rounding hold here. 92 | """ 93 | 94 | is_fixed_length = True 95 | fixed_length = REAL_SIZE 96 | is_serializable = True 97 | typename = "Real" 98 | 99 | @staticmethod 100 | def serialize(value: float) -> bytes: 101 | """ 102 | :param value: 103 | :return: 104 | """ 105 | # encodes float according to native byteorder ('=') 106 | return struct.pack("=f", value) 107 | 108 | @staticmethod 109 | def deserialize(bstring) -> float: 110 | """ 111 | :param value: 112 | :return: 113 | """ 114 | tpl = struct.unpack("=f", bstring) 115 | return tpl[0] 116 | 117 | @staticmethod 118 | def is_valid_term(term) -> bool: 119 | return isinstance(term, float) 120 | 121 | 122 | class Text(DataType): 123 | """ 124 | represents a variable length text 125 | """ 126 | 127 | is_fixed_length = False 128 | fixed_length = 0 129 | is_serializable = True 130 | typename = "Text" 131 | 132 | @staticmethod 133 | def serialize(value: str): 134 | return value.encode("utf-8") 135 | 136 | @staticmethod 137 | def deserialize(bstring: bytes): 138 | return bstring.decode("utf-8") 139 | 140 | @staticmethod 141 | def is_valid_term(term) -> bool: 142 | return isinstance(term, str) 143 | 144 | 145 | class Boolean(DataType): 146 | """ 147 | represents a variable length text 148 | """ 149 | 150 | is_fixed_length = False 151 | fixed_length = 0 152 | is_serializable = True 153 | typename = "Text" 154 | 155 | @staticmethod 156 | def serialize(value: bool): 157 | return struct.pack("=?", value) 158 | 159 | @staticmethod 160 | def deserialize(bstring: bytes): 161 | tpl = struct.unpack("=?", bstring) 162 | return tpl[0] 163 | 164 | @staticmethod 165 | def is_valid_term(term) -> bool: 166 | return isinstance(term, bool) 167 | 168 | 169 | class Null(DataType): 170 | """ 171 | Represents a null type. Will represent this as a 172 | fixed length 0. This could be encoded more efficiently. 173 | 174 | Further, consider, whether records with nulls will 175 | actually store nulls, i.e. 
will I use a sparse representation 176 | to encode the record. However, such sparse representations generally 177 | require storing the schema definition per record and some checks/flags 178 | to determine whether a given value is null or not. 179 | 180 | translate 181 | to slower ops on defined values, since there are additional checks/flags 182 | needed to access the value. 183 | 184 | """ 185 | 186 | is_fixed_length = True 187 | fixed_length = 0 188 | is_serializable = False 189 | typename = "Null" 190 | 191 | 192 | class Blob(DataType): 193 | """ 194 | This represent an as-is byte string 195 | """ 196 | 197 | is_fixed_length = False 198 | fixed_length = 0 199 | is_serializable = True 200 | typename = "Blob" 201 | 202 | @staticmethod 203 | def serialize(value: bytes) -> bytes: 204 | return value 205 | 206 | @staticmethod 207 | def deserialize(bstring: bytes) -> bytes: 208 | return bstring 209 | 210 | 211 | def is_term_valid_for_datatype(data_type: Type[DataType], term: Any) -> bool: 212 | """ 213 | Return True, if term is valid for given datatype 214 | """ 215 | return data_type.is_valid_term(term) 216 | -------------------------------------------------------------------------------- /learndb/functions.py: -------------------------------------------------------------------------------- 1 | """ 2 | This implements functions. Specifically: 3 | - the function declaration interface 4 | -- and the ability to get the return type 5 | - Here, I assume the datatype are the same as those exposed to ddl definitions, i.e. 6 | Integer, Float, Text, Blob. 7 | - the ability to execute the functions 8 | 9 | There will be 2 kinds of functions: native and defined in language 10 | Functions declarations should apply to both, but only lang functions will have a Function object. 11 | Native functions will have a declaration. 12 | """ 13 | 14 | from typing import List, Dict, Any, Callable, Type, TypeVar, Union 15 | 16 | 17 | from .dataexchange import Response 18 | from .datatypes import DataType, Integer, Real 19 | 20 | 21 | T = TypeVar("T") 22 | 23 | 24 | class InvalidFunctionArguments(Exception): 25 | """ 26 | function was invoked with invalid args. 27 | For position args, either the arity or type didn't match 28 | For named args, either name didn't exist, or type didn't match 29 | """ 30 | 31 | 32 | class FunctionDefinition: 33 | """ 34 | Represents a function definition, for both scalar and aggregate functions. 35 | 36 | :param func_name: name of function; not strictly needed; used for debugging 37 | :param pos_params: list of positional arguments. 38 | NOTE: a scalar function can accept multiple positional arguments, 39 | while an aggregate function can accept only a single positional argument- a list of values (to operate on) 40 | :param named_params: 41 | :param func_body: callable function body 42 | :param return_type: return type of function 43 | :return: 44 | 45 | FUTURE_NOTE: Currently, pos_params are represented as a List[DataType]. 46 | A named_params are represented as Dict[str, DataType], where the key is the param name. 47 | All pos_params will always be required. 48 | However, in the future, we may want to support named params with default values. 49 | In that case, it may be easier to represent this with a new type, 50 | e.g. 
NamedParam(arg_type: DataType, has_default_value: bool, default_value: Any) 51 | """ 52 | 53 | def __init__( 54 | self, 55 | func_name: str, 56 | pos_params: List[Union[Type[DataType], List[Type[DataType]]]], 57 | named_params: Dict[str, Type[DataType]], 58 | func_body: Callable, 59 | return_type: Type[DataType], 60 | ): 61 | self.name = func_name 62 | self.pos_params = pos_params 63 | self.named_params = named_params 64 | self.body = func_body 65 | self._return_type = return_type 66 | 67 | def __str__(self): 68 | return f"FunctionDefinition[{self.name}]" 69 | 70 | def __repr__(self): 71 | return self.__str__() 72 | 73 | @property 74 | def return_type(self) -> Type[DataType]: 75 | return self._return_type 76 | 77 | @staticmethod 78 | def is_valid_term(param: Type[DataType], term) -> bool: 79 | """Check if term matches param""" 80 | if param == DataType: 81 | # means any type 82 | return True 83 | return param.is_valid_term(term) 84 | 85 | def validate_args( 86 | self, pos_args: List[Any], named_args: Dict[str, Any] 87 | ) -> Response: 88 | """ 89 | Validate pos and named args. 90 | Validate pos args on existence and type; 91 | validate named args on name, and type 92 | Args: 93 | pos_args: list of literals 94 | named_args: dict of param_name -> literal 95 | """ 96 | # 1. validate positional params 97 | # 1.1. check arity- positional params are all required 98 | if len(pos_args) != len(self.pos_params): 99 | return Response( 100 | False, 101 | error_message=f"Arity mismatch between expected positional params [{len(pos_args)}] " 102 | f"and received args [{len(self.pos_params)}]", 103 | ) 104 | # 1.2. validate types 105 | for idx, arg in enumerate(pos_args): 106 | param = self.pos_params[idx] 107 | # 1.2.1. collection type 108 | if isinstance(param, list): 109 | # not sure if this is the best check, since param may be an collection type 110 | # but checking for Iterable might catch false positive, which are atom but iterable, e.g. strings 111 | for item in param: 112 | # check each item in the collection 113 | for value in arg: 114 | if not self.is_valid_term(item, value): 115 | return Response( 116 | False, 117 | error_message=f"Invalid positional argument type [{arg}] at index {idx}. " 118 | f"Expected argument of type [{item.typename}]", 119 | ) 120 | else: 121 | # 1.2.2. arg is a literal 122 | if not self.is_valid_term(param, arg): 123 | return Response( 124 | False, 125 | error_message=f"Invalid positional argument type [{arg}] at index {idx}. " 126 | f"Expected argument of type [{param.typename}]", 127 | ) 128 | 129 | # 2. validate named params 130 | # 2.1. validate arity - for now all named params are required 131 | if len(named_args) != len(self.named_params): 132 | return Response( 133 | False, 134 | error_message=f"Arity mismatch between expected named params [{len(named_args)}] " 135 | f"and received args [{self.named_params}]", 136 | ) 137 | # validate existence and type 138 | for arg_name, arg_value in named_args.items(): 139 | if arg_name not in self.named_params: 140 | return Response( 141 | False, error_message=f"Unexpected named argument [{arg_name}]" 142 | ) 143 | else: 144 | param = self.named_params[arg_name] 145 | param.is_valid_term(arg_value) 146 | return Response( 147 | False, 148 | error_message=f"Invalid named argument type [{arg_name}] for param [{arg_name}]." 
149 | f"Expected argument of type [{param.typename}]", 150 | ) 151 | 152 | return Response(True) 153 | 154 | def apply(self, pos_args: List[Any], named_args: Dict[str, Any]): 155 | """ 156 | This models native functions, where each specific function 157 | provides a callable `body`. 158 | For a function in leardb-sql, we will have to walk an AST. 159 | 160 | This accepts a list of `pos_args` and a dict of `named_args` 161 | This method first evaluates that the args match what is expected by the function definition. 162 | Then invokes the actual function body/impl 163 | """ 164 | # 1. validate args 165 | resp = self.validate_args(pos_args, named_args) 166 | if not resp.success: 167 | raise InvalidFunctionArguments( 168 | f"Invocation of function [{self.name}] failed with: {resp.error_message}" 169 | ) 170 | 171 | # 2. apply function to args 172 | return self.body(*pos_args, **named_args) 173 | 174 | 175 | # scalar function definitions 176 | 177 | 178 | def number_square_function_body(x: T) -> T: 179 | """ 180 | Body for integer/float square 181 | """ 182 | return x * x 183 | 184 | 185 | # square an int 186 | integer_square_function = FunctionDefinition( 187 | "integer_square", [Integer], {}, number_square_function_body, Integer 188 | ) 189 | float_square_function = FunctionDefinition( 190 | "float_square", [Real], {}, number_square_function_body, Real 191 | ) 192 | 193 | 194 | # aggregate function definitions 195 | 196 | 197 | def value_count_function_body(values: List[Any]) -> int: 198 | """ 199 | Note: count(*) counts every row (not supported in learndb) 200 | count(column) should only count non-null columns 201 | """ 202 | count = 0 203 | for value in values: 204 | if value is not None: 205 | count += 1 206 | return count 207 | 208 | 209 | # a type of datatype means, it can accept any type 210 | count_function = FunctionDefinition( 211 | "count", [[DataType]], {}, value_count_function_body, Integer 212 | ) 213 | 214 | # if we have same function for integers and floats, we'll name the int function 215 | # with not qualifiers, and name the float function with _float qualifier 216 | _SCALAR_FUNCTION_REGISTRY = { 217 | "square": integer_square_function, 218 | "square_float": float_square_function, 219 | } 220 | 221 | _AGGREGATE_FUNCTION_REGISTRY = {"count": count_function} 222 | 223 | 224 | # public functions 225 | 226 | 227 | def resolve_function_name(name: str) -> FunctionDefinition: 228 | """ 229 | Resolve function name, i.e. lookup name in registry. 230 | In the future this could be extended to support, 231 | dynamic dispatch, etc. 
232 | """ 233 | name = name.lower() 234 | if name in _SCALAR_FUNCTION_REGISTRY: 235 | return _SCALAR_FUNCTION_REGISTRY[name] 236 | elif name in _AGGREGATE_FUNCTION_REGISTRY: 237 | return _AGGREGATE_FUNCTION_REGISTRY[name] 238 | 239 | raise ValueError(f"Unable to find function [{name}]") 240 | 241 | 242 | def get_scalar_functions_names() -> List[str]: 243 | """Return list of all scalar function names""" 244 | return list(_SCALAR_FUNCTION_REGISTRY.keys()) 245 | 246 | 247 | def get_aggregate_functions_names() -> List[str]: 248 | """Return list of all aggregate function names""" 249 | return list(_AGGREGATE_FUNCTION_REGISTRY.keys()) 250 | 251 | 252 | def is_aggregate_function(func_name: str) -> bool: 253 | """ 254 | Return bool if function is an aggregate function 255 | """ 256 | func_name = func_name.lower() 257 | return func_name in _AGGREGATE_FUNCTION_REGISTRY 258 | 259 | 260 | def is_scalar_function(func_name: str) -> bool: 261 | """ 262 | Return bool if function is an scalar function 263 | """ 264 | func_name = func_name.lower() 265 | return func_name in _SCALAR_FUNCTION_REGISTRY 266 | 267 | 268 | def resolve_scalar_func_name(func_name: str) -> Response: 269 | if is_scalar_function(func_name): 270 | return Response(True, body=resolve_function_name(func_name)) 271 | return Response(False, error_message=f"Scalar function [{func_name}] not found") 272 | 273 | 274 | def resolve_aggregate_func_name(func_name: str) -> Response: 275 | if is_aggregate_function(func_name): 276 | return Response(True, body=resolve_function_name(func_name)) 277 | return Response(False, error_message=f"Aggregate function [{func_name}] not found") 278 | -------------------------------------------------------------------------------- /learndb/interface.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | """ 4 | This module contains the highest level user-interaction and resource allocation 5 | i.e. management of entities, like parser, virtual machine, pager, etc. that implement 6 | the DBMS functionality that is learndb. 7 | """ 8 | import os 9 | import os.path 10 | import sys 11 | import logging 12 | 13 | from typing import List 14 | 15 | from .constants import DB_FILE, USAGE, EXIT_SUCCESS 16 | from .lang_parser.sqlhandler import SqlFrontEnd 17 | from .lang_parser.symbols import Program 18 | from .dataexchange import Response, MetaCommandResult 19 | from .pipe import Pipe 20 | from .stress import run_add_del_stress_suite 21 | from .virtual_machine import VirtualMachine, VMConfig 22 | 23 | 24 | # section: core execution/user-interface logic 25 | 26 | 27 | def config_logging(): 28 | # config logger 29 | FORMAT = "[%(filename)s:%(lineno)s - %(funcName)s ] %(message)s" 30 | # log to file 31 | # logging.basicConfig(format=FORMAT, level=logging.DEBUG, filename=os.path.join(os.getcwd(), "log.log")) 32 | # log to stdout 33 | logging.basicConfig(format=FORMAT, level=logging.DEBUG) 34 | 35 | 36 | class LearnDB: 37 | """ 38 | This provides programmatic interface for interacting with databases managed by Learndb. 
39 | This class defines the handle 40 | 41 | An example flow is like: 42 | ``` 43 | # create handler instance 44 | db = LearnDB(db_filepath) 45 | 46 | # submit statement 47 | resp = db.handle_input("select col_a from foo") 48 | assert resp.success 49 | 50 | # below are only needed to read results of statements that produce output 51 | # get output pipe 52 | pipe = db.get_pipe() 53 | 54 | # print rows 55 | while pipe.has_msgs(): 56 | print(pipe.read()) 57 | 58 | # close handle - flushes any in-memory state 59 | db.close() 60 | ``` 61 | """ 62 | 63 | def __init__(self, db_filepath: str, nuke_db_file: bool = False): 64 | """ 65 | :param db_filepath: path to DB file; i.e. file that stores state of this database 66 | :param nuke_db_file: whether to nuke the file before self is initialized 67 | """ 68 | self.db_filepath = db_filepath 69 | # NOTE: the method 70 | if nuke_db_file and os.path.exists(self.db_filepath): 71 | os.remove(self.db_filepath) 72 | self.pipe = None 73 | self.virtual_machine = None 74 | self.configure() 75 | self.reset() 76 | 77 | def reset(self): 78 | """ 79 | Reset state. Recreates pipe and virtual_machine. 80 | """ 81 | config = VMConfig(self.db_filepath) 82 | self.pipe = Pipe() 83 | if self.virtual_machine: 84 | self.virtual_machine.terminate() 85 | self.virtual_machine = VirtualMachine(config, self.pipe) 86 | 87 | def configure(self): 88 | """ 89 | Handle any configuration tasks 90 | """ 91 | config_logging() 92 | 93 | def nuke_dbfile(self): 94 | """ 95 | remove db file. 96 | This effectively restarts the instance into a clean state. 97 | :return: 98 | """ 99 | if os.path.exists(self.db_filepath): 100 | os.remove(self.db_filepath) 101 | self.reset() 102 | 103 | def get_pipe(self) -> Pipe: 104 | """ 105 | NOTE: get pipe; pipes are recycled if LearnDB.reset is invoked 106 | :return: 107 | """ 108 | return self.pipe 109 | 110 | def close(self): 111 | """ 112 | NOTE: must be called before exiting, to persist data to disk 113 | :return: 114 | """ 115 | self.virtual_machine.terminate() 116 | 117 | def handle_input(self, input_buffer: str) -> Response: 118 | """ 119 | handle input- parse and execute 120 | 121 | :param input_buffer: 122 | :return: 123 | """ 124 | return self.input_handler(input_buffer) 125 | 126 | @staticmethod 127 | def is_meta_command(command: str) -> bool: 128 | return command and command[0] == "." 
129 | 130 | def do_meta_command(self, command: str) -> Response: 131 | """ 132 | handle execution of meta command 133 | :param command: 134 | :param db: 135 | :return: 136 | """ 137 | if command == ".quit": 138 | print("goodbye") 139 | self.close() 140 | sys.exit(EXIT_SUCCESS) 141 | elif command.startswith(".btree"): 142 | # .btree expects table-name 143 | splits = command.split(" ") 144 | if len(splits) != 2: 145 | print("Invalid argument to .btree| Usage: > .btree ") 146 | return Response(False, status=MetaCommandResult.InvalidArgument) 147 | tree_name = splits[1] 148 | print("Printing tree" + "-" * 50) 149 | self.virtual_machine.state_manager.print_tree(tree_name) 150 | print("Finished printing tree" + "-" * 50) 151 | return Response(True, status=MetaCommandResult.Success) 152 | elif command == ".validate": 153 | print("Validating tree....") 154 | splits = command.split(" ") 155 | if len(splits) != 2: 156 | print("Invalid argument to .validate| Usage: > .validate ") 157 | return Response(False, status=MetaCommandResult.InvalidArgument) 158 | tree_name = splits[1] 159 | self.virtual_machine.state_manager.validate_tree(tree_name) 160 | print("Validation succeeded.......") 161 | return Response(True, status=MetaCommandResult.Success) 162 | elif command == ".nuke": 163 | self.nuke_dbfile() 164 | elif command == ".help": 165 | print(USAGE) 166 | return Response(True, status=MetaCommandResult.Success) 167 | return Response(False, status=MetaCommandResult.UnrecognizedCommand) 168 | 169 | @staticmethod 170 | def prepare_statement(command) -> Response: 171 | """ 172 | prepare statement, i.e. parse statement and 173 | return it's AST. For now the AST structure is the prepared 174 | statement. This may change, e.g. if frontend changes to output bytecode 175 | 176 | :param command: 177 | :return: 178 | """ 179 | parser = SqlFrontEnd() 180 | parser.parse(command) 181 | if not parser.is_success(): 182 | return Response( 183 | False, error_message=f"parse failed due to: [{parser.error_summary()}]" 184 | ) 185 | return Response(True, body=parser.get_parsed()) 186 | 187 | def execute_statement(self, program: Program) -> Response: 188 | """ 189 | execute statement; 190 | returns return value of child-invocation 191 | """ 192 | return self.virtual_machine.run(program) 193 | 194 | def input_handler(self, input_buffer: str) -> Response: 195 | """ 196 | receive input, parse input, and execute vm. 
197 | 198 | :param input_buffer: 199 | :return: 200 | """ 201 | if self.is_meta_command(input_buffer): 202 | m_resp = self.do_meta_command(input_buffer) 203 | if m_resp.success: 204 | return Response(True, status=MetaCommandResult.Success) 205 | 206 | print("Unable to process meta command") 207 | return Response(False, status=m_resp.status) 208 | 209 | p_resp = self.prepare_statement(input_buffer) 210 | if not p_resp.success: 211 | return Response(False, error_message=p_resp.error_message) 212 | 213 | # handle non-meta command 214 | # execute statement can be handled by the interpreter 215 | program = p_resp.body 216 | e_resp = self.execute_statement(program) 217 | if e_resp.success: 218 | print(f"Execution of command '{input_buffer}' succeeded") 219 | return Response(True, body=e_resp.body) 220 | else: 221 | print(f"Execution of command '{input_buffer}' failed") 222 | return Response(False, error_message=e_resp.error_message) 223 | 224 | 225 | def repl(db_filepath: str = DB_FILE): 226 | """ 227 | REPL (read-eval-print loop) for learndb 228 | """ 229 | 230 | # create Learndb handler 231 | db = LearnDB(db_filepath) 232 | 233 | print("Welcome to learndb") 234 | print("For help use .help") 235 | while True: 236 | input_buffer = input("db > ") 237 | resp = db.handle_input(input_buffer) 238 | if not resp.success: 239 | print(f"Command execution failed due to [{resp.error_message}] ") 240 | continue 241 | 242 | # get output pipe 243 | pipe = db.get_pipe() 244 | 245 | while pipe.has_msgs(): 246 | print(pipe.read()) 247 | 248 | 249 | def run_file(input_filepath: str, db_filepath: str = DB_FILE) -> Response: 250 | """ 251 | Execute statements in file. 252 | """ 253 | # create Learndb handler 254 | db = LearnDB(db_filepath) 255 | 256 | if not os.path.exists(input_filepath): 257 | return Response( 258 | False, error_message=f"Argument file [{input_filepath}] not found" 259 | ) 260 | 261 | with open(input_filepath) as fp: 262 | contents = fp.read() 263 | 264 | resp = db.handle_input(contents) 265 | if not resp.success: 266 | print(f"Command execution failed due to [{resp.error_message}] ") 267 | 268 | # get output pipe 269 | pipe = db.get_pipe() 270 | 271 | while pipe.has_msgs(): 272 | print(pipe.read()) 273 | 274 | db.close() 275 | 276 | 277 | def run_stress(db_filepath: str = DB_FILE): 278 | """ 279 | Run stress test 280 | """ 281 | db = LearnDB(db_filepath) 282 | run_add_del_stress_suite(db) 283 | 284 | 285 | def devloop(): 286 | """ 287 | This function can be in-place edited to run any arbitrary code 288 | """ 289 | 290 | # db = LearnDB(DB_FILE, nuke_db_file=True) 291 | db = LearnDB(DB_FILE) 292 | 293 | # texts = ["select name, salary from employees order by salary"] 294 | texts = ["select name, salary from employees order by salary asc, name desc"] 295 | texts = [ 296 | """CREATE TABLE fruits ( 297 | id INTEGER PRIMARY KEY, 298 | name TEXT, 299 | avg_weight INTEGER) 300 | """, 301 | "insert into fruits (id, name, avg_weight) values (1, 'apple', 200)", 302 | "insert into fruits (id, name, avg_weight) values (2, 'orange', 140)", 303 | "insert into fruits (id, name, avg_weight) values (3, 'pineapple', 1000)", 304 | "insert into fruits (id, name, avg_weight) values (4, 'grape', 5)", 305 | "insert into fruits (id, name, avg_weight) values (5, 'pear', 166)", 306 | "insert into fruits (id, name, avg_weight) values (6, 'mango', 150)", 307 | "insert into fruits (id, name, avg_weight) values (7, 'watermelon', 10000)", 308 | "insert into fruits (id, name, avg_weight) values (8, 'banana', 118)", 309 | 
"insert into fruits (id, name, avg_weight) values (9, 'peach', 147)", 310 | # "select name, id from fruits order by id limit 5" 311 | ] 312 | texts = [ 313 | "select name, avg_weight from fruits order by avg_weight, name desc limit 4" 314 | ] 315 | 316 | for text in texts: 317 | logging.info(f"handling. {text}") 318 | resp = db.handle_input(text) 319 | logging.info(f"received resp: {resp}") 320 | while db.pipe.has_msgs(): 321 | logging.info("read from pipe: {}".format(db.pipe.read())) 322 | 323 | db.close() 324 | 325 | 326 | def parse_args_and_start(args: List): 327 | """ 328 | parse args and starts 329 | :return: 330 | """ 331 | args_description = """Usage: 332 | python run.py repl 333 | // start repl 334 | python run.py devloop 335 | // start a dev-loop function 336 | python run.py file 337 | // read file at 338 | python stress.py 339 | // run stress test 340 | """ 341 | if len(args) < 1: 342 | print("Error: run-mode not specified") 343 | print(args_description) 344 | return 345 | 346 | runmode = args[0].lower() 347 | if runmode == "repl": 348 | repl() 349 | elif runmode == "stress": 350 | run_stress() 351 | elif runmode == "devloop": 352 | devloop() 353 | elif runmode == "file": 354 | # todo: and output file 355 | # if no output file, write to console 356 | if len(args) < 2: 357 | print("Error: Expected input filepath") 358 | print(args_description) 359 | return 360 | input_filepath = args[1].lower() 361 | run_file(input_filepath) 362 | else: 363 | print(f"Error: Invalid run mode [{runmode}]") 364 | print(args_description) 365 | return 366 | -------------------------------------------------------------------------------- /learndb/lang_parser/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/spandanb/learndb-py/242884e2418a09f480f17eac65e9c88e518bab1a/learndb/lang_parser/__init__.py -------------------------------------------------------------------------------- /learndb/lang_parser/grammar.py: -------------------------------------------------------------------------------- 1 | # lark grammar for a subset of learndb-sql using 2 | GRAMMAR = """ 3 | program : stmnt 4 | | terminated 5 | | (terminated)+ stmnt? 6 | 7 | ?terminated : stmnt ";" 8 | ?stmnt : select_stmnt | drop_stmnt | delete_stmnt | update_stmnt | truncate_stmnt | insert_stmnt 9 | | create_stmnt 10 | 11 | // we only want logically valid statements; and from is required for all other clauses 12 | // and so other clauses (e.g. where) are nested under from clause 13 | select_stmnt : select_clause from_clause? 14 | select_clause : "select"i selectable ("," selectable)* 15 | selectable : expr 16 | 17 | from_clause : "from"i source where_clause? group_by_clause? having_clause? order_by_clause? limit_clause? 18 | where_clause : "where"i condition 19 | group_by_clause : "group"i "by"i column_name ("," column_name)* 20 | having_clause : "having"i condition 21 | order_by_clause : "order"i "by"i ordered_column ("," ordered_column)* 22 | limit_clause : "limit"i INTEGER_NUMBER ("offset"i INTEGER_NUMBER)? 23 | 24 | source : single_source 25 | | joining 26 | 27 | single_source : table_name table_alias? 28 | 29 | ordered_column : column_name (asc|desc)? 30 | 31 | //split conditioned and unconditioned (cross) join as cross join does not have an on-clause 32 | ?joining : unconditioned_join | conditioned_join 33 | conditioned_join : source join_modifier? 
"join"i single_source "on"i condition 34 | unconditioned_join : source "cross"i "join"i single_source 35 | 36 | join_modifier : inner | left_outer | right_outer | full_outer 37 | 38 | inner : "inner"i 39 | left_outer : "left"i ["outer"i] 40 | right_outer : "right"i ["outer"i] 41 | full_outer : "full"i ["outer"i] 42 | cross : "cross"i 43 | 44 | asc : "asc"i 45 | | "ascending"i 46 | desc : "desc"i 47 | | "descending"i 48 | 49 | // `expr` is the de-facto root of the expression hierarchy 50 | expr : condition 51 | condition : or_clause 52 | or_clause : and_clause 53 | | or_clause "or"i and_clause 54 | and_clause : predicate 55 | | and_clause "and"i predicate 56 | 57 | // predicate and comparison are separate so =, <> have lower precedence than other comp ops 58 | predicate : comparison 59 | | predicate ( EQUAL | NOT_EQUAL ) comparison 60 | comparison : term 61 | | comparison ( LESS_EQUAL | GREATER_EQUAL | LESS | GREATER ) term 62 | term : factor 63 | | term ( MINUS | PLUS ) factor 64 | factor : unary 65 | | factor ( SLASH | STAR ) unary 66 | unary : primary 67 | | ( BANG | MINUS ) unary 68 | 69 | primary : literal 70 | | nested 71 | | column_name 72 | | func_call 73 | | "(" expr ")" 74 | 75 | literal : INTEGER_NUMBER | REAL_NUMBER | STRING | TRUE | FALSE | NULL 76 | 77 | nested : "(" select_stmnt | expr ")" 78 | 79 | // func calls; positional invocations only for now 80 | func_call : func_name "(" func_arg_list ")" 81 | // TODO: add support for named args in func_arg_list 82 | // arbitrary expr can be a function argument, since we want to support algebraic expressions on func arguments, 83 | // e.g. some_func(col_x + 1) 84 | func_arg_list : (expr ",")* expr 85 | 86 | create_stmnt : "create"i "table"i table_name "(" column_def_list ")" 87 | ?column_def_list : (column_def ",")* column_def 88 | ?column_def : column_name datatype primary_key? not_null? 89 | datatype : INTEGER | TEXT | BOOL | NULL | REAL 90 | 91 | primary_key : "primary"i "key"i 92 | not_null : "not"i "null"i 93 | 94 | drop_stmnt : "drop"i "table"i table_name 95 | 96 | insert_stmnt : "insert"i "into"i table_name "(" column_name_list ")" "values"i "(" value_list ")" 97 | column_name_list : (column_name ",")* column_name 98 | value_list : (literal ",")* literal 99 | 100 | delete_stmnt : "delete"i "from"i table_name where_clause? 101 | 102 | update_stmnt : "update"i table_name "set"i column_name "=" literal where_clause? 103 | 104 | truncate_stmnt : "truncate"i table_name 105 | 106 | // datatype values 107 | TRUE : "true"i 108 | FALSE : "false"i 109 | 110 | // func names are globally defined, i.e. not a multipart scoped name 111 | func_name : IDENTIFIER 112 | column_name : SCOPED_IDENTIFIER 113 | table_name : SCOPED_IDENTIFIER 114 | table_alias : IDENTIFIER 115 | 116 | // keywords 117 | INTEGER : "integer"i 118 | TEXT : "text"i 119 | BOOL : "bool"i 120 | NULL : "null"i 121 | // floating point type 122 | REAL : "real"i 123 | 124 | // operators 125 | STAR : "*" 126 | LEFT_PAREN : "(" 127 | RIGHT_PAREN : ")" 128 | LEFT_BRACKET : "[" 129 | RIGHT_BRACKET : "]" 130 | DOT : "." 131 | EQUAL : "=" 132 | LESS : "<" 133 | GREATER : ">" 134 | COMMA : "," 135 | MINUS : "-" 136 | PLUS : "+" 137 | SLASH : "/" 138 | BANG : "!" 
139 | 140 | // 2-char ops 141 | LESS_EQUAL : "<=" 142 | GREATER_EQUAL : ">=" 143 | NOT_EQUAL : "<>" | "!=" 144 | 145 | // todo: remove 146 | SEMICOLON : ";" 147 | 148 | IDENTIFIER : ("_" | ("a".."z") | ("A".."Z"))* ("_" | ("a".."z") | ("A".."Z") | ("0".."9"))+ 149 | SCOPED_IDENTIFIER : (IDENTIFIER ".")* IDENTIFIER 150 | 151 | // single quoted string 152 | // NOTE: this doesn't have any support for escaping 153 | SINGLE_QUOTED_STRING : /'[^']*'/ 154 | STRING: SINGLE_QUOTED_STRING | DOUBLE_QUOTED_STRING 155 | 156 | // ref: https://github.com/lark-parser/lark/blob/master/lark/grammars/common.lark 157 | %import common.ESCAPED_STRING -> DOUBLE_QUOTED_STRING 158 | %import common.SIGNED_INT -> INTEGER_NUMBER 159 | // floating point number 160 | %import common.SIGNED_NUMBER -> REAL_NUMBER 161 | %import common.WS 162 | %ignore WS 163 | """ 164 | -------------------------------------------------------------------------------- /learndb/lang_parser/sqlhandler.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | import logging 3 | 4 | from lark import Lark 5 | from lark.exceptions import UnexpectedInput # root of all lark exceptions 6 | 7 | from .symbols import ToAst 8 | from .grammar import GRAMMAR 9 | 10 | 11 | logger = logging.getLogger(__name__) 12 | 13 | 14 | class SqlFrontEnd: 15 | """ 16 | Parser for learndb lang, based on lark definition 17 | """ 18 | 19 | def __init__(self, raise_exception=False, debug_mode=True): 20 | self.parser = None 21 | # parse tree generated by Lark 22 | self.parse_tree = None 23 | # abstract syntax tree, output of parse tree being transformed 24 | self.tree = None 25 | self.exc = None # exception 26 | self.is_succ = False 27 | self.raise_exception = raise_exception 28 | self._init() 29 | self.debug_mode = debug_mode 30 | 31 | def _init(self): 32 | self.parser = Lark(GRAMMAR, parser="earley", start="program", debug=True) 33 | 34 | def error_summary(self): 35 | if self.exc is not None: 36 | return str(self.exc) 37 | 38 | def is_success(self): 39 | """ 40 | whether parse operation is success 41 | # TODO: this and other methods should raise if no parse 42 | :return: 43 | """ 44 | return self.is_succ 45 | 46 | def get_parsed(self): 47 | return self.tree 48 | 49 | def parse(self, text: str): 50 | """ 51 | 52 | :param text: 53 | :return: 54 | """ 55 | # parse tree 56 | try: 57 | self.parse_tree = self.parser.parse(text) 58 | transformer = ToAst() 59 | self.tree = transformer.transform(self.parse_tree) 60 | self.is_succ = True 61 | self.exc = None 62 | except UnexpectedInput as e: 63 | self.exc = e 64 | self.parse_tree = None 65 | self.tree = None 66 | self.is_succ = False 67 | if self.raise_exception: 68 | raise 69 | 70 | def debug(self): 71 | """ 72 | print some debug info on recent parse 73 | """ 74 | logger.info("Outputting parse tree (untransformed)...") 75 | print(self.parse_tree.pretty()) 76 | logger.info("Outputting AST...") 77 | print(self.tree) 78 | -------------------------------------------------------------------------------- /learndb/lang_parser/utils.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | 4 | class TokenizeError(Exception): 5 | pass 6 | 7 | 8 | class ParseError(Exception): 9 | pass 10 | 11 | 12 | def camel_to_snake(name: str) -> str: 13 | """ 14 | change casing abcdXyz -> abcd_xyz 15 | """ 16 | return re.sub(r"(? 
str: 20 | """ 21 | convert case 22 | HelloWorld -> hello_world 23 | :return: 24 | """ 25 | snake_name = [] 26 | for i, ch in enumerate(name): 27 | if i == 0: 28 | snake_name.append(ch.lower()) 29 | elif ch.isupper(): 30 | snake_name.append("_" + ch.lower()) 31 | else: 32 | snake_name.append(ch) 33 | 34 | return "".join(snake_name) 35 | -------------------------------------------------------------------------------- /learndb/lang_parser/visitor.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | import logging 3 | from .utils import camel_to_snake 4 | 5 | 6 | class HandlerNotFoundException(Exception): 7 | """ 8 | A specific handler (method) is not found; 9 | defined by me 10 | """ 11 | 12 | pass 13 | 14 | 15 | class Visitor: 16 | """ 17 | Conceptually, Visitor is an interface/abstract class, 18 | where different concrete Visitors, e.g. AstPrinter can handle 19 | different tasks, e.g. printing tree, evaluating tree. 20 | This indirection allows us to add new behaviors for the parser 21 | via a new concrete class; instead of either: 1) modifying 22 | the parser symbol classes (OOF), or 2) adding a new function 23 | for any new behavior (e.g. functional) 24 | 25 | See following for visitor design pattern in python: 26 | https://refactoring.guru/design-patterns/visitor/python/example 27 | """ 28 | 29 | def visit(self, symbol: "Symbol"): # noqa F821 30 | """ 31 | this will determine which specific handler to invoke; dispatch 32 | """ 33 | suffix = camel_to_snake(symbol.__class__.__name__) 34 | # determine the name of the handler method from class of expr 35 | # NB: this requires the class and handler have the 36 | # same name in PascalCase and snake_case, respectively 37 | handler = f"visit_{suffix}" 38 | if hasattr(self, handler): 39 | return getattr(self, handler)(symbol) 40 | else: 41 | logging.warning(f"Visitor does not have {handler}") 42 | raise HandlerNotFoundException( 43 | f"Visitor [{self.__class__.__name__}] does not have {handler}" 44 | ) 45 | -------------------------------------------------------------------------------- /learndb/name_registry.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | from lark import Token 4 | 5 | from .dataexchange import Response 6 | from .lang_parser.symbols import ColumnName 7 | from .record_utils import InvalidNameException 8 | 9 | 10 | class NameRegistry: 11 | """ 12 | This entity is responsible for registering and resolving column name and types 13 | from records and schemas. 14 | TODO: split this into SchemaReader, and RecordReader 15 | """ 16 | 17 | def __init__(self): 18 | # record used to resolve values 19 | self.record = None 20 | # schema to resolve names from 21 | self.schema = None 22 | 23 | def set_record(self, record): 24 | self.record = record 25 | 26 | def set_schema(self, schema): 27 | self.schema = schema 28 | 29 | def is_name(self, operand) -> bool: 30 | """ 31 | Return true if operand is a name, i.e. IDENTIFIER or SCOPED_IDENTIFIER 32 | """ 33 | if isinstance(operand, Token) and ( 34 | operand.type == "IDENTIFIER" or operand.type == "SCOPED_IDENTIFIER" 35 | ): 36 | return True 37 | elif isinstance(operand, ColumnName): 38 | return True 39 | else: 40 | return False 41 | 42 | def resolve_name(self, operand) -> Response: 43 | """ 44 | This is only valid if called on a name, i.e. is_name(operand) == True. 
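Illustrative example (added, not in the original source): if the registered
record maps "cola" -> 5, then resolve_name(ColumnName("cola")) returns
Response(True, body=5); an unknown column name returns Response(False, ...)
with an error message rather than raising.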
45 | Note: This returns Response to distinguish resolve failed, from resolved to None 46 | """ 47 | if isinstance(operand, ColumnName): 48 | try: 49 | val = self.record.get(operand.name) 50 | return Response(True, body=val) 51 | except InvalidNameException as e: 52 | logging.error(f"Attempted lookup on unknown column [{operand.name}]") 53 | logging.error(f"Valid column choices are [{self.record.columns}]") 54 | return Response(False, error_message=e.args[0]) 55 | 56 | # NOTE: this was adapated from vm.check_resolve_name 57 | raise NotImplementedError 58 | 59 | def resolve_column_name_type(self, operand: str) -> Response: 60 | """ 61 | Determine type of column name 62 | """ 63 | if self.schema.has_column(operand): 64 | column = self.schema.get_column_by_name(operand) 65 | return Response(True, body=column.datatype) 66 | return Response(False, error_message=f"Unable to resolve column [{operand}]") 67 | -------------------------------------------------------------------------------- /learndb/pipe.py: -------------------------------------------------------------------------------- 1 | from collections import deque 2 | 3 | 4 | class Pipe: 5 | """ 6 | Used to read results from select expr 7 | """ 8 | 9 | def __init__(self): 10 | self.store = deque() 11 | 12 | def write(self, msg): 13 | self.store.append(msg) 14 | 15 | def has_msgs(self) -> bool: 16 | return len(self.store) > 0 17 | 18 | def read(self): 19 | """ 20 | Read message and remove from the pipe 21 | """ 22 | return self.store.popleft() 23 | 24 | def reset(self): 25 | self.store = deque() 26 | -------------------------------------------------------------------------------- /learndb/schema.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | """ 4 | This contain structures to generate and manipulate 5 | logical schema- (column name, column type). 6 | 7 | The physical representation (number of bytes, 8 | length of encoding) of the schema is contained in serde.py. 9 | 10 | While, records- i.e. data-containing objects with the structure 11 | specified by the schema-, and related utilities are contained in record_utils.py 12 | """ 13 | 14 | from copy import copy 15 | from typing import List, Optional, Union 16 | 17 | from .datatypes import DataType, Integer, Text, Blob, Real 18 | from .dataexchange import Response 19 | from .lang_parser.symbols import TableName, SymbolicDataType, ColumnName 20 | 21 | 22 | class Column: 23 | """ 24 | Represents a column in a schema 25 | """ 26 | 27 | def __init__( 28 | self, 29 | name: str, 30 | datatype, 31 | is_primary_key: bool = False, 32 | is_nullable: bool = True, 33 | ): 34 | self.name = name.lower() 35 | self.datatype = datatype 36 | self.is_primary_key = is_primary_key 37 | self.is_nullable = is_nullable 38 | 39 | def __str__(self): 40 | return f"Column[{self.name}, {self.datatype}, is_primary: {self.is_primary_key}, is_nullable: {self.is_nullable}]" 41 | 42 | def __repr__(self): 43 | return self.__str__() 44 | 45 | 46 | class AbstractSchema: 47 | """ 48 | Defines interface for all schema types. 49 | 50 | NOTE: this doesn't enforce that implementation classes implement all interface methods. 
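(Added note, not in the original source: the concrete subclasses below -
SimpleSchema, ScopedSchema, and GroupedSchema - each override `columns`,
`get_column_by_name`, and `has_column`; a missing override only fails at call
time with NotImplementedError. A minimal sketch of the abc-based alternative
referred to in the TODO below, with a hypothetical class name, might look like:

    from abc import ABC, abstractmethod

    class AbstractSchemaSketch(ABC):
        @property
        @abstractmethod
        def columns(self): ...

        @abstractmethod
        def get_column_by_name(self, name: str): ...

        @abstractmethod
        def has_column(self, name: str) -> bool: ...

A subclass missing any of these would then fail at instantiation time instead.)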
51 | TODO: Consider formalizing the interface using abc.ABCMeta; see: https://realpython.com/python-interface/ 52 | """ 53 | 54 | @property 55 | def columns(self): 56 | raise NotImplementedError 57 | 58 | def get_column_by_name(self, name: str) -> Column: 59 | raise NotImplementedError 60 | 61 | def has_column(self, name) -> bool: 62 | raise NotImplementedError 63 | 64 | 65 | class SimpleSchema(AbstractSchema): 66 | """ 67 | Represents a schema. This includes 68 | logical aspects (name) and physical aspects 69 | (number of bytes of storage, fixed vs. variable length encoding) 70 | 71 | Note: a schema must be valid. If the schema is invalid, this 72 | should be raised prior to creating. This is particularly important, 73 | since schemas will correspond: 1) to a real data sources, 74 | 2) computed schema for output resultset. For (1) we would have 75 | constraints like primary key; but for (2) we would not; and hence 76 | these constraints should be external to the schema definition 77 | 78 | NOTE: once constructed a schema should be treated as read-only 79 | """ 80 | 81 | def __init__(self, name: str = None, columns: List[Column] = None): 82 | # name of object/entity defined 83 | self.name = name 84 | # list of column objects ordered by definition order 85 | self.cols = columns 86 | 87 | @property 88 | def columns(self): 89 | return self.cols 90 | 91 | def __str__(self): 92 | body = " ".join([col.name for col in self.cols]) 93 | return f"Schema({str(self.name)}, {str(body)})" 94 | 95 | def __repr__(self): 96 | return str(self) 97 | 98 | def get_primary_key_column(self) -> Optional[str]: 99 | """ 100 | return column name of primary key column 101 | :return: 102 | """ 103 | for column in self.columns: 104 | if column.is_primary_key: 105 | return column.name 106 | return None 107 | 108 | def get_column_by_name(self, name) -> Optional[Column]: 109 | name = name.lower() 110 | for column in self.columns: 111 | if column.name.lower() == name: 112 | return column 113 | return None 114 | 115 | def has_column(self, name: str) -> bool: 116 | """ 117 | Whether schema has column with given name 118 | """ 119 | return self.get_column_by_name(name) is not None 120 | 121 | 122 | class ScopedSchema(AbstractSchema): 123 | """ 124 | Represents a scoped (by table_alias) collection of schema 125 | """ 126 | 127 | def __init__(self, schemas: dict): 128 | self.schemas = schemas # table_name -> Schema 129 | 130 | def get_table_names(self): 131 | return self.schemas.keys() 132 | 133 | @classmethod 134 | def from_single_schema(cls, schema: SimpleSchema, alias: str): 135 | return cls({alias: schema}) 136 | 137 | @classmethod 138 | def from_schemas( 139 | cls, 140 | left_schema: Union[SimpleSchema, ScopedSchema], 141 | right_schema: SimpleSchema, 142 | left_alias: Optional[str], 143 | right_alias: str, 144 | ): 145 | if isinstance(left_schema, SimpleSchema): 146 | assert left_alias is not None 147 | return cls({left_alias: left_schema, right_alias: right_schema}) 148 | else: 149 | assert isinstance(left_schema, ScopedSchema) 150 | schemas = left_schema.schemas.copy() 151 | schemas[right_alias] = right_schema 152 | return cls(schemas) 153 | 154 | @property 155 | def columns(self): 156 | return [ 157 | col 158 | for table_alias, schema in self.schemas.items() 159 | for col in schema.columns 160 | ] 161 | 162 | def has_column(self, name: str) -> bool: 163 | column = self.get_column_by_name(name) 164 | return column is not None 165 | 166 | def get_column_by_name(self, name) -> Optional[Column]: 167 | name_parts = 
name.split(".") 168 | assert len(name_parts) == 2 169 | table_alias, column_name = name_parts 170 | table_schema = self.schemas[table_alias] 171 | for column in table_schema.columns: 172 | if column.name.lower() == column_name: 173 | return column 174 | return None 175 | 176 | 177 | class GroupedSchema(AbstractSchema): 178 | """ 179 | Represents a grouped multi or simple schema 180 | """ 181 | 182 | def __init__( 183 | self, 184 | schema: Union[SimpleSchema, ScopedSchema], 185 | group_by_columns: List[ColumnName], 186 | ): 187 | # all columns, i.e. group-by and non- group-by columns 188 | self.schema = schema 189 | # list of group-by columns, sorted by grouping order 190 | self.group_by_columns = group_by_columns 191 | 192 | @property 193 | def columns(self) -> List[Column]: 194 | if isinstance(self.schema, SimpleSchema): 195 | return self.schema.columns 196 | else: 197 | assert isinstance(self.schema, ScopedSchema) 198 | # the children schema store columns without the alias 199 | # we will create a new (flat) column list with 200 | # alias prepended to column name 201 | # as this is needed to ensure column existence checks work as expected 202 | columns = [] 203 | for alias, schema in self.schema.schemas.items(): 204 | for column in schema.columns: 205 | column_copy = copy(column) 206 | column_copy.name = f"{alias}.{column.name}" 207 | columns.append(column_copy) 208 | return columns 209 | 210 | def get_column_by_name(self, name) -> Optional[Column]: 211 | name = name.lower() 212 | for column in self.columns: 213 | if column.name.lower() == name: 214 | return column 215 | return None 216 | 217 | def has_column(self, name) -> bool: 218 | # todo: consider caching this 219 | column = self.get_column_by_name(name) 220 | return column is not None 221 | 222 | def is_non_grouping_column(self, name: str) -> bool: 223 | """Return True if `column_name` is a non-grouping column""" 224 | return self.has_column(name) and not self.is_grouping_column(name) 225 | 226 | def is_grouping_column(self, name: str) -> bool: 227 | """Return True if `column_name` is a grouping column""" 228 | name = name.lower() 229 | for column in self.group_by_columns: 230 | if name == column.name.lower(): 231 | return True 232 | return False 233 | 234 | 235 | NonGroupedSchema = Union[SimpleSchema, ScopedSchema] 236 | 237 | 238 | class CatalogSchema(SimpleSchema): 239 | """ 240 | Hardcoded schema object for the catalog table. 241 | 242 | This corresponds to the following table definition: 243 | create table catalog ( 244 | type text, 245 | name text, 246 | tbl_name text, 247 | rootpage integer, 248 | sql text 249 | ) 250 | 251 | NOTE: This could be bootstrapped by parsing the above schema 252 | definition text- as all other schemas will be. But this 253 | will be easier. Yet, even doing that will require special 254 | handling of the catalog schema. Further, having a hardcoded 255 | schema will provide an easy validation on the parser. 256 | """ 257 | 258 | def __init__(self): 259 | super().__init__( 260 | "catalog", 261 | [ 262 | Column("pkey", Integer, is_primary_key=True), 263 | Column("name", Text), 264 | Column("root_pagenum", Integer), 265 | Column("sql_text", Text), 266 | ], 267 | ) 268 | 269 | 270 | def schema_to_ddl(schema: SimpleSchema) -> str: 271 | """ 272 | convert a schema to canonical ddl 273 | 274 | parser rule: 275 | create_stmnt -> "create" "table" table_name "(" column_def_list ")" 276 | 277 | e.g. 
ddl 278 | create table catalog ( 279 | pkey int primary key 280 | type text, 281 | name text, 282 | tbl_name text, 283 | rootpage integer, 284 | sql text 285 | ) 286 | 287 | :return: 288 | """ 289 | column_defs = [] 290 | for column in schema.columns: 291 | if column.is_primary_key: 292 | # key is the first column in ddl 293 | # primary key implies not null 294 | column_defs.insert( 295 | 0, f"{column.name} {column.datatype.typename} PRIMARY KEY" 296 | ) 297 | else: 298 | null_cond = "" if column.is_nullable else "NOT NULL" 299 | column_defs.append(f"{column.name} {column.datatype.typename} {null_cond}") 300 | column_def_body = ", ".join(column_defs) 301 | assert isinstance(schema.name, TableName) 302 | return f"CREATE TABLE {schema.name.table_name} ( {column_def_body} )" 303 | 304 | 305 | def validate_schema(schema: SimpleSchema) -> Response: 306 | """ 307 | Ensure schema is valid. 308 | A valid schema must have: 309 | - integer primary key (this can be handled automatically later) 310 | - unique column names 311 | - valid column names 312 | - valid datatypes 313 | 314 | :param schema: 315 | :return: 316 | """ 317 | # validate - single column primary key 318 | if len([col for col in schema.columns if col.is_primary_key]) != 1: 319 | return Response(False, error_message="missing primary key") 320 | 321 | # validate - primary key is integer 322 | pkey = None 323 | for col in schema.columns: 324 | if col.is_primary_key: 325 | pkey = col 326 | break 327 | if pkey.datatype != Integer: 328 | return Response(False, error_message="primary key must be of integer type") 329 | 330 | # validate column names are unique 331 | names = set() 332 | for col in schema.columns: 333 | if col.name in names: 334 | return Response(False, error_message=f"duplicate column name [{col.name}]") 335 | names.add(col.name) 336 | 337 | # validate column types are valid 338 | for col in schema.columns: 339 | if not issubclass(col.datatype, DataType): 340 | return Response(False, error_message=f"invalid datatype for [{col.name}]") 341 | 342 | return Response(True) 343 | 344 | 345 | def token_to_datatype(datatype: DataType) -> Response: 346 | """ 347 | parse datatype token into DataType 348 | :param datatype_token: 349 | :return: 350 | """ 351 | if datatype == SymbolicDataType.Integer: 352 | return Response(True, body=Integer) 353 | elif datatype == SymbolicDataType.Text: 354 | return Response(True, body=Text) 355 | elif datatype == SymbolicDataType.Blob: 356 | return Response(True, body=Blob) 357 | elif datatype == SymbolicDataType.Real: 358 | return Response(True, body=Real) 359 | return Response(False, error_message=f"Unrecognized datatype: [{datatype}]") 360 | 361 | 362 | def generate_schema(create_stmnt) -> Response: 363 | """ 364 | Generate schema from a create stmnt. There is a very thin 365 | layer of translation between the stmnt and the schema object. 366 | But I want to distinguish the (create) stmnt from the schema. 367 | Note if the operation is successful, a valid schema was read. 
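Illustrative example (added, not in the original source): a statement like
    create table items ( item_id integer primary key, name text )
yields Response(True, body=SimpleSchema) whose columns are
Column("item_id", Integer, is_primary_key=True) and Column("name", Text);
the table and column names here are placeholders.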
368 | :param create_stmnt: 369 | :return: 370 | """ 371 | 372 | columns = [] 373 | for coldef in create_stmnt.columns: 374 | resp = token_to_datatype(coldef.datatype) 375 | if not resp.success: 376 | return Response( 377 | False, error_message=f"Unable to parse datatype [{coldef.datatype}]" 378 | ) 379 | datatype = resp.body 380 | # NOTE: all column names are stored as lower case 381 | column_name = coldef.column_name.name.lower() 382 | column = Column( 383 | column_name, 384 | datatype, 385 | is_primary_key=coldef.is_primary_key, 386 | is_nullable=coldef.is_nullable, 387 | ) 388 | columns.append(column) 389 | schema = SimpleSchema(name=create_stmnt.table_name, columns=columns) 390 | 391 | # validate schema 392 | resp = validate_schema(schema) 393 | if not resp.success: 394 | return Response( 395 | False, error_message=f"schema validation due to [{resp.error_message}]" 396 | ) 397 | return Response(True, body=schema) 398 | 399 | 400 | def generate_unvalidated_schema(source_name: str, columns: List[Column]) -> Response: 401 | """Generate an unvalidated schema with argument `columns`. 402 | 403 | This is used for output schema, which doesn't have primary key. 404 | NOTE: `unvalidated` means we don't run any validations, e.g. generated 405 | schema must have a primary key. 406 | 407 | TODO: apply any validations that do hold, e.g. column name uniqueness?""" 408 | return Response(True, body=SimpleSchema(name=source_name, columns=columns)) 409 | 410 | 411 | def make_grouped_schema(schema, group_by_columns: List) -> Response: 412 | """ 413 | Generate a grouped schema from a non-grouped schema. How 414 | will this handle both simple, and multi-schema 415 | """ 416 | return Response(True, body=GroupedSchema(schema, group_by_columns)) 417 | -------------------------------------------------------------------------------- /learndb/semantic_analysis.py: -------------------------------------------------------------------------------- 1 | from enum import Enum, auto 2 | from typing import Optional, Type 3 | 4 | from .dataexchange import Response 5 | from .datatypes import DataType 6 | from .functions import resolve_scalar_func_name, resolve_aggregate_func_name 7 | from .lang_parser.symbols import ( 8 | Symbol, 9 | Expr, 10 | OrClause, 11 | AndClause, 12 | BinaryArithmeticOperation, 13 | FuncCall, 14 | ColumnName, 15 | Literal, 16 | ) 17 | from .lang_parser.visitor import Visitor 18 | from .name_registry import NameRegistry 19 | from .vm_utils import datatype_from_symbolic_datatype, EvalMode 20 | 21 | 22 | class SemanticAnalysisError(Exception): 23 | pass 24 | 25 | 26 | class SemanticAnalysisFailure(Enum): 27 | TypeMismatch = auto() 28 | FunctionDoesNotExist = auto() 29 | ColumnDoesNotExist = auto() 30 | # aggregate function called on grouping column 31 | FunctionMismatch = auto() 32 | 33 | 34 | class SemanticAnalyzer(Visitor): 35 | """ 36 | Performs semantic analysis: 37 | - evaluate expr types 38 | - determine if expr is valid, 39 | -- an expr may be invalid due to non-existent function, or column references 40 | -- type incompatible operation 41 | 42 | NOTE: (for now) type checking will be strict, i.e. no auto conversions, 43 | e.g. 2+ 2.0 will fail due to a type mismatch 44 | 45 | """ 46 | 47 | def __init__(self, name_registry: NameRegistry): 48 | self.name_registry = name_registry 49 | self.mode = None 50 | self.failure_type: Optional[SemanticAnalysisFailure] = None 51 | self.error_message = "" 52 | # schema used to check column existence, etc. 
53 | self.schema = None 54 | 55 | def analyze_no_schema(self, expr): 56 | """ 57 | Public method. 58 | Analyze an expr with no schema 59 | """ 60 | self.mode = EvalMode.NoSchema 61 | self.schema = None 62 | return self.analyze(expr) 63 | 64 | def analyze_scalar(self, expr: Symbol, schema): 65 | """ 66 | Public method. 67 | Analyze a scalar schema 68 | """ 69 | self.mode = EvalMode.Scalar 70 | self.schema = schema 71 | return self.analyze(expr) 72 | 73 | def analyze_grouped(self, expr: Symbol, schema): 74 | """ 75 | Public method. 76 | Analyze a grouped schema 77 | """ 78 | self.mode = EvalMode.Grouped 79 | self.schema = schema 80 | return self.analyze(expr) 81 | 82 | def analyze(self, expr: Symbol) -> Response[Type[DataType]]: 83 | """ 84 | Determine type of expr, 85 | Returns ResponseType[DataType]. 86 | This will terminate type analysis, at the first failure 87 | """ 88 | try: 89 | return_value = self.evaluate(expr) 90 | return Response(True, body=return_value) 91 | except SemanticAnalysisError: 92 | return Response( 93 | False, status=self.failure_type, error_message=self.error_message 94 | ) 95 | 96 | def evaluate(self, expr: Symbol) -> Type[DataType]: 97 | return_value = expr.accept(self) 98 | return return_value 99 | 100 | def visit_expr(self, expr: Expr): 101 | return self.evaluate(expr.expr) 102 | 103 | def visit_or_clause(self, or_clause: OrClause): 104 | or_value = None 105 | value_unset = True 106 | for and_clause in or_clause.and_clauses: 107 | value = self.evaluate(and_clause) 108 | if value_unset: 109 | or_value = value 110 | value_unset = False 111 | else: 112 | # NOTE: and clause can only be applied over booleans (true, false, null), else error 113 | raise NotImplementedError 114 | return or_value 115 | 116 | def visit_and_clause(self, and_clause: AndClause): 117 | """ 118 | NOTE: This handles both where the and_clause is evals to a bool, and 119 | to an value 120 | """ 121 | and_value = None 122 | # ensure value is set before we begin and'ing 123 | value_unset = True 124 | for predicate in and_clause.predicates: 125 | pred_val = self.evaluate(predicate) 126 | if value_unset: 127 | # set first value as is 128 | and_value = pred_val 129 | value_unset = False 130 | else: 131 | # NOTE: and clause can only be applied over booleans (true, false, null), else error 132 | raise NotImplementedError 133 | 134 | return and_value 135 | 136 | def visit_binary_arithmetic_operation(self, operation: BinaryArithmeticOperation): 137 | # evaluate operators, then check type 138 | op1_type = self.evaluate(operation.operand1) 139 | op2_type = self.evaluate(operation.operand2) 140 | # for now, we will only support strict type checking, i.e. 141 | if op1_type != op2_type: 142 | self.error_message = ( 143 | f"Type mismatch; {operation.operand1} is of type {op1_type}; " 144 | f"{operation.operand2} is of type {op2_type}" 145 | ) 146 | raise SemanticAnalysisError() 147 | return op1_type 148 | 149 | def visit_func_call(self, func_call: FuncCall): 150 | """ 151 | Validate: 152 | 1) function exists, 153 | 2) for scalar case, function is scalar 154 | 3) for grouped case, this depends on the column 155 | """ 156 | func_name = func_call.name 157 | # 1. handle scalar case 158 | if self.mode == EvalMode.Scalar: 159 | # 1.1. check if function exists 160 | # 2.1. 
function must be a scalar function 161 | resp = resolve_scalar_func_name(func_name) 162 | if not resp.success: 163 | # function not found 164 | self.error_message = resp.error_message 165 | raise SemanticAnalysisError() 166 | 167 | func = resp.body 168 | return func.return_type 169 | 170 | # 2. handle no schema case 171 | elif self.mode == EvalMode.NoSchema: 172 | # NOTE: this will also be a scalar function 173 | resp = resolve_scalar_func_name(func_name) 174 | if not resp.success: 175 | # function not found 176 | self.error_message = resp.error_message 177 | raise SemanticAnalysisError() 178 | 179 | func = resp.body 180 | return func.return_type 181 | 182 | # 3. handle grouped case 183 | else: 184 | assert self.mode == EvalMode.Grouped 185 | # case 1: if function is applied to a grouping column, function must be a scalar function 186 | # case 2: if function is applied to a non-grouping column, function must be an aggregate function 187 | 188 | # first attempt to resolve scalar 189 | resp = resolve_scalar_func_name(func_name) 190 | if resp.success: 191 | # enforce any column references are grouping columns 192 | # arguments could be an arbitrary expr over grouping columns 193 | columns = func_call.find_descendents(ColumnName) 194 | for column in columns: 195 | if not self.schema.is_grouping_column(column.name): 196 | self.failure_type = SemanticAnalysisFailure.FunctionMismatch 197 | self.error_message = ( 198 | "Scalar function in grouped select expects grouping columns" 199 | ) 200 | raise SemanticAnalysisError() 201 | 202 | func = resp.body 203 | return func.return_type 204 | 205 | resp = resolve_aggregate_func_name(func_name) 206 | if resp.success: 207 | # aggregate functions 208 | # currently, we only support functions that take a single column reference to a non-grouping column 209 | # i.e. min, max, count, etc. 
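# (added commentary, not in the original source) e.g. with "group by col_a",
# an aggregate like count(col_b) passes the checks below, while
# count(col_a, col_b), count(1 + col_b), and count(col_a) are rejected by the
# arity check, the column-reference check, and the non-grouping-column check,
# respectively.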
210 | if len(func_call.args) != 1: 211 | self.failure_type = SemanticAnalysisFailure.FunctionMismatch 212 | self.error_message = ( 213 | f"Aggregate function expects one and only one column reference; " 214 | f"received {len(func_call.args)}" 215 | ) 216 | raise SemanticAnalysisError() 217 | 218 | arg_expr = func_call.args[0] 219 | column_name = arg_expr.expr 220 | 221 | if not isinstance(column_name, ColumnName): 222 | self.failure_type = SemanticAnalysisFailure.FunctionMismatch 223 | self.error_message = ( 224 | "Aggregate function expects a single column reference" 225 | ) 226 | raise SemanticAnalysisError() 227 | 228 | if not self.schema.has_column(column_name.name): 229 | self.failure_type = SemanticAnalysisFailure.ColumnDoesNotExist 230 | self.error_message = f"column does not exist [{column_name.name}]" 231 | raise SemanticAnalysisError() 232 | 233 | if not self.schema.is_non_grouping_column(column_name.name): 234 | # ensure column_arg is a non-grouping column 235 | self.failure_type = SemanticAnalysisFailure.FunctionMismatch 236 | self.error_message = ( 237 | f"Expected non-grouping column as arg to aggregate function; " 238 | f"received column [{column_name.name}] for function [{func_name}] " 239 | ) 240 | raise SemanticAnalysisError() 241 | 242 | func = resp.body 243 | return func.return_type 244 | 245 | # function does not exist 246 | self.failure_type = SemanticAnalysisFailure.FunctionDoesNotExist 247 | self.error_message = f"Function {func_name} not found" 248 | raise SemanticAnalysisError() 249 | 250 | def visit_column_name(self, column_name: ColumnName) -> Type[DataType]: 251 | if self.mode == EvalMode.NoSchema: 252 | # no column resolution in NoSchema mode 253 | self.error_message = ( 254 | f"Unexpected column name [{column_name}] in query without source" 255 | ) 256 | self.failure_type = SemanticAnalysisFailure.ColumnDoesNotExist 257 | raise SemanticAnalysisError() 258 | 259 | resp = self.name_registry.resolve_column_name_type(column_name.name) 260 | if resp.success: 261 | return resp.body 262 | # name registry was unable to resolve name 263 | self.error_message = f"Name registry failed to resolve column [{column_name}] due to: [{resp.error_message}]" 264 | self.failure_type = SemanticAnalysisFailure.ColumnDoesNotExist 265 | raise SemanticAnalysisError() 266 | 267 | def visit_literal(self, literal: Literal) -> Type[DataType]: 268 | return datatype_from_symbolic_datatype(literal.type) 269 | -------------------------------------------------------------------------------- /learndb/serde.py: -------------------------------------------------------------------------------- 1 | from enum import Enum 2 | from typing import Type 3 | 4 | from .constants import CELL_KEY_SIZE_SIZE, CELL_DATA_SIZE_SIZE, INTEGER_SIZE 5 | 6 | from .datatypes import DataType, Null, Integer, Text, Blob, Real 7 | from .dataexchange import Response 8 | from .schema import SimpleSchema 9 | from .record_utils import SimpleRecord 10 | 11 | 12 | class InvalidCell(Exception): 13 | """ 14 | A invalid formatted cell 15 | """ 16 | 17 | 18 | class SerialType(Enum): 19 | """ 20 | serial-type of encoded data 21 | """ 22 | 23 | Null = 0 24 | Integer = 1 25 | Real = 2 26 | Text = 3 27 | Blob = 4 28 | 29 | 30 | def serialtype_to_datatype(serial_type: SerialType) -> Type[DataType]: 31 | """ 32 | Convert serial type enum to datatype 33 | :param serial_type: 34 | :return: 35 | """ 36 | if serial_type == SerialType.Null: 37 | return Null 38 | elif serial_type == SerialType.Integer: 39 | return Integer 40 | elif serial_type == 
SerialType.Real: 41 | return Real 42 | elif serial_type == SerialType.Text: 43 | return Text 44 | else: 45 | assert serial_type == SerialType.Blob 46 | return Blob 47 | 48 | 49 | def datatype_to_serialtype(datatype: DataType) -> SerialType: 50 | """ 51 | Convert datatype to serialtype 52 | :param datatype: 53 | :return: 54 | """ 55 | if datatype == Null: 56 | return SerialType.Null 57 | elif datatype == Integer: 58 | return SerialType.Integer 59 | elif datatype == Real: 60 | return SerialType.Real 61 | elif datatype == Text: 62 | return SerialType.Text 63 | else: 64 | assert datatype == Blob 65 | return SerialType.Blob 66 | 67 | 68 | def serialize_record(record: SimpleRecord) -> Response: 69 | """ 70 | Serialize an entire record and return the bytes corresponding 71 | to a cell. 72 | 73 | For now, serialize each value and concatenate 74 | the resulting bytes. If this is not performant, 75 | consider using struct.pack 76 | 77 | See docs/file-format.txt for complete details; the following are 78 | the key details of a node: 79 | 80 | - (low address) header, cell pointer array, unallocated space, cells (high address) 81 | - cell ptrs are sorted by key (2 bytes); contain page offset to cell 82 | - cell -> [key_size(4B), data_size(4B), payload (key, data)] 83 | -- data can be divided into header and body 84 | -- data header -> [size of header, serial types (size of variable length value)? 85 | -- data body -> concatenated bytes of serialized values (in definition order) 86 | -- all data must fit in a cell, i.e. no overflow- this limits the max content size to what can fit in a single cell 87 | 88 | serial types: 89 | sqlite for inspiration (https://www.sqlite.org/fileformat2.html#record_format) 90 | 91 | serial-type byte-length datatype 92 | 0 0 Null 93 | 1 4 Integer 94 | 2 4 Float 95 | 3 var Text 96 | 4 var Blob 97 | 98 | Types with a fixed-value, e.g. null will not be encoded in the data payload. 99 | 100 | 101 | """ 102 | # encode columns in definition order 103 | key = b"" 104 | data_header = b"" 105 | data = b"" 106 | # 1. encode chunks of payload 107 | for column in record.schema.columns: 108 | # get column value 109 | value = record.values.get(column.name) 110 | # handle key 111 | if column.is_primary_key: 112 | # ensure primary key is an int 113 | # this validation should be done at schema generation time 114 | # if value is None: 115 | # breakpoint() 116 | assert column.datatype == Integer, "Primary key must be an integer" 117 | assert value is not None, "Primary key must exist" 118 | key = column.datatype.serialize(value) 119 | # handle non-key field 120 | else: 121 | # check if a value is required 122 | if value is None and column.is_nullable is False: 123 | return Response( 124 | False, 125 | error_message=f"Required column [{column.name}] missing value", 126 | ) 127 | 128 | if value is None: 129 | serial_type = SerialType.Null 130 | serialized_serial_type = Integer.serialize(serial_type.value) 131 | data_header += serialized_serial_type 132 | else: 133 | serial_type = datatype_to_serialtype(column.datatype) 134 | # all columns except null can be serialized; 135 | # in the future, there may be non-null unserializable types, e.g. 
bool 136 | assert ( 137 | column.datatype.is_serializable 138 | ), f"non-null unserializable column [{column.name}]" 139 | 140 | # serialize header 141 | serialized_serial_type = Integer.serialize(serial_type.value) 142 | data_header += serialized_serial_type 143 | 144 | # serialize data 145 | serialized_value = column.datatype.serialize(value) 146 | data += serialized_value 147 | 148 | # check if datatype is variable length 149 | if not column.datatype.is_fixed_length: 150 | length = Integer.serialize(len(serialized_value)) 151 | # encode length in header 152 | data_header += length 153 | 154 | # data-header is defined like: 155 | # [size of header, serial types (size of variable length value)? ] 156 | # NOTE: the data header, size of header includes self 157 | data_header_len = Integer.serialize(Integer.fixed_length + len(data_header)) 158 | data_header = data_header_len + data_header 159 | 160 | # 2. assemble chunks as per file format spec into a cell 161 | # i.e. cell = [key_size(4B), data_size(4B), key(var), data-header(var), data(var) ] 162 | key_size = Integer.serialize(len(key)) 163 | data_payload = data_header + data 164 | data_size = Integer.serialize(len(data_payload)) 165 | # print(f'In serialize; key-size: {key_size}, data-header-len: {data_header_len}, data_size: {data_size}') 166 | # print(cell) 167 | cell = key_size + data_size + key + data_payload 168 | return Response(True, body=cell) 169 | 170 | 171 | def deserialize_cell(cell: bytes, schema: SimpleSchema) -> Response: 172 | """ 173 | deserialize cell corresponding to schema 174 | :param cell: 175 | :param schema: 176 | :return: Response[Record] 177 | """ 178 | values = {} # colname -> value 179 | # read the columns in the cell 180 | offset = 0 181 | key_size = Integer.deserialize(cell[offset : offset + INTEGER_SIZE]) 182 | # skip past cell size fields 183 | offset += CELL_KEY_SIZE_SIZE 184 | offset += CELL_DATA_SIZE_SIZE 185 | 186 | # read key column 187 | # bytes corresponding to key 188 | key_bytes = cell[offset : offset + key_size] 189 | key = Integer.deserialize(key_bytes) 190 | key_columns = [col.name for col in schema.columns if col.is_primary_key] 191 | 192 | assert len(key_columns) == 1, "More than 1 key column" 193 | key_column_name = key_columns[0] 194 | values[key_column_name] = key 195 | # after this, offset points past the key bytes, i.e. 
to the first 196 | # byte of data payload 197 | offset += len(key_bytes) 198 | 199 | # keep track of which column (relative position) we have read from 200 | col_pos = 0 201 | 202 | # read non-key columns 203 | header_size = Integer.deserialize(cell[offset : offset + INTEGER_SIZE]) 204 | # this is the abs addr value 205 | header_abs_ubound = offset + header_size 206 | 207 | # print(f'In deserialize; key-size: {key_size}, data-header-len: {header_size}, data_size: {data_size}') 208 | # print(cell) 209 | 210 | # process column metadata 211 | # initialize data header ptr 212 | # points to first column metadata 213 | header_offset = offset + INTEGER_SIZE 214 | # first address where data resides 215 | data_offset = offset + header_size 216 | while header_offset < header_abs_ubound: 217 | # read until all column metadata has been run 218 | serial_type_value = Integer.deserialize( 219 | cell[header_offset : header_offset + INTEGER_SIZE] 220 | ) 221 | serial_type = SerialType(serial_type_value) 222 | # resolve datatype 223 | datatype = serialtype_to_datatype(serial_type) 224 | # increment header ptr 225 | header_offset += INTEGER_SIZE 226 | 227 | # check whether column type is variable length 228 | varlen = 0 229 | if not datatype.is_fixed_length: 230 | varlen = Integer.deserialize( 231 | cell[header_offset : header_offset + INTEGER_SIZE] 232 | ) 233 | header_offset += INTEGER_SIZE 234 | 235 | # resolve column name 236 | column = schema.columns[col_pos] 237 | col_pos += 1 238 | if column.is_primary_key: 239 | # we've already handled the key column above; consider next column 240 | column = schema.columns[col_pos] 241 | col_pos += 1 242 | 243 | # read body 244 | if datatype.is_fixed_length and not datatype.is_serializable: 245 | # handle fixed-value type, i.e. 
only null for now, boolean's would be similar 246 | assert datatype == Null 247 | values[column.name] = None 248 | elif datatype.is_fixed_length: 249 | # handle fixed-length type 250 | # increment body by a fixed amount 251 | values[column.name] = datatype.deserialize( 252 | cell[data_offset : data_offset + datatype.fixed_length] 253 | ) 254 | data_offset += datatype.fixed_length 255 | else: 256 | assert datatype.is_fixed_length is False 257 | assert varlen > 0 258 | # handle variable length type 259 | # increment body by a variable amount 260 | data_bstring = cell[data_offset : data_offset + varlen] 261 | values[column.name] = datatype.deserialize(data_bstring) 262 | data_offset += varlen 263 | 264 | # add non-existent columns with null values 265 | for column in schema.columns: 266 | if column.name not in values: 267 | values[column.name] = None 268 | 269 | record = SimpleRecord(values, schema) 270 | return Response(True, body=record) 271 | 272 | 273 | def get_cell_key(cell: bytes) -> int: 274 | """ 275 | 276 | :param cell: 277 | :return: 278 | """ 279 | return get_cell_key_in_page(cell, 0) 280 | 281 | 282 | def get_cell_key_in_page(node: bytes, cell_offset: int) -> int: 283 | """ 284 | get key from cell given page num, cell_offset 285 | 286 | rename to: _in_node 287 | 288 | :param page: 289 | :param cell_offset: 290 | :return: 291 | """ 292 | offset = cell_offset 293 | key_size = Integer.deserialize(node[offset : offset + INTEGER_SIZE]) 294 | offset += CELL_KEY_SIZE_SIZE 295 | # skip over data size field 296 | offset += CELL_DATA_SIZE_SIZE 297 | 298 | # read key column 299 | # bytes corresponding to key 300 | key_bytes = node[offset : offset + key_size] 301 | key = Integer.deserialize(key_bytes) 302 | return key 303 | 304 | 305 | def get_cell_size(node: bytes, cell_offset: int) -> int: 306 | offset = cell_offset 307 | key_size = Integer.deserialize(node[offset : offset + INTEGER_SIZE]) 308 | offset += CELL_KEY_SIZE_SIZE 309 | # skip over data size field 310 | data_size = Integer.deserialize(node[offset : offset + INTEGER_SIZE]) 311 | offset += CELL_DATA_SIZE_SIZE 312 | return INTEGER_SIZE + INTEGER_SIZE + key_size + data_size 313 | -------------------------------------------------------------------------------- /learndb/statemanager.py: -------------------------------------------------------------------------------- 1 | """ 2 | class representing a logical-database 3 | support API to read/write data via Table 4 | and creating tables etc. 
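(Added sketch, not part of the original module docstring) A rough usage flow,
with placeholder names:

    sm = StateManager("mydb.db")      # opens/creates the db file via Pager
    root_page = sm.allocate_tree()    # unused page to root a new table's btree
    sm.register_tree("foo", Tree(sm.get_pager(), root_page))
    sm.register_schema("foo", foo_schema)   # foo_schema: a SimpleSchema
    # ... reads/writes then go through the registered tree and schema ...
    sm.close()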
5 | """ 6 | import random 7 | import string 8 | from collections import UserList, UserDict 9 | from typing import Optional, List, Union, Tuple 10 | 11 | from .btree import Tree 12 | from .constants import CATALOG_ROOT_PAGE_NUM 13 | from .dataexchange import Response 14 | from .pager import Pager 15 | from .record_utils import GroupedRecord 16 | from .schema import ( 17 | SimpleSchema, 18 | ScopedSchema, 19 | CatalogSchema, 20 | GroupedSchema, 21 | NonGroupedSchema, 22 | ) 23 | 24 | 25 | class RecordSet(UserList): 26 | """ 27 | Maintains a list of records 28 | """ 29 | 30 | pass 31 | 32 | 33 | class GroupedRecordSet(UserDict): 34 | """ 35 | Maintains a dictionary of lists of records, where the dict is 36 | indexed by the group key 37 | """ 38 | 39 | def __getitem__(self, key): 40 | if key not in self.data: 41 | self.data[key] = [] 42 | return self.data[key] 43 | 44 | def __setitem__(self, key, value): 45 | if key not in self.data: 46 | self.data[key] = [] 47 | self.data[key].append(value) 48 | 49 | 50 | class Scope: 51 | """ 52 | A scope is a logical environment, within which names and objects are contained/defined. 53 | A passive entity that exposes add, remove, has_ {recordset, groupedrecordset, 54 | """ 55 | 56 | def __init__(self): 57 | self.aliased_source = {} 58 | # NOTE: previously this was a list, but now since TableName is alias 59 | self.unaliased_source = set() 60 | # recordset name -> recordset 61 | self.record_sets = {} 62 | self.group_rsets = {} 63 | # recordset name -> schema 64 | self.rsets_schemas = {} 65 | self.group_rsets_schemas = {} 66 | 67 | def register_aliased_source(self, source: str, alias: str): 68 | raise NotImplementedError 69 | 70 | def register_unaliased_source(self, source: str): 71 | raise NotImplementedError 72 | 73 | def get_recordset(self, name: str) -> Optional[RecordSet]: 74 | return self.record_sets.get(name) 75 | 76 | def add_recordset( 77 | self, name: str, schema: NonGroupedSchema, recordset: RecordSet 78 | ) -> None: 79 | """ 80 | Upsert a new recordset with `name` 81 | """ 82 | self.rsets_schemas[name] = schema 83 | self.record_sets[name] = recordset 84 | 85 | def drop_recordset(self, name: str): 86 | del self.record_sets[name] 87 | 88 | def get_recordset_schema(self, name: str) -> Optional[NonGroupedSchema]: 89 | return self.rsets_schemas.get(name) 90 | 91 | def add_grouped_recordset( 92 | self, name, schema: GroupedSchema, recordset: GroupedRecordSet 93 | ) -> None: 94 | self.group_rsets_schemas[name] = schema 95 | self.group_rsets[name] = recordset 96 | 97 | def get_grouped_recordset(self, name: str) -> Optional[GroupedRecordSet]: 98 | return self.group_rsets.get(name) 99 | 100 | def get_grouped_recordset_schema(self, name: str) -> Optional[GroupedSchema]: 101 | return self.group_rsets_schemas.get(name) 102 | 103 | def drop_grouped_recordset(self, name: str): 104 | raise NotImplementedError 105 | 106 | def cleanup(self): 107 | """ 108 | TODO: recycle any objects 109 | """ 110 | 111 | 112 | class StateManager: 113 | """ 114 | This entity is responsible for management of all state of the database 115 | (contained a single file). 116 | 117 | State can be broadly divided into: 1) persisted tables (btree and schema), 118 | that live in an implicit global scope. 119 | 2) all objects that live and die with a session, e.g. local recordsets, 120 | scopes, and materialized sources. 121 | 122 | There is a third category- objects like functions that logically/from the user's 123 | perspective live in the same assumed global scope as table names. 
But these, 124 | are managed separately. 125 | 126 | This class is responsible for creating Tree and Table objects. 127 | This is responsible for creating/managing the catalog (a special table). 128 | 129 | The class is intimately tied to catalog definition, i.e. has magic 130 | constants for manipulating catalog. 131 | """ 132 | 133 | def __init__(self, filename: str): 134 | # database file 135 | self.db_filename = filename 136 | # initialize pager; this will create the file 137 | # file create functionality can be moved elsewhere if better suited 138 | self.pager = Pager.pager_open(self.db_filename) 139 | # the catalog root pagenum is hardcoded 140 | self.catalog_root_page_num = CATALOG_ROOT_PAGE_NUM 141 | # catalog schema 142 | self.catalog_schema = CatalogSchema() 143 | # catalog tree 144 | self.catalog_tree = Tree(self.pager, self.catalog_root_page_num) 145 | # mapping from table_name to schema object 146 | self.schemas = {} 147 | self.trees = {} 148 | # scope stack 149 | self.scopes: List[Scope] = [] 150 | 151 | def close(self): 152 | """ 153 | this calls the pager `close` 154 | """ 155 | self.pager.close() 156 | 157 | def get_pager(self): 158 | return self.pager 159 | 160 | def allocate_tree(self): 161 | """ 162 | Allocate tree, by requesting an unused from pager, i.e. 163 | as a root page for new tree. 164 | :return: 165 | """ 166 | return self.pager.get_unused_page_num() 167 | 168 | def table_exists(self, table_name: str) -> bool: 169 | return table_name in self.trees 170 | 171 | def register_tree(self, table_name: str, tree: Tree): 172 | self.trees[table_name] = tree 173 | 174 | def register_schema(self, table_name: str, schema: SimpleSchema): 175 | self.schemas[table_name] = schema 176 | 177 | def unregister_table(self, table_name: str): 178 | """ 179 | Remove table_name entry from both trees and schemas cache 180 | """ 181 | del self.trees[table_name] 182 | del self.schemas[table_name] 183 | 184 | def get_catalog_schema(self): 185 | return self.catalog_schema 186 | 187 | def has_schema(self, table_name: str): 188 | return table_name in self.schemas 189 | 190 | def get_schema(self, table_name: str): 191 | return self.schemas[table_name] 192 | 193 | def get_catalog_tree(self): 194 | return self.catalog_tree 195 | 196 | def get_tree(self, table_name): 197 | return self.trees[table_name] 198 | 199 | def print_tree(self, table_name: str): 200 | """ 201 | This method prints the tree 202 | Putting this here, since the datastore encapsulates tree 203 | :return: 204 | """ 205 | self.get_tree(table_name).print_tree() 206 | 207 | def validate_tree(self, table_name: str): 208 | self.get_tree(table_name).validate() 209 | 210 | # section: scope management 211 | 212 | def begin_scope( 213 | self, 214 | ): 215 | self.scopes.append(Scope()) 216 | 217 | def end_scope(self): 218 | scope = self.scopes.pop() 219 | scope.cleanup() 220 | 221 | # recordset management 222 | 223 | @staticmethod 224 | def gen_randkey(size=10, prefix=""): 225 | return prefix + "".join( 226 | random.choice(string.ascii_letters) for i in range(size) 227 | ) 228 | 229 | def unique_recordset_name(self) -> str: 230 | """ 231 | Generate a recordset name unique across all scopes 232 | """ 233 | is_unique = False 234 | name = None 235 | while not is_unique: 236 | name = self.gen_randkey(prefix="r") 237 | # name must be unique across all scopes 238 | for scope in self.scopes: 239 | if scope.get_recordset(name) is not None: 240 | break 241 | else: 242 | is_unique = True 243 | return name 244 | 245 | def 
unique_grouped_recordset_name(self) -> str: 246 | """ 247 | Generate a recordset name unique across all scopes 248 | """ 249 | is_unique = False 250 | name = None 251 | while not is_unique: 252 | name = self.gen_randkey(prefix="g") 253 | # name must be unique across all scopes 254 | for scope in self.scopes: 255 | if scope.get_grouped_recordset(name) is not None: 256 | break 257 | else: 258 | is_unique = True 259 | return name 260 | 261 | def init_recordset(self, schema: Union[SimpleSchema, ScopedSchema]) -> Response: 262 | """ 263 | Creates a new recordset with the associated `schema`, and 264 | stores it in the current scope. 265 | Recordset name should be unique across all scopes 266 | """ 267 | name = self.unique_recordset_name() 268 | scope = self.scopes[-1] 269 | scope.add_recordset(name, schema, RecordSet()) 270 | return Response(True, body=name) 271 | 272 | def init_grouped_recordset(self, schema: GroupedSchema): 273 | """ 274 | init a grouped recordset. 275 | NOTE: A grouped record set is internally stored like 276 | {group_key_tuple -> list_of_records} 277 | """ 278 | name = self.unique_grouped_recordset_name() 279 | scope = self.scopes[-1] 280 | scope.add_grouped_recordset(name, schema, GroupedRecordSet()) 281 | return Response(True, body=name) 282 | 283 | def find_recordset_scope(self, name: str) -> Optional[Scope]: 284 | """ 285 | Find and return scope, where scope contains recordset with `name` 286 | """ 287 | for scope in reversed(self.scopes): 288 | rset = scope.get_recordset(name) 289 | if rset is not None: 290 | return scope 291 | 292 | def find_grouped_recordset_scope(self, name: str) -> Optional[Scope]: 293 | """ 294 | Find and return scope, where scope contains grouped recordset with `name` 295 | """ 296 | for scope in reversed(self.scopes): 297 | rset = scope.get_grouped_recordset(name) 298 | if rset is not None: 299 | return scope 300 | 301 | def get_recordset_schema(self, name: str) -> Optional[NonGroupedSchema]: 302 | scope = self.find_recordset_scope(name) 303 | if scope: 304 | return scope.get_recordset_schema(name) 305 | 306 | def get_grouped_recordset_schema(self, name: str) -> Optional[NonGroupedSchema]: 307 | scope = self.find_grouped_recordset_scope(name) 308 | if scope: 309 | return scope.get_grouped_recordset_schema(name) 310 | 311 | def append_recordset(self, name: str, record): 312 | """ 313 | find the correct recordset across all scopes; 314 | then add record to it 315 | """ 316 | scope = self.find_recordset_scope(name) 317 | assert scope is not None 318 | recordset = scope.get_recordset(name) 319 | recordset.append(record) 320 | 321 | def append_grouped_recordset(self, name: str, group_key: Tuple, record): 322 | """ 323 | Add record to a group 324 | """ 325 | scope = self.find_grouped_recordset_scope(name) 326 | assert scope is not None 327 | recordset = scope.get_grouped_recordset(name) 328 | recordset[group_key].append(record) 329 | 330 | def add_group_grouped_recordset(self, name: str, group_key: Tuple, group_recordset): 331 | """ 332 | Add a new group, with a given set of records for group_recordset 333 | """ 334 | scope = self.find_grouped_recordset_scope(name) 335 | assert scope is not None 336 | recordset = scope.get_grouped_recordset(name) 337 | assert group_key not in recordset 338 | recordset[group_key] = group_recordset 339 | 340 | def drop_recordset(self, name: str): 341 | scope = self.find_recordset_scope(name) 342 | assert scope is not None 343 | scope.drop_recordset(name) 344 | 345 | def drop_grouped_recordset(self, name: str): 346 | 
raise NotImplementedError 347 | 348 | def recordset_iter(self, name: str): 349 | """Return an iterator over recordset 350 | NOTE: The iterator will be consumed after one iteration 351 | """ 352 | scope = self.find_recordset_scope(name) 353 | assert scope is not None 354 | return iter(scope.get_recordset(name)) 355 | 356 | def grouped_recordset_iter(self, name) -> List[GroupedRecord]: 357 | """ 358 | return an iterator over a groups from a grouped recordset 359 | """ 360 | scope = self.find_grouped_recordset_scope(name) 361 | assert scope is not None 362 | recordset = scope.get_grouped_recordset(name) 363 | schema = scope.get_grouped_recordset_schema(name) 364 | # NOTE: cloning the group_rset, since it may need to be iterated multiple times 365 | # A group is represented by a GroupedRecord 366 | return [ 367 | GroupedRecord(schema, group_key, group_rset) 368 | for group_key, group_rset in recordset.items() 369 | ] 370 | -------------------------------------------------------------------------------- /learndb/stress.py: -------------------------------------------------------------------------------- 1 | """ 2 | This is a stub for "stress" tests, which will perform 3 | a large number of random operations or perform them 4 | for a fixed amount of time. 5 | 6 | These should compliment, static unit tests, in that they 7 | should run non-deterministically, and thus expose issues 8 | that unit-tests can't catch. 9 | """ 10 | import logging 11 | import itertools 12 | import math 13 | 14 | from .constants import EXIT_SUCCESS 15 | 16 | 17 | def run_add_del_stress_test(db, insert_keys, del_keys): 18 | """ 19 | perform some ops/validations 20 | 21 | :param db: 22 | :param insert_keys: 23 | :param del_keys: 24 | :return: 25 | """ 26 | 27 | db.nuke_dbfile() 28 | 29 | print(f"running test case: {insert_keys} {del_keys}") 30 | 31 | # random.shuffle(del_keys) 32 | cmd = "create table foo ( colA integer primary key, colB text)" 33 | logging.info(f"handling [{cmd}]") 34 | resp = db.handle_input(cmd) 35 | 36 | # insert 37 | for key in insert_keys: 38 | cmd = f"insert into foo (colA, colB) values ({key}, 'hellew words foo')" 39 | logging.info(f"handling [{cmd}]") 40 | resp = db.handle_input(cmd) 41 | 42 | logging.debug("printing tree.......................") 43 | db.state_manager.print_tree("foo") 44 | 45 | # delete and validate 46 | for idx, key in enumerate(del_keys): 47 | # cmd = f"delete from foo where colA = {key} AND colB = 'foo'" 48 | cmd = f"delete from foo where colA = {key}" 49 | logging.info(f"handling [{cmd}]") 50 | resp = db.handle_input(cmd) 51 | if not resp.success: 52 | print(f"cmd {cmd} failed with {resp.status} {resp.error_message}") 53 | return EXIT_SUCCESS 54 | 55 | resp = db.handle_input("select cola, colb from foo") 56 | assert resp.success 57 | 58 | # output pipe 59 | pipe = db.get_pipe() 60 | 61 | result_keys = [] 62 | # print anything in the output buffer 63 | logging.debug(f"pipe has msgs: {pipe.has_msgs()}") 64 | while pipe.has_msgs(): 65 | record = pipe.read() 66 | key = record.get("cola") 67 | print(f"pipe read: {record}") 68 | result_keys.append(key) 69 | 70 | # assert result_keys == [k for k in sorted(keys)], f"result {result_keys} doesn't not match {[k for k in sorted(keys)]}" 71 | 72 | logging.debug("printing tree.......................") 73 | db.state_manager.print_tree("foo") 74 | # ensure tree is valid 75 | db.state_manager.validate_tree("foo") 76 | 77 | # check if all keys we expect are there in result 78 | expected = [key for key in sorted(del_keys[idx + 1 :])] 79 | actual 
= [key for key in sorted(set(result_keys))] 80 | assert actual == expected, f"expected: {expected}; received {actual}" 81 | 82 | print("*" * 100) 83 | 84 | db.close() 85 | 86 | 87 | def run_add_del_stress_suite(learndb): 88 | """ 89 | Perform a large number of add/del operation 90 | and validate btree correctness. 91 | :return: 92 | """ 93 | 94 | test_cases = [ 95 | [1, 2, 3, 4], 96 | [64, 5, 13, 82], 97 | [82, 13, 5, 2, 0], 98 | [10, 20, 30, 40, 50, 60, 70], 99 | [72, 79, 96, 38, 47], 100 | [432, 507, 311, 35, 246, 950, 956, 929, 769, 744, 994, 438], 101 | [159, 597, 520, 189, 822, 725, 504, 397, 218, 134, 516], 102 | [159, 597, 520, 189, 822, 725, 504, 397], 103 | [960, 267, 947, 400, 795, 327, 464, 884, 667, 870, 92], 104 | [793, 651, 165, 282, 177, 439, 593], 105 | [229, 653, 248, 298, 801, 947, 63, 619, 475, 422, 856, 57, 38], 106 | [103, 394, 484, 380, 834, 677, 604, 611, 952, 71, 568, 291, 433, 305], 107 | [ 108 | 114, 109 | 464, 110 | 55, 111 | 450, 112 | 729, 113 | 646, 114 | 95, 115 | 649, 116 | 59, 117 | 412, 118 | 546, 119 | 340, 120 | 667, 121 | 274, 122 | 477, 123 | 363, 124 | 333, 125 | 897, 126 | 772, 127 | 508, 128 | 182, 129 | 305, 130 | 428, 131 | 180, 132 | 22, 133 | ], 134 | [15, 382, 653, 668, 139, 70, 828, 17, 891, 121, 175, 642, 491, 281, 920], 135 | [ 136 | 967, 137 | 163, 138 | 791, 139 | 938, 140 | 939, 141 | 196, 142 | 104, 143 | 465, 144 | 886, 145 | 355, 146 | 58, 147 | 251, 148 | 928, 149 | 758, 150 | 535, 151 | 737, 152 | 357, 153 | 125, 154 | 171, 155 | 838, 156 | 572, 157 | 745, 158 | 999, 159 | 417, 160 | 393, 161 | 458, 162 | 292, 163 | 904, 164 | 158, 165 | 286, 166 | 900, 167 | 859, 168 | 668, 169 | 183, 170 | ], 171 | [ 172 | 726, 173 | 361, 174 | 583, 175 | 121, 176 | 908, 177 | 789, 178 | 842, 179 | 67, 180 | 871, 181 | 461, 182 | 522, 183 | 394, 184 | 225, 185 | 637, 186 | 792, 187 | 393, 188 | 656, 189 | 748, 190 | 39, 191 | 696, 192 | ], 193 | [ 194 | 54, 195 | 142, 196 | 440, 197 | 783, 198 | 619, 199 | 273, 200 | 95, 201 | 961, 202 | 692, 203 | 369, 204 | 447, 205 | 825, 206 | 555, 207 | 908, 208 | 483, 209 | 356, 210 | 40, 211 | 110, 212 | 519, 213 | 599, 214 | ], 215 | [ 216 | 413, 217 | 748, 218 | 452, 219 | 666, 220 | 956, 221 | 926, 222 | 94, 223 | 813, 224 | 245, 225 | 237, 226 | 264, 227 | 709, 228 | 706, 229 | 872, 230 | 535, 231 | 214, 232 | 561, 233 | 882, 234 | 646, 235 | ], 236 | ] 237 | 238 | # stress 239 | for test_case in test_cases: 240 | insert_keys = test_case 241 | # del_keys = test_case[:] 242 | 243 | # there is a large number of perms ~O(n!) 
244 | # and they are generated in a predictable order 245 | # we'll skip based on fixed step- later, this too should be randomized 246 | num_perms = 1 247 | total_perms = math.factorial(len(insert_keys)) 248 | del_perms = [] 249 | 250 | step_size = min(total_perms // num_perms, 10) 251 | # iterator over permutations 252 | perm_iter = itertools.permutations(insert_keys) 253 | 254 | while len(del_perms) < num_perms: 255 | for _ in range(step_size - 1): 256 | # skip n-1 deletes 257 | next(perm_iter) 258 | del_perms.append(next(perm_iter)) 259 | 260 | for del_keys in del_perms: 261 | try: 262 | run_add_del_stress_test(learndb, insert_keys, del_keys) 263 | except Exception as e: 264 | logging.error( 265 | f"Inner devloop failed on: {insert_keys} {del_keys} with {e}" 266 | ) 267 | raise 268 | -------------------------------------------------------------------------------- /learndb/value_generators.py: -------------------------------------------------------------------------------- 1 | """ 2 | Contains classes corresponding to value generators. 3 | 4 | ValueGenerators are used in to evaluate select clauses. They are objects 5 | that track the formal parameters passed to the select clause. Then 6 | when iterating over a recordset, the valueGenerator takes a record, and returns a single output value 7 | """ 8 | from dataclasses import dataclass 9 | from typing import Any, Dict, List, NewType, Union 10 | 11 | from .functions import FunctionDefinition 12 | from .lang_parser.symbols import OrClause 13 | from .record_utils import SimpleRecord, ScopedRecord, GroupedRecord 14 | from .expression_interpreter import ExpressionInterpreter 15 | 16 | 17 | @dataclass 18 | class ColumnRefSelectableAtom: 19 | """ 20 | Represents a selectable, that is a column ref, as opposed to a literal. 21 | A note on name, a selectable is any component of a select clause, e.g. 22 | select 1, upper(name) from people 23 | 24 | Here, `upper(name)` is a selectable, and `name` is a ColumnRefSelectableAtom, and 1 is a LiteralSelectableAtom. 25 | """ 26 | 27 | name: Any 28 | 29 | 30 | @dataclass 31 | class LiteralSelectableAtom: 32 | """ 33 | Represents a literal selectable 34 | """ 35 | 36 | value: Any 37 | 38 | 39 | SelectableAtom = NewType( 40 | "SelectableAtom", Union[ColumnRefSelectableAtom, LiteralSelectableAtom] 41 | ) 42 | 43 | 44 | class ValueGeneratorFromRecordOverFunc: 45 | """ 46 | Generate value from a single record. 47 | 48 | This is used in a select clause, e.g. 49 | 50 | select cola from foo 51 | 52 | or 53 | 54 | select upper(cola) from foo 55 | """ 56 | 57 | def __init__( 58 | self, 59 | pos_args: List[SelectableAtom], 60 | named_args: Dict[str, SelectableAtom], 61 | func: FunctionDefinition, 62 | ): 63 | """ 64 | pos_args: List of SelectableAtoms which represents either: 1) static values, 2) column identifiers 65 | named_args: Dict of ^ 66 | func: should this be a FunctionDefinition or None 67 | """ 68 | self.pos_args = pos_args 69 | self.named_args = named_args 70 | self.func = func 71 | 72 | def get_value(self, record) -> Any: 73 | """ 74 | This is invoked when iterating over a recordset with each record 75 | """ 76 | # evaluate pos_args, i.e. convert SelectableAtom to a value that can be passed to a function 77 | evaluated_pos_args = [] 78 | for arg in self.pos_args: 79 | if isinstance(arg, LiteralSelectableAtom): 80 | # evaluate any literals, by unboxing from `LiteralSelectableAtom` 81 | evaluated_pos_args.append(arg.value) 82 | else: 83 | # evaluate any column references, i.e. 
replace with value in record 84 | evaluated_pos_args.append(record.get(arg.name)) 85 | 86 | evaluated_named_args = {} 87 | for arg_name, arg_val in self.named_args.items(): 88 | if isinstance(arg_val, LiteralSelectableAtom): 89 | evaluated_named_args[arg_name] = arg_val.value 90 | else: 91 | evaluated_named_args[arg_name] = record.get(arg_val.name) 92 | 93 | # apply a function on arguments to 94 | ret_val = self.func.apply(evaluated_pos_args, evaluated_named_args) 95 | return ret_val 96 | 97 | 98 | class ValueGeneratorFromRecordOverExpr: 99 | """ 100 | Generate value from a single record. Where the value is the result of evaluating an expr. 101 | The expr can be composed of column refs, literals, function calls, and algebraic combinations of these. 102 | 103 | This generalizes the ValueGeneratorFromRecordOverFunc, ValueExtractorFromRecord 104 | """ 105 | 106 | def __init__(self, or_clause: OrClause, interpreter: ExpressionInterpreter): 107 | self.or_clause = or_clause 108 | self.interpreter = interpreter 109 | 110 | def get_value(self, record: Union[SimpleRecord, ScopedRecord]) -> Any: 111 | """ 112 | Evaluate the or_clause 113 | """ 114 | value = self.interpreter.evaluate_over_record(self.or_clause, record) 115 | return value 116 | 117 | 118 | class ValueGeneratorFromNoRecordOverExpr: 119 | """ 120 | Generate value from a no-record. Where the value is the result of evaluating an expr. 121 | The expr can be composed of column refs, literals, function calls, and algebraic combinations of these. 122 | """ 123 | 124 | def __init__(self, or_clause: OrClause, interpreter: ExpressionInterpreter): 125 | self.or_clause = or_clause 126 | self.interpreter = interpreter 127 | 128 | def get_value(self) -> Any: 129 | """ 130 | Evaluate the or_clause 131 | """ 132 | value = self.interpreter.evaluate_over_no_record(self.or_clause) 133 | return value 134 | 135 | 136 | class ValueGeneratorFromRecordGroupOverExpr: 137 | """ 138 | Generate value from pair of group_key, and recordset for record group. Where the value is the result of evaluating an expr. 139 | The expr can be composed of column refs, literals, function calls, and algebraic combinations of these. 
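    This is used, e.g., when evaluating the select clause of a grouped query like:

        select count(colb), cola from foo group by cola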
140 | """ 141 | 142 | def __init__(self, or_clause: OrClause, interpreter: ExpressionInterpreter): 143 | self.or_clause = or_clause 144 | self.interpreter = interpreter 145 | 146 | def get_value(self, record: GroupedRecord) -> Any: 147 | """ """ 148 | value = self.interpreter.evaluate_over_grouped_record(self.or_clause, record) 149 | return value 150 | -------------------------------------------------------------------------------- /learndb/vm_utils.py: -------------------------------------------------------------------------------- 1 | from enum import Enum, auto 2 | from typing import Type 3 | 4 | from .datatypes import DataType, Integer, Real, Blob, Text 5 | from .lang_parser.symbols import SymbolicDataType 6 | 7 | 8 | class EvalMode(Enum): 9 | NoSchema = auto() 10 | Scalar = auto() 11 | Grouped = auto() 12 | 13 | 14 | def datatype_from_symbolic_datatype(data_type: SymbolicDataType) -> Type[DataType]: 15 | """ 16 | Convert symbols.DataType to datatypes.DataType 17 | """ 18 | if data_type == SymbolicDataType.Integer: 19 | return Integer 20 | elif data_type == SymbolicDataType.Real: 21 | return Real 22 | elif data_type == SymbolicDataType.Blob: 23 | return Blob 24 | elif data_type == SymbolicDataType.Text: 25 | return Text 26 | else: 27 | raise Exception(f"Unknown type {data_type}") 28 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | black==23.3.0 2 | lark==1.0.0 3 | pytest==7.1.2 4 | -------------------------------------------------------------------------------- /run_learndb.py: -------------------------------------------------------------------------------- 1 | """ 2 | Main interface for user/developer of learndb. 3 | 4 | Utility to start repl and run commands. 5 | 6 | Requires learndb to be installed. 
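
e.g. start the repl with: python run_learndb.py repl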
7 | """ 8 | 9 | import sys 10 | 11 | from learndb import parse_args_and_start 12 | 13 | 14 | if __name__ == '__main__': 15 | parse_args_and_start(sys.argv[1:]) -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from distutils.core import setup 2 | 3 | setup(name='LearnDB', 4 | version='0.1', 5 | description='LearnDB database', 6 | author='Spandan Bemby', 7 | url='https://github.com/spandanb/learndb-py', 8 | packages=['learndb', 'learndb.lang_parser'], 9 | ) 10 | 11 | 12 | -------------------------------------------------------------------------------- /sqls/employees.sql: -------------------------------------------------------------------------------- 1 | create table employees ( 2 | id INTEGER PRIMARY KEY, 3 | name TEXT, 4 | salary INTEGER, 5 | depid INTEGER 6 | ); 7 | 8 | INSERT INTO employees(id, name, salary, depid) VALUES (1, 'John', 100, 1); 9 | INSERT INTO employees(id, name, salary, depid) VALUES (2, 'Anita', 200, 1); 10 | INSERT INTO employees(id, name, salary, depid) VALUES (3, 'Gab', 100, 2); 11 | 12 | create table department ( 13 | depid INTEGER PRIMARY KEY, 14 | name TEXT 15 | ); 16 | 17 | INSERT INTO department(depid, name) VALUES (1, 'accounting'); 18 | INSERT INTO department(depid, name) VALUES (2, 'sales'); 19 | INSERT INTO department(depid, name) VALUES (3, 'engineering'); 20 | 21 | select e.name, d.name from employees e inner join department d on e.depid = d.depid; 22 | 23 | select count(e.name), d.depid from employees e inner join department d on e.depid = d.depid group by d.depid; 24 | 25 | select count(e.name), d.depid from department d left join employees e on e.depid = d.depid group by d.depid; 26 | 27 | select count(e.name), d.depid from employees e right join department d on e.depid = d.depid group by d.depid; -------------------------------------------------------------------------------- /sqls/employees2.sql: -------------------------------------------------------------------------------- 1 | create table employees ( 2 | id INTEGER PRIMARY KEY, 3 | name TEXT, 4 | salary INTEGER, 5 | depid INTEGER 6 | ); 7 | 8 | INSERT INTO employees(id, name, salary, depid) VALUES (1, 'John', 100, 1); 9 | INSERT INTO employees(id, name, salary, depid) VALUES (2, 'Anita', 200, 1); 10 | INSERT INTO employees(id, name, salary, depid) VALUES (3, 'Gab', 100, 2); 11 | 12 | create table department ( 13 | depid INTEGER PRIMARY KEY, 14 | name TEXT 15 | ); 16 | 17 | INSERT INTO department(depid, name) VALUES (1, 'accounting'); 18 | INSERT INTO department(depid, name) VALUES (2, 'sales'); 19 | INSERT INTO department(depid, name) VALUES (3, 'engineering'); 20 | 21 | 22 | 23 | select count(e.name), d.depid from department d left join employees e on e.depid = d.depid group by d.depid; 24 | -------------------------------------------------------------------------------- /tasks.md: -------------------------------------------------------------------------------- 1 | it # Tasks 2 | 3 | ## Top Priority 4 | 5 | - docs 6 | - architecture.md 7 | - complete: flows, component breakdown 8 | 9 | - docs 10 | - tutorial.md 11 | - README 12 | - generate code docs (where should these be placed, perhaps src_docs) 13 | 14 | - bad ux: 15 | - "select tbl_name from catalog" 16 | - invalid column name - perhaps valid columns can be surfaces 17 | - "drop table table_doesnot_exist" 18 | - weird message 19 | 20 | - tests 21 | - add more lang_tests 22 | - no_source tests? 
23 | 24 | - is the usage of constants::NULLPTR valid? 25 | 26 | 27 | - increase constants::INTERNAL_NODE_MAX_CELLS 28 | - constants::LEAF_NODE_MAX_CELLS 29 | 30 | - validate ordering 31 | - quicksort: does this equality need to handle floats in a special way? 32 | 33 | 34 | - put admin tasks somewhere (Make, python doit, shell) 35 | - run all tests 36 | - run black 37 | - 38 | 39 | 40 | - release 41 | - add config file (controls output filepath, etc) 42 | 43 | 44 | - track bugs/gotchas 45 | 46 | - - ungrouped source impl 47 | - for lang_tests, I should assert on contents, right now only checking if parse is successful 48 | - cleanup learndb.py; ensure all devloops are encoded in some test cases 49 | - how to best structure E2E tests? 50 | - how should they be named? 51 | 52 | ## Testing 53 | - btree: test all permutations of small test cases 54 | - add stress tests (stress-tests.txt) 55 | - use pytest fixtures 56 | - seems it would be cleaner to define fixtures, i.e. pre-inited dbs with different schemas 57 | - right now, I have a lot of boiler plate 58 | - improve coverage and robustness of test suite 59 | - robustness: try randomized inputs 60 | - coverage: auto generate new inputs 61 | 62 | 63 | ## User API 64 | - metaops to list tables, show table schema 65 | - add config to control 66 | - how to pass config- update entry point method 67 | - stop_execution_on_error 68 | - output data file 69 | - in addition to LearnDB do I want to support: 70 | - cursor? 71 | - records should be immutable-since they're shallow copied; or final records returned to user should be separate 72 | - repl should have help message at beginning 73 | - have an additional/secondary command to output sql example/primer 74 | - run learndb with input file 75 | 76 | ## documentation/refactoring 77 | - complete architecture.md 78 | - document datatypes (type and valid ranges) 79 | - int is 4 byte int 80 | - real is a floating point number, but I'm handling it with much simpler rules than IEEE754 81 | - complete future-work.md 82 | - should contain high-level roadmap and interesting areas of future work 83 | - add tutorial 84 | - complete btree-structural-ops.txt 85 | - add btree types (pages/nodes) are bytearray (mutable) not bytes (immutable) 86 | 87 | 88 | ## Lark 89 | - document how parse tree -> AST is working 90 | - pretty print transformed tree 91 | - to_ast 92 | - sql_handler should return cleaned up ast 93 | - write tests for lark 94 | - validations: when parse tree is being turned into ast, assert things like, e.g. no-on clause on cross-join 95 | 96 | 97 | ## Optimization 98 | - If a function invocation is used, e.g. a count(col_a_) in both select, and having the expr value should be cached 99 | 100 | 101 | ## Parser 102 | - ensure rules make sense with `expression` symbol 103 | - this (expression) should wrap or_clause 104 | - func_call from grammar/parser side only supports pos args; extend this to allow named args since function objects support named args 105 | 106 | 107 | ## Storage (btree) 108 | - support deletion/free-list 109 | - support defragmentation of unallocated space 110 | - when allocating from free list, allocate whole block, 111 | the diff between block size and data size can be accounted for in diff between 112 | i.e. 
don't bother chunking blocks- since we'll have to account for padding anyways since free-list blocks have a min size 113 | - allocating on fragmented node (with enough space) should trigger 114 | single-node-in-place compaction 115 | 116 | 117 | 118 | 119 | ## VM 120 | - flow 121 | - if a statement fails, should exec stop? how is this behavior controlled? 122 | - select statement 123 | - in addition to cursor iteration; select will have conditions 124 | and an optimizer 125 | - what is interface for select 126 | - user executes select and is returned a pipe object 127 | 128 | 129 | 130 | ## Cleanliness 131 | - move all code into /learndb ? 132 | - run black 133 | - run mypy 134 | 135 | ## Bugs 136 | - e2e_test.py::join_test should fail 137 | - duplicate key not erroring (this might be working now) 138 | - create table bar (col1 integer primary key, col2 text), i.e. num in colname 139 | - create table def , requires space after final column and final ')' 140 | - select count(*) from countries group by country_name 141 | 142 | ## Release Requirements 143 | - complete docs 144 | - complete tutorial 145 | - document supported features 146 | 147 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/spandanb/learndb-py/242884e2418a09f480f17eac65e9c88e518bab1a/tests/__init__.py -------------------------------------------------------------------------------- /tests/btree_tests.py: -------------------------------------------------------------------------------- 1 | """ 2 | These are the new btree tests. These indirectly test the 3 | btree functionality via the frontend. I prefer this, as this simplifies 4 | the testing; otherwise, I'll have to import serde logic to generate formatted cells 5 | """ 6 | import pytest 7 | import random 8 | 9 | from .context import LearnDB 10 | from .test_constants import TEST_DB_FILE 11 | 12 | 13 | @pytest.fixture 14 | def test_cases(): 15 | return [ 16 | [1, 2, 3, 4], 17 | [64, 5, 13, 82], 18 | [82, 13, 5, 2, 0], 19 | [10, 20, 30, 40, 50, 60, 70], 20 | [72, 79, 96, 38, 47], 21 | [432, 507, 311, 35, 246, 950, 956, 929, 769, 744, 994, 438], 22 | [159, 597, 520, 189, 822, 725, 504, 397, 218, 134, 516], 23 | [159, 597, 520, 189, 822, 725, 504, 397], 24 | [960, 267, 947, 400, 795, 327, 464, 884, 667, 870, 92], 25 | [793, 651, 165, 282, 177, 439, 593], 26 | [229, 653, 248, 298, 801, 947, 63, 619, 475, 422, 856, 57, 38], 27 | [103, 394, 484, 380, 834, 677, 604, 611, 952, 71, 568, 291, 433, 305], 28 | [114, 464, 55, 450, 729, 646, 95, 649, 59, 412, 546, 340, 667, 274, 477, 363, 333, 897, 772, 508, 182, 305, 428, 29 | 180, 22], 30 | [15, 382, 653, 668, 139, 70, 828, 17, 891, 121, 175, 642, 491, 281, 920], 31 | [967, 163, 791, 938, 939, 196, 104, 465, 886, 355, 58, 251, 928, 758, 535, 737, 357, 125, 171, 838, 572, 745, 32 | 999, 417, 393, 458, 292, 904, 158, 286, 900, 859, 668, 183], 33 | [726, 361, 583, 121, 908, 789, 842, 67, 871, 461, 522, 394, 225, 637, 792, 393, 656, 748, 39, 696], 34 | [54, 142, 440, 783, 619, 273, 95, 961, 692, 369, 447, 825, 555, 908, 483, 356, 40, 110, 519, 599], 35 | [413, 748, 452, 666, 956, 926, 94, 813, 245, 237, 264, 709, 706, 872, 535, 214, 561, 882, 646] 36 | ] 37 | 38 | 39 | @pytest.fixture 40 | def tiny_test_cases(): 41 | return [ 42 | [1, 2, 3, 4], 43 | ] 44 | 45 | @pytest.fixture 46 | def small_test_cases(): 47 | return [ 48 | [1, 2, 3, 4], 49 | [4, 3, 2, 1], 50 | [64, 5, 13, 
82], 51 | [82, 13, 5, 2, 0], 52 | [10, 20, 30, 40, 50, 60, 70], 53 | ] 54 | 55 | 56 | def test_inserts(test_cases): 57 | """ 58 | iterate over test cases, insert keys 59 | - validate tree 60 | - scan table and ensure keys are sorted version of inputted keys 61 | 62 | :param test_cases: fixture 63 | :return: 64 | """ 65 | 66 | for test_case in test_cases: 67 | db = LearnDB(TEST_DB_FILE, nuke_db_file=True) 68 | # delete old file 69 | db.nuke_dbfile() 70 | 71 | # test interfaces via db frontend 72 | # create table before inserting 73 | # TODO: FIX me current parser + VM can't handle mixed-case column names, e.g. 74 | # colA; for now making them all lowercase 75 | #db.handle_input("create table foo ( colA integer primary key, colB text)") 76 | db.handle_input("create table foo ( cola integer primary key, colb text)") 77 | 78 | # insert keys 79 | for idx, key in enumerate(test_case): 80 | db.handle_input(f"insert into foo (cola, colb) values ({key}, 'hello world')") 81 | 82 | # select rows 83 | db.handle_input("select cola, colb from foo") 84 | pipe = db.get_pipe() 85 | assert pipe.has_msgs(), "expected rows" 86 | # collect keys into a list 87 | result_keys = [] 88 | while pipe.has_msgs(): 89 | record = pipe.read() 90 | key = record.get("cola") 91 | result_keys.append(key) 92 | 93 | db.virtual_machine.state_manager.validate_tree("foo") 94 | sorted_test_case = [k for k in sorted(test_case[:idx+1])] 95 | assert result_keys == sorted_test_case, f"result {result_keys} doesn't not match {sorted_test_case}" 96 | 97 | db.close() 98 | del db 99 | 100 | 101 | def test_deletes(test_cases): 102 | """ 103 | iterate over test cases- insert all keys 104 | then delete keys and ensure: 105 | - tree is consistent 106 | - has expected keys 107 | 108 | :param test_cases: 109 | :return: 110 | """ 111 | 112 | for test_case in test_cases: 113 | db = LearnDB(TEST_DB_FILE) 114 | # delete old file 115 | db.nuke_dbfile() 116 | 117 | # test interfaces via db frontend 118 | # create table before inserting 119 | # db.handle_input("create table foo ( cola integer primary key, colb text)") 120 | db.handle_input("create table foo ( cola integer primary key, colb text)") 121 | 122 | # insert keys 123 | for key in test_case: 124 | db.handle_input(f"insert into foo (colA, colB) values ({key}, 'hello world')") 125 | 126 | # shuffle keys in repeatable order 127 | random.seed(1) 128 | del_keys = test_case[:] 129 | random.shuffle(del_keys) 130 | 131 | for idx, key in enumerate(del_keys): 132 | try: 133 | # delete key 134 | db.handle_input(f"delete from foo where cola = {key}") 135 | # validate input 136 | 137 | # select rows 138 | db.handle_input("select cola, colb from foo") 139 | pipe = db.get_pipe() 140 | 141 | # collect keys into a list 142 | result_keys = [] 143 | while pipe.has_msgs(): 144 | record = pipe.read() 145 | key = record.get("cola") 146 | result_keys.append(key) 147 | 148 | try: 149 | db.virtual_machine.state_manager.validate_tree("foo") 150 | except Exception as e: 151 | raise Exception(f"validate tree failed for {idx} {del_keys} with {e}") 152 | sorted_test_case = [k for k in sorted(del_keys[idx+1:])] 153 | assert result_keys == sorted_test_case, f"result {result_keys} doesn't not match {sorted_test_case}" 154 | except Exception as e: 155 | raise Exception(f"Delete test case [{test_case}][{idx}] {del_keys} with {e}") 156 | 157 | db.close() 158 | del db 159 | -------------------------------------------------------------------------------- /tests/context.py: 
-------------------------------------------------------------------------------- 1 | """ 2 | This sets up the modules for testing 3 | """ 4 | import os 5 | import sys 6 | # otherwise everything that needs to be tested will have to be explicitly imported 7 | # which would make the top level export expose items that aren't intended for user access 8 | sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) 9 | 10 | 11 | # specific internal imports for specific tests suites 12 | # generally we'll import entire module, unless it' clearer to import a specific member 13 | 14 | from learndb.constants import REAL_EPSILON 15 | 16 | # learndb 17 | from learndb.interface import LearnDB 18 | 19 | # lang_tests 20 | from learndb.lang_parser.sqlhandler import SqlFrontEnd 21 | 22 | from learndb import datatypes 23 | from learndb.schema import SimpleSchema, Column 24 | from learndb.record_utils import SimpleRecord 25 | from learndb.serde import deserialize_cell, serialize_record 26 | 27 | from learndb.pager import Pager -------------------------------------------------------------------------------- /tests/e2e_suite2_tests.py: -------------------------------------------------------------------------------- 1 | """ 2 | Set of tests on employees schema 3 | """ 4 | import pytest 5 | 6 | from .context import LearnDB 7 | from .test_constants import TEST_DB_FILE 8 | 9 | # utils 10 | 11 | 12 | @pytest.fixture 13 | def db_employees(): 14 | """ 15 | Return db with employees schema 16 | """ 17 | commands = [ 18 | """create table employees ( 19 | id INTEGER PRIMARY KEY, 20 | name TEXT, 21 | salary INTEGER, 22 | depid INTEGER)""", 23 | 24 | "INSERT INTO employees(id, name, salary, depid) VALUES (1, 'John', 100, 1)", 25 | "INSERT INTO employees(id, name, salary, depid) VALUES (2, 'Anita', 300, 1)", 26 | "INSERT INTO employees(id, name, salary, depid) VALUES (3, 'Gab', 200, 2)", 27 | 28 | """create table department ( 29 | depid INTEGER PRIMARY KEY, 30 | name TEXT)""", 31 | 32 | "INSERT INTO department(depid, name) VALUES (1, 'accounting')", 33 | "INSERT INTO department(depid, name) VALUES (2, 'sales')", 34 | "INSERT INTO department(depid, name) VALUES (3, 'engineering')", 35 | ] 36 | db = LearnDB(TEST_DB_FILE, nuke_db_file=True) 37 | db.nuke_dbfile() 38 | for cmd in commands: 39 | resp = db.handle_input(cmd) 40 | assert resp.success, f"{cmd} failed with {resp.error_message}" 41 | 42 | return db 43 | 44 | 45 | @pytest.fixture 46 | def db_fruits(): 47 | db = LearnDB(TEST_DB_FILE, nuke_db_file=True) 48 | db.nuke_dbfile() 49 | commands = [ 50 | """CREATE TABLE fruits ( 51 | id INTEGER PRIMARY KEY, 52 | name TEXT, 53 | avg_weight INTEGER) 54 | """, 55 | "insert into fruits (id, name, avg_weight) values (1, 'apple', 200)", 56 | "insert into fruits (id, name, avg_weight) values (2, 'orange', 140)", 57 | "insert into fruits (id, name, avg_weight) values (3, 'pineapple', 1000)", 58 | "insert into fruits (id, name, avg_weight) values (4, 'grape', 5)", 59 | "insert into fruits (id, name, avg_weight) values (5, 'pear', 166)", 60 | "insert into fruits (id, name, avg_weight) values (6, 'mango', 140)", 61 | "insert into fruits (id, name, avg_weight) values (7, 'watermelon', 10000)", 62 | "insert into fruits (id, name, avg_weight) values (8, 'banana', 118)", 63 | "insert into fruits (id, name, avg_weight) values (9, 'peach', 147)", 64 | ] 65 | for cmd in commands: 66 | resp = db.handle_input(cmd) 67 | assert resp.success, f"{cmd} failed with {resp.error_message}" 68 | 69 | return db 70 | 71 | 72 | # test 73 | 
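# The tests below issue learndb-sql through the fixture-provided LearnDB handles
# (db_employees, db_fruits) via handle_input(), then drain get_pipe() to collect
# records for their assertions; together they cover inner/left/right joins,
# group by with having, order by with limit, and dropping a table from the catalog.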
74 | 75 | def test_select_inner_join(db_employees): 76 | db_employees.handle_input("select e.name, d.name from employees e inner join department d on e.depid = d.depid") 77 | employees = {} 78 | while db_employees.get_pipe().has_msgs(): 79 | record = db_employees.get_pipe().read() 80 | employee_name = record.at_index(0) 81 | dep_name = record.at_index(1) 82 | if dep_name not in employees: 83 | employees[dep_name] = set() 84 | employees[dep_name].add(employee_name) 85 | assert employees["accounting"] == {"Anita", "John"} 86 | assert employees["sales"] == {"Gab"} 87 | 88 | 89 | def test_select_left_join_and_group_by(db_employees): 90 | """ 91 | Count number of employees in department, even for departments with no employees 92 | count employees after doing department left join employees 93 | """ 94 | db_employees.handle_input("select count(e.name), d.name from department d left join employees e on e.depid = d.depid group by d.name") 95 | employees = {} 96 | while db_employees.get_pipe().has_msgs(): 97 | record = db_employees.get_pipe().read() 98 | employee_count = record.at_index(0) 99 | dep_name = record.at_index(1) 100 | employees[dep_name] = employee_count 101 | assert employees["accounting"] == 2 102 | assert employees["sales"] == 1 103 | assert employees["engineering"] == 0 104 | 105 | 106 | def test_select_right_join_and_group_by(db_employees): 107 | """ 108 | Count number of employees in department, even for departments with no employees 109 | count employees after doing department left join employees 110 | """ 111 | db_employees.handle_input("select count(e.name), d.name from employees e right join department d on e.depid = d.depid group by d.name") 112 | employees = {} 113 | while db_employees.get_pipe().has_msgs(): 114 | record = db_employees.get_pipe().read() 115 | employee_count = record.at_index(0) 116 | dep_name = record.at_index(1) 117 | employees[dep_name] = employee_count 118 | assert employees["accounting"] == 2 119 | assert employees["sales"] == 1 120 | assert employees["engineering"] == 0 121 | 122 | 123 | def test_select_group_by_and_having(db_employees): 124 | db_employees.handle_input("select count(e.name), d.name from employees e inner join department d on e.depid = d.depid group by d.name having count(e.name) < 2") 125 | employees = {} 126 | while db_employees.get_pipe().has_msgs(): 127 | record = db_employees.get_pipe().read() 128 | employee_count = record.at_index(0) 129 | dep_name = record.at_index(1) 130 | employees[dep_name] = employee_count 131 | assert len(employees) == 1 132 | assert employees["sales"] == 1 133 | 134 | 135 | def test_order_limit(db_fruits): 136 | db_fruits.handle_input("select name, id from fruits order by id limit 5") 137 | values = [] 138 | while db_fruits.get_pipe().has_msgs(): 139 | record = db_fruits.get_pipe().read() 140 | fruit = record.at_index(0) 141 | values.append(fruit) 142 | expected = ['apple', 'orange', 'pineapple', 'grape', 'pear'] 143 | assert expected == values 144 | 145 | 146 | def test_multi_column_order(db_fruits): 147 | db_fruits.handle_input("select name, avg_weight from fruits order by avg_weight, name desc limit 4") 148 | values = [] 149 | while db_fruits.get_pipe().has_msgs(): 150 | record = db_fruits.get_pipe().read() 151 | fruit = record.at_index(0) 152 | values.append(fruit) 153 | # critically, mango and orange have the same weight 154 | # descending ordering on name, means mango does first 155 | expected = ['grape', 'banana', 'orange', 'mango'] 156 | assert expected == values 157 | 158 | 159 | def 
test_table_drop(db_employees): 160 | db_employees.handle_input("SELECT name from catalog") 161 | table_names = [] 162 | while db_employees.get_pipe().has_msgs(): 163 | record = db_employees.get_pipe().read() 164 | table_name = record.at_index(0) 165 | table_names.append(table_name) 166 | assert len(table_names) == 2 167 | assert "department" in table_names and "employees" in table_names 168 | 169 | db_employees.handle_input("DROP TABLE employees") 170 | db_employees.handle_input("SELECT name from catalog") 171 | table_names = [] 172 | while db_employees.get_pipe().has_msgs(): 173 | record = db_employees.get_pipe().read() 174 | table_name = record.at_index(0) 175 | table_names.append(table_name) 176 | assert len(table_names) == 1 177 | assert table_names[0] == "department" 178 | -------------------------------------------------------------------------------- /tests/lang_tests.py: -------------------------------------------------------------------------------- 1 | """ 2 | Tests to validate whether learndb-sql statements are parsed as expected 3 | """ 4 | import pytest 5 | 6 | from .context import SqlFrontEnd 7 | 8 | 9 | def test_select_stmnt(): 10 | cmds = ["select colA from foo where colA <> 4.2", 11 | "select colA from foo", 12 | "select colA from foo", 13 | # "select 32" # fails 14 | ] 15 | handler = SqlFrontEnd() 16 | for cmd in cmds: 17 | handler.parse(cmd) 18 | assert handler.is_success() 19 | 20 | 21 | def test_misc_succ_stmnt(): 22 | """ 23 | Collection of misc statements that should 24 | succeed successfully 25 | - should be moved into statement type specific tests 26 | 27 | :return: 28 | """ 29 | cmds = ["select cola, colb from foo where cola = 1 and colb = 2 or colc = 3", 30 | "select cola, colb from foo where cola = 1 and colb = 2 and colc = 3", 31 | "select cola, colb from foo where cola = 1 and colb = 2 or colc = 3 and cold = 4", 32 | "select cola, colb from foo f join bar b on f.cola = b.colb", 33 | "select cola, colb from foo f inner join bar b on f.cola = b.colb", 34 | "select cola, colb from foo f inner join bar r on f.cola = r.coly", 35 | "select cola, colb from foo f inner join bar r on f.b = r.y inner join car c on c.x = f.b", 36 | "select cola, colb from foo f inner join bar r on f.b = r.y left join car c on c.x = f.b", 37 | "select cola, colb from foo f left outer join bar r on f.b = r.y right join car c on c.x = f.b", 38 | "select cola, colb from foo f cross join bar r", 39 | "select cola, colb from foo f left join bar r on (select max(fig, farce) from fodo where x = 1)" 40 | ] 41 | for cmd in cmds: 42 | handler = SqlFrontEnd() 43 | handler.parse(cmd) 44 | assert handler.is_success() 45 | 46 | 47 | def test_misc_fail_stmnt(): 48 | """ 49 | misc collections of statements that should fail 50 | :return: 51 | 52 | """ 53 | cmds = [ 54 | # NOTE: That there be no on-clause in cross-join must be enforced when parse tree is being converted to AST 55 | # this is currently not implemented 56 | # "select cola, colb from foo f cross join bar r on f.x = r.y", # cross join should not have an on-clause 57 | ] 58 | for cmd in cmds: 59 | with pytest.raises(AssertionError): 60 | handler = SqlFrontEnd() 61 | handler.parse(cmd) 62 | assert handler.is_success() 63 | 64 | 65 | def test_create_stmnt(): 66 | cmds = [ 67 | "create table foo ( colA integer primary key, colB text)" 68 | ] 69 | for cmd in cmds: 70 | handler = SqlFrontEnd() 71 | handler.parse(cmd) 72 | assert handler.is_success() 73 | 74 | 75 | def test_delete_stmnt(): 76 | cmds = [ 77 | "delete from table_foo", 78 | "delete from 
table_foo where car_name <> 'marmar'" 79 | ] 80 | for cmd in cmds: 81 | handler = SqlFrontEnd() 82 | handler.parse(cmd) 83 | assert handler.is_success() 84 | 85 | 86 | def test_truncate_stmnt(): 87 | cmd = "truncate foo" 88 | handler = SqlFrontEnd() 89 | handler.parse(cmd) 90 | assert handler.is_success() 91 | 92 | 93 | def test_drop_stmnt(): 94 | cmd = "drop table foo" 95 | handler = SqlFrontEnd() 96 | handler.parse(cmd) 97 | assert handler.is_success() 98 | 99 | 100 | def test_multi_stmnt(): 101 | cmd = "create table foo ( colA integer primary key, colB text); select cola from foo" 102 | handler = SqlFrontEnd() 103 | handler.parse(cmd) 104 | assert handler.is_success() 105 | 106 | 107 | def test_insert_stmnt(): 108 | cmds = [ 109 | "insert into table_name (col_a, col_b) values ('val_a', 32)", 110 | "insert into table_name (col_a, col_b) values ('val_a', 'val_b')", 111 | "insert into table_name (col_a, col_b) values (11, 92)" 112 | ] 113 | 114 | handler = SqlFrontEnd() 115 | for cmd in cmds: 116 | handler.parse(cmd) 117 | assert handler.is_success() 118 | 119 | 120 | def test_update_stmnt(): 121 | cmds = [ 122 | "update table_name set column_name = 'value' where foo = bar", 123 | "update table_name set column_name = 32" 124 | ] 125 | handler = SqlFrontEnd() 126 | for cmd in cmds: 127 | handler.parse(cmd) 128 | assert handler.is_success() 129 | 130 | 131 | def test_create_stmnt_fail_no_cols(): 132 | """ 133 | test invalid command raising parser exception. 134 | NOTE: Currently tokenizer, parser exceptions are 135 | just messages, and so hard to precisely validate. 136 | 137 | :return: 138 | """ 139 | cmd = "create table foo ()" 140 | handler = SqlFrontEnd() 141 | handler.parse(cmd) 142 | assert handler.is_success() is False 143 | 144 | 145 | 146 | def test_expr(): 147 | pass 148 | 149 | -------------------------------------------------------------------------------- /tests/pager_tests.py: -------------------------------------------------------------------------------- 1 | """ 2 | Get a page, return a page. close pager. 3 | """ 4 | import os 5 | 6 | from .context import Pager 7 | from .test_constants import TEST_DB_FILE 8 | 9 | 10 | def test_free_pages_persisted(): 11 | """ 12 | Test that returned pages are persisted 13 | and re-served after pager is closed and reopened. 
14 | :return: 15 | """ 16 | if os.path.exists(TEST_DB_FILE): 17 | os.remove(TEST_DB_FILE) 18 | 19 | pager = Pager(TEST_DB_FILE) 20 | first = pager.get_unused_page_num() 21 | second = pager.get_unused_page_num() 22 | third = pager.get_unused_page_num() 23 | 24 | # don't return third - to avoid file truncation 25 | pager.return_page(first) 26 | pager.return_page(second) 27 | returned_pages = {first, second} 28 | 29 | # close pager and see if the returned pages 30 | # are given when an unused page is requested 31 | pager.close() 32 | 33 | pager = Pager(TEST_DB_FILE) 34 | new_page = pager.get_unused_page_num() 35 | assert new_page in returned_pages 36 | new_page = pager.get_unused_page_num() 37 | assert new_page in returned_pages -------------------------------------------------------------------------------- /tests/serde_tests.py: -------------------------------------------------------------------------------- 1 | """ 2 | Tests serde of individual datatypes and of schemas/records 3 | composed of columns of many different datatype 4 | """ 5 | from .context import (REAL_EPSILON, datatypes, SimpleSchema, Column, SimpleRecord, deserialize_cell, 6 | serialize_record) 7 | 8 | 9 | def test_integer_serde(): 10 | values = [4, 100, 109297] 11 | for value in values: 12 | # create int 13 | datatype = datatypes.Integer 14 | # serialize int 15 | ser_val = datatype.serialize(value) 16 | # deserialize bytes 17 | deser_val = datatype.deserialize(ser_val) 18 | # assert initial value equals round-tripped value 19 | assert deser_val == value 20 | 21 | 22 | def test_real_serde(): 23 | values = [4.0, 11.7, 19.297] 24 | for value in values: 25 | # create int 26 | datatype = datatypes.Real 27 | # serialize int 28 | ser_val = datatype.serialize(value) 29 | # deserialize bytes 30 | deser_val = datatype.deserialize(ser_val) 31 | # assert initial value approximately equals round-tripped value 32 | # NOTE: since floats don't convert exactly, we need to 33 | # compare value given float representation limitations. 34 | # not sure if it's valid to compare the diff of values 35 | # be less than threshold- since the threshold may vary depending on magnitude? 
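        # for now, compare against the fixed absolute tolerance REAL_EPSILON
        # (defined in learndb.constants and re-exported via tests.context)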
36 | assert abs(deser_val - value) < REAL_EPSILON 37 | 38 | 39 | def test_key_only_schema_serde(): 40 | """ 41 | Attempt to serialize and deserialize a schema 42 | :return: 43 | """ 44 | schema = SimpleSchema('dummy', [ 45 | Column('pkey', datatypes.Integer, is_primary_key=True) 46 | ]) 47 | # create a record that matches above schema 48 | record = SimpleRecord({"pkey": 1}, schema) 49 | 50 | # serialize 51 | resp = serialize_record(record) 52 | assert resp.success, "serialize failed" 53 | serialized = resp.body 54 | # deserialize 55 | resp = deserialize_cell(serialized, schema) 56 | assert resp.success, "deserialize failed" 57 | deserialized = resp.body 58 | 59 | # validate original and deserialized record have the same value 60 | for col in schema.columns: 61 | assert record.values[col.name] == deserialized.values[col.name] 62 | 63 | 64 | def test_multi_column_fixed_len_type_serde(): 65 | """ 66 | Attempt to serialize and deserialize a schema 67 | :return: 68 | """ 69 | schema = SimpleSchema('dummy', [ 70 | Column('pkey', datatypes.Integer, is_primary_key=True), 71 | Column('name', datatypes.Text), 72 | Column('root_pagenum', datatypes.Integer) 73 | ]) 74 | # create a record that matches above schema 75 | record = SimpleRecord({"pkey": 1, "name": "some_table_nane", "root_pagenum": 2}, schema) 76 | 77 | # serialize 78 | resp = serialize_record(record) 79 | assert resp.success, "serialize failed" 80 | serialized = resp.body 81 | # deserialize 82 | resp = deserialize_cell(serialized, schema) 83 | assert resp.success, "deserialize failed" 84 | deserialized = resp.body 85 | 86 | # validate original and deserialized record have the same value 87 | for col in schema.columns: 88 | assert record.values[col.name] == deserialized.values[col.name] 89 | 90 | 91 | def test_nullable_serde(): 92 | """ 93 | Attempt to serialize and deserialize a schema 94 | :return: 95 | """ 96 | schema = SimpleSchema('dummy', [ 97 | Column('pkey', datatypes.Integer, is_primary_key=True), 98 | Column('name', datatypes.Text), 99 | Column('root_pagenum', datatypes.Integer), 100 | Column('sql', datatypes.Text) 101 | ]) 102 | # create a record that matches above schema 103 | record = SimpleRecord({"pkey": 1, "name": "some_table_nane", "root_pagenum": 2, "sql": None}, schema) 104 | 105 | # serialize 106 | resp = serialize_record(record) 107 | assert resp.success, "serialize failed" 108 | serialized = resp.body 109 | # deserialize 110 | resp = deserialize_cell(serialized, schema) 111 | assert resp.success, "deserialize failed" 112 | deserialized = resp.body 113 | 114 | # validate original and deserialized record have the same value 115 | for col in schema.columns: 116 | assert record.values[col.name] == deserialized.values[col.name] 117 | 118 | 119 | -------------------------------------------------------------------------------- /tests/test_constants.py: -------------------------------------------------------------------------------- 1 | TEST_DB_FILE = 'testdb.file' 2 | --------------------------------------------------------------------------------
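
A minimal usage sketch, assembled only from calls the test suites above already exercise
(learndb.interface.LearnDB, handle_input, get_pipe, Pipe.has_msgs/read, Record.get, close);
the database file name and the fruits schema here are illustrative, not files or tables from
the repository:

    from learndb.interface import LearnDB

    # nuke_db_file clears any existing file, as in the test fixtures
    db = LearnDB("example.db", nuke_db_file=True)

    db.handle_input("create table fruits ( id integer primary key, name text)")
    db.handle_input("insert into fruits (id, name) values (1, 'apple')")
    db.handle_input("insert into fruits (id, name) values (2, 'orange')")
    db.handle_input("select id, name from fruits")

    # results are streamed back as records through a pipe
    pipe = db.get_pipe()
    while pipe.has_msgs():
        record = pipe.read()
        print(record.get("id"), record.get("name"))

    db.close()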