├── .github └── workflows │ ├── build.yml │ └── publish.yml ├── .gitignore ├── Cargo.toml ├── LICENSE ├── README.md ├── sample ├── LICENSE └── sakila.db └── src ├── be_i48.rs ├── error.rs ├── lib.rs ├── model.rs ├── parser.rs └── varint.rs /.github/workflows/build.yml: -------------------------------------------------------------------------------- 1 | name: CI 2 | 3 | on: 4 | push: 5 | 6 | env: 7 | CARGO_TERM_COLOR: always 8 | 9 | jobs: 10 | 11 | check: 12 | name: Check 13 | strategy: 14 | matrix: 15 | platform: [ubuntu-latest, macos-latest, windows-latest] 16 | runs-on: ${{ matrix.platform }} 17 | steps: 18 | - uses: actions/checkout@v3 19 | - uses: actions-rs/toolchain@v1 20 | with: 21 | profile: minimal 22 | toolchain: stable 23 | override: true 24 | 25 | - name: Run cargo check 26 | uses: actions-rs/cargo@v1 27 | with: 28 | command: check 29 | 30 | test: 31 | name: Test 32 | strategy: 33 | matrix: 34 | platform: [ubuntu-latest, macos-latest, windows-latest] 35 | runs-on: ${{ matrix.platform }} 36 | steps: 37 | - uses: actions/checkout@v3 38 | - uses: actions-rs/toolchain@v1 39 | with: 40 | profile: minimal 41 | toolchain: stable 42 | override: true 43 | 44 | - name: Run cargo test 45 | uses: actions-rs/cargo@v1 46 | with: 47 | command: test 48 | 49 | lints: 50 | name: Lints 51 | runs-on: ubuntu-latest 52 | steps: 53 | - uses: actions/checkout@v3 54 | - uses: actions-rs/toolchain@v1 55 | with: 56 | profile: minimal 57 | toolchain: stable 58 | override: true 59 | components: rustfmt, clippy 60 | 61 | - name: cargo fmt 62 | uses: actions-rs/cargo@v1 63 | with: 64 | command: fmt 65 | args: --all -- --check 66 | 67 | - name: cargo clippy 68 | uses: actions-rs/cargo@v1 69 | with: 70 | command: clippy 71 | args: -- -D warnings 72 | -------------------------------------------------------------------------------- /.github/workflows/publish.yml: -------------------------------------------------------------------------------- 1 | on: 2 | push: 3 | tags: 4 | - '*' 5 | 
workflow_dispatch: 6 | 7 | name: Publish 8 | 9 | jobs: 10 | 11 | publish: 12 | name: Publish 13 | runs-on: ubuntu-latest 14 | steps: 15 | - name: checkout 16 | uses: actions/checkout@v2 17 | 18 | - name: stable toolchain 19 | uses: actions-rs/toolchain@v1 20 | with: 21 | profile: minimal 22 | toolchain: stable 23 | override: true 24 | 25 | - run: cargo publish 26 | env: 27 | CARGO_REGISTRY_TOKEN: ${{ secrets.CARGO_REGISTRY_TOKEN }} 28 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Generated by Cargo 2 | # will have compiled files and executables 3 | /target/ 4 | 5 | # Remove Cargo.lock from gitignore if creating an executable, leave it for libraries 6 | # More information here https://doc.rust-lang.org/cargo/guide/cargo-toml-vs-cargo-lock.html 7 | Cargo.lock 8 | 9 | # These are backup files generated by rustfmt 10 | **/*.rs.bk 11 | -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "sqlite-parser-nom" 3 | version = "1.0.0" 4 | authors = ["Andrew Korzhuev "] 5 | edition = "2021" 6 | description = "SQLite database file parser" 7 | repository = "https://github.com/mycelial/sqlite-parser-nom" 8 | documentation = "http://docs.rs/sqlite-parser-nom/" 9 | readme = "README.md" 10 | keywords = ["sqlite", "database", "nom", "parser"] 11 | license = "Apache-2.0" 12 | categories = ["database", "parser-implementations"] 13 | 14 | exclude = [ 15 | "/.github/*" 16 | ] 17 | 18 | [dependencies] 19 | nom = "7.1.1" 20 | thiserror = "1.0.38" 21 | memmap2 = "0.5.8" 22 | 23 | [dev-dependencies] 24 | rusqlite = { version = "0.28.0", features = ["bundled"] } 25 | tempfile = "3.3.0" -------------------------------------------------------------------------------- /LICENSE: 
-------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 
39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. 
Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. 
In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. 
We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # sqlite-parser-nom 2 | 3 | SQLite binary database format parser. 4 | 5 | Homonym libraries: 6 | 7 | - [sqlite_parser](https://crates.io/crates/sqlite_parser) is a front-end 8 | to [rusqlite](https://crates.io/crates/rusqlite) and doesn't actually do parsing. 9 | - [sqlite3-parser](https://crates.io/crates/sqlite3-parser) is parser + lexer for SQLite3-compatible SQL grammar. 
10 | 11 | ## Usage 12 | 13 | In your Cargo.toml: 14 | 15 | ```toml 16 | [dependencies] 17 | sqlite-parser-nom = "1.0.0" 18 | ``` 19 | 20 | ### Lazily parse the file 21 | Load and parse file in memory: 22 | 23 | ```rust 24 | use sqlite_parser_nom::Reader; 25 | use sqlite_parser_nom::error; 26 | 27 | fn main() -> Result<(), error::SQLiteError> { 28 | let reader = Reader::open_mmap("sample/sakila.db")?; 29 | println!("{}", reader.header.db_size); 30 | 31 | Ok(()) 32 | } 33 | ``` 34 | 35 | ### Parse a slice 36 | 37 | You can also use parsers directly 38 | 39 | ```rust 40 | use nom::Finish; 41 | use sqlite_parser_nom::parser; 42 | use sqlite_parser_nom::model; 43 | use sqlite_parser_nom::error; 44 | 45 | fn do_something_with_page(i: &[u8]) -> Result { 46 | let (_, page) = parser::page(i) 47 | .finish() 48 | // the cast is necessary here, so the error could outlive the input 49 | .map_err(|e| nom::error::Error { 50 | code: e.code, 51 | input: error::OwnedBytes(e.input.to_owned()), 52 | })?; 53 | 54 | Ok(page) 55 | } 56 | ``` 57 | 58 | Check the documentation and [parser](./src/parser.rs) to chose correct parser for your task. 59 | 60 | ## SQLite format specification 61 | 62 | References: 63 | 64 | - [Database File Format](https://www.sqlite.org/fileformat.html) - official file format guide 65 | - [Requirements for the SQLite Database File Format 66 | ](http://www.sqlite.org/draft/hlr30000.html) - detailed list of assumptions 67 | - [The Definitive Guide to SQLite](https://link.springer.com/book/10.1007/978-1-4302-3226-1) - Chapter 11, 68 | high level overview 69 | - [Mobile Forensics – The File Format Handbook](https://link.springer.com/book/10.1007/978-3-030-98467-0) - detailed 70 | description until the cell contents 71 | 72 | ### Physical structure 73 | 74 | #### Database file 75 | 76 | ```text 77 | +---+-------+-----------+-----------+-----------+ 78 | | h | | | | | 79 | | e | | | | | 80 | | a | root | page 2 | ... 
| page N | 81 | | d | page | | | | 82 | | e | | | | | 83 | | r | | | | | 84 | +---+-------+-----------+-----------+-----------+ 85 | ^ ^ ^ 86 | < page size | page size | page size | page size > 87 | ``` 88 | 89 | - The SQLite database file is divided into equally-sized pages 90 | - Page size is defined within the header 91 | - Root page includes file header, but together still fits page size 92 | - All pages, including root page, count internal offsets from the beginning of the page itself 93 | - Pages are referenced by the number, therefore their position in the binary file can be computed 94 | 95 | #### Page 96 | 97 | ```text 98 | +---------------------+ 99 | | page header | 100 | +---------------------+ 101 | | cell pointer array | 102 | +---------------------+ 103 | | | 104 | | unallocated space | 105 | | | 106 | +-------+-------------+ 107 | |cell N |free block | 108 | +-------+------+------+ 109 | |cell 5 |cell 4|cell 3| 110 | +-------+----+-+------+ 111 | | cell 2 | cell 1 | 112 | +------------+--------+ 113 | ``` 114 | 115 | Page types: 116 | 117 | - Both Index and Table pages in their Interior and Leaf flavour have the same structure, but differ in the header and 118 | some extra fields 119 | - See [models](./src/model.rs) for exact definition and BTree section for logic 120 | - Overflow page just has `0x00` in header and the rest is payload 121 | - Locking page is empty page in databases > 1gb at 1,073,741,824 offset 122 | - Pointer page exists in autovacuumed DBs and contains pointers to reorganized pages 123 | - Free blocks are stored in free-list and are not nulled, they might contain data, which was supposed to be removed 124 | 125 | Page structure: 126 | - Cell pointer array grows from the top of the page to the bottom 127 | - Pointers are byte offsets within the page 128 | - Cells grow from the bottom of the page to the top 129 | 130 | #### Cell 131 | 132 | ```text 133 | 
+-----------+--------+--------+--------+-----+--------+-----------+-----+-----------+ 134 | |Payload | | Header | Serial | | Serial | Data Cell | | Data Cell | 135 | |(Cell Size)| ... | Size | Type 1 | ... | Type N | Column 1 | ... | Column N | 136 | +-----------+--------+--------+--------+-----+--------+-----------+-----+-----------+ 137 | | | 138 | < cell header ^ record header ^ table row data > 139 | < cell size > 140 | ``` 141 | 142 | - This structure with some amendments applies to Table Leaf, Index Leaf and Interior pages 143 | - Table Interior page contains pointers to other pages and corresponding rowid 144 | - Header and auxillary values within the cell are mostly as [varint](https://sqlite.org/src4/doc/trunk/www/varint.wiki) 145 | - Serial types correspond to the data payloads and contain size information within them 146 | -------------------------------------------------------------------------------- /sample/LICENSE: -------------------------------------------------------------------------------- 1 | License: BSD 2 | Copyright DB Software Laboratory 3 | http://www.etl-tools.com 4 | 5 | -------------------------------------------------------------------------------- /sample/sakila.db: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andrusha/sqlite-parser-nom/98e57ee41d62e832a30a003c185eb30b5d2b3361/sample/sakila.db -------------------------------------------------------------------------------- /src/be_i48.rs: -------------------------------------------------------------------------------- 1 | use nom::number::complete::{be_u16, be_u32}; 2 | use nom::sequence::pair; 3 | use nom::IResult; 4 | 5 | /// Big-ending signed 48-bit two-complimentary integer 6 | pub fn be_i48(i: &[u8]) -> IResult<&[u8], i64> { 7 | let (i, (head, tail)) = pair(be_u16, be_u32)(i)?; 8 | let mut x = (head as u64) << 32 | (tail as u64); 9 | if x & 0x80_00_00_00_00_00 != 0 { 10 | x |= 0xff_ff_00_00_00_00_00_00; 11 | }; 
12 | 13 | Ok((i, x as i64)) 14 | } 15 | 16 | #[cfg(test)] 17 | mod tests { 18 | use crate::be_i48::be_i48; 19 | 20 | #[test] 21 | fn consumes_6_bytes() { 22 | let bytes = [0x11, 0x22, 0x33, 0x44, 0x55, 0x66]; 23 | let (i, res) = be_i48(&bytes).unwrap(); 24 | 25 | assert_eq!(i.len(), 0); // consumes all 26 | assert_eq!(res, 0x11_22_33_44_55_66); 27 | } 28 | 29 | #[test] 30 | fn fails_on_short_input() { 31 | let bytes = [0x11, 0x22, 0x33, 0x44, 0x55]; 32 | let res = be_i48(&bytes); 33 | 34 | assert!(res.is_err()); 35 | } 36 | 37 | #[test] 38 | fn passes_through_rest() { 39 | let bytes = [0x11, 0x22, 0x33, 0x44, 0x55, 0x66, 0x77]; 40 | let (i, _) = be_i48(&bytes).unwrap(); 41 | 42 | assert_eq!(i.len(), 1); 43 | assert_eq!(i.first().unwrap().to_owned(), 0x77); 44 | } 45 | } 46 | -------------------------------------------------------------------------------- /src/error.rs: -------------------------------------------------------------------------------- 1 | use std::fmt::{Display, Formatter}; 2 | 3 | #[derive(thiserror::Error, Debug)] 4 | pub enum SQLiteError { 5 | #[error(transparent)] 6 | IOError(#[from] std::io::Error), 7 | 8 | #[error(transparent)] 9 | ParsingError(#[from] nom::error::Error), 10 | 11 | #[error("unknown text encoding `{0}`")] 12 | UnknownTextEncodingError(u32), 13 | } 14 | 15 | /// Used so the error could outlive its input 16 | #[derive(Debug)] 17 | pub struct OwnedBytes(pub Vec); 18 | 19 | impl From> for OwnedBytes { 20 | fn from(value: Vec) -> Self { 21 | OwnedBytes(value) 22 | } 23 | } 24 | 25 | impl Display for OwnedBytes { 26 | fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { 27 | self.0.iter().fold(Ok(()), |result, byte| { 28 | result.and_then(|_| writeln!(f, "{:X} ", byte)) 29 | }) 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /src/lib.rs: -------------------------------------------------------------------------------- 1 | #![doc( 2 | issue_tracker_base_url = 
// todo: use bufreader
/// SQLite database reader over any byte source (memory map or owned buffer).
pub struct Reader<S: AsRef<[u8]>> {
    // Raw database file bytes; pages are sliced out of this on demand.
    buf: S,
    // Database header parsed eagerly from the first 100 bytes in `from_source`.
    pub header: DbHeader,
}

impl Reader<Mmap> {
    /// Open a SQLite database file by memory mapping it.
    ///
    /// # Example
    ///
    /// ```
    /// let reader = sqlite_parser_nom::Reader::open_mmap("sample/sakila.db").unwrap();
    /// ```
    pub fn open_mmap<P: AsRef<Path>>(database: P) -> Result<Reader<Mmap>, SQLiteError> {
        let file_read = File::open(database)?;
        // NOTE(review): mapping is `unsafe` because another process may mutate
        // the file while it is mapped (standard memmap2 caveat) — presumably
        // acceptable for read-only analysis; confirm with callers.
        let mmap = unsafe { MmapOptions::new().map(&file_read) }?;
        Reader::from_source(mmap)
    }
}
57 | /// 58 | /// # Example 59 | /// 60 | /// ``` 61 | /// let reader = sqlite_parser_nom::Reader::open_readfile("sample/sakila.db").unwrap(); 62 | /// ``` 63 | pub fn open_readfile>(database: P) -> Result>, SQLiteError> { 64 | use std::fs; 65 | 66 | let buf: Vec = fs::read(&database)?; 67 | Reader::from_source(buf) 68 | } 69 | } 70 | 71 | impl> Reader { 72 | /// Open a SQLite database from anything that implements AsRef<[u8]> 73 | /// 74 | /// # Example 75 | /// 76 | /// ``` 77 | /// use std::fs; 78 | /// let buf = fs::read("sample/sakila.db").unwrap(); 79 | /// let reader = sqlite_parser_nom::Reader::from_source(buf).unwrap(); 80 | /// ``` 81 | pub fn from_source(buf: S) -> Result, SQLiteError> { 82 | let (_, header) = db_header(buf.as_ref()) 83 | .finish() 84 | .map_err(|e| nom::error::Error { 85 | code: e.code, 86 | input: OwnedBytes(e.input.to_owned()), 87 | })?; 88 | 89 | let reader = Reader { buf, header }; 90 | 91 | Ok(reader) 92 | } 93 | 94 | pub fn get_page(&self, pageno: u32) -> Result { 95 | let page_size = self.header.page_size.real_size(); 96 | let pageno = pageno as usize; 97 | 98 | let page_bytes = &self.buf.as_ref()[page_size * pageno..page_size * (pageno + 1)]; 99 | let page = if pageno == 0 { 100 | root_page(page_bytes) 101 | } else { 102 | page(page_bytes) 103 | }; 104 | 105 | let (_, page) = page.finish().map_err(|e| nom::error::Error { 106 | code: e.code, 107 | input: OwnedBytes(e.input.to_owned()), 108 | })?; 109 | 110 | Ok(page) 111 | } 112 | } 113 | 114 | #[cfg(test)] 115 | mod tests { 116 | use crate::model::Page; 117 | use crate::model::SerialType::{Null, Text, I8}; 118 | use rusqlite::Connection; 119 | use tempfile::tempdir; 120 | 121 | use super::*; 122 | 123 | #[test] 124 | fn empty_db() { 125 | let dir = tempdir().unwrap(); 126 | let path = dir.path().join("empty.sqlite3"); 127 | // let path = "empty.sqlite3"; 128 | let conn = Connection::open(&path).unwrap(); 129 | conn.execute( 130 | "CREATE TABLE test (id INTEGER PRIMARY KEY, foo 
TEXT NOT NULL)", 131 | (), 132 | ) 133 | .unwrap(); 134 | conn.close().unwrap(); 135 | let reader = Reader::open_readfile(&path).unwrap(); 136 | 137 | assert_eq!(reader.header.page_size.real_size(), 4096); 138 | 139 | match reader.get_page(0).unwrap() { 140 | Page::LeafTable(p) => { 141 | assert_eq!(p.header.no_cells, 1); 142 | assert_eq!(p.cells.len(), 1); 143 | assert_eq!( 144 | p.cells.first().unwrap().payload.column_types, 145 | // type, name, tbl_name, rootpage, sql 146 | vec![Text(23), Text(21), Text(21), I8, Text(135)] 147 | ); 148 | assert_eq!( 149 | p.cells.first().unwrap().payload.column_values, 150 | vec![ 151 | Some("table".into()), 152 | Some("test".into()), 153 | Some("test".into()), 154 | Some(2i8.into()), 155 | Some( 156 | "CREATE TABLE test (id INTEGER PRIMARY KEY, foo TEXT NOT NULL)".into() 157 | ), 158 | ] 159 | ); 160 | } 161 | _ => unreachable!("root page should be table leaf page"), 162 | } 163 | 164 | match reader.get_page(1).unwrap() { 165 | Page::LeafTable(p) => { 166 | assert_eq!(p.header.no_cells, 0); 167 | assert_eq!(p.cells.len(), 0); 168 | } 169 | _ => unreachable!("second page should be leaf page"), 170 | } 171 | } 172 | 173 | #[test] 174 | fn parse_table_content() { 175 | let dir = tempdir().unwrap(); 176 | let path = dir.path().join("empty.sqlite3"); 177 | let conn = Connection::open(&path).unwrap(); 178 | conn.execute( 179 | "CREATE TABLE test (id INTEGER PRIMARY KEY, foo TEXT NOT NULL)", 180 | (), 181 | ) 182 | .unwrap(); 183 | conn.execute("INSERT INTO test VALUES (42, 'tjena tjena')", ()) 184 | .unwrap(); 185 | conn.close().unwrap(); 186 | 187 | let reader = Reader::open_mmap(&path).unwrap(); 188 | 189 | match reader.get_page(1).unwrap() { 190 | Page::LeafTable(p) => { 191 | assert_eq!(p.header.no_cells, 1); 192 | assert_eq!(p.cells.len(), 1); 193 | assert_eq!(p.cells.first().unwrap().rowid, 42); 194 | assert_eq!( 195 | p.cells.first().unwrap().payload.column_types, 196 | // type, name, tbl_name, rootpage, sql 197 | vec![Null, 
/// Page size exactly as stored in the database header.
///
/// The header field is only 16 bits wide, so SQLite encodes a
/// 65536-byte page size as the sentinel value `1`; all other values
/// are taken verbatim.
pub struct PageSize(pub u16);

impl PageSize {
    /// Decode the on-disk encoding into the actual page size in bytes.
    pub fn real_size(&self) -> usize {
        if self.0 == 1 {
            0x1_00_00
        } else {
            usize::from(self.0)
        }
    }
}
/// Cell-content-area offset exactly as stored in a page header.
///
/// The field is 16 bits wide, so an offset of 65536 (content starting
/// at the very end of a maximum-size page) is encoded as `0`; every
/// other value is used verbatim.
pub struct CellOffset(pub u16);

impl CellOffset {
    /// Decode the on-disk encoding into the real byte offset.
    pub fn real_offset(&self) -> u32 {
        if self.0 == 0 {
            0x1_00_00
        } else {
            u32::from(self.0)
        }
    }
}
column_values: Vec>>, 150 | } 151 | 152 | pub struct LeafTableCell<'a> { 153 | pub payload_size: u64, 154 | pub rowid: u64, 155 | pub payload: TableCellPayload<'a>, 156 | pub overflow_page_no: Option, 157 | } 158 | 159 | #[derive(Debug, Eq, PartialEq)] 160 | pub enum SerialType { 161 | Null, 162 | I8, 163 | I16, 164 | I24, 165 | I32, 166 | I48, 167 | I64, 168 | F64, 169 | Const0, 170 | Const1, 171 | Reserved, 172 | Blob(u64), 173 | Text(u64), 174 | } 175 | 176 | impl From for SerialType { 177 | fn from(value: u64) -> Self { 178 | use SerialType::*; 179 | match value { 180 | 0 => Null, 181 | 1 => I8, 182 | 2 => I16, 183 | 3 => I24, 184 | 4 => I32, 185 | 5 => I48, 186 | 6 => I64, 187 | 7 => F64, 188 | 8 => Const0, 189 | 9 => Const1, 190 | 10 | 11 => Reserved, 191 | n if n >= 12 && n % 2 == 0 => Blob(n), 192 | n if n >= 13 && n % 2 == 1 => Text(n), 193 | _ => unreachable!(), 194 | } 195 | } 196 | } 197 | 198 | impl SerialType { 199 | pub fn size(&self) -> usize { 200 | match self { 201 | SerialType::Null => 0, 202 | SerialType::I8 => 1, 203 | SerialType::I16 => 2, 204 | SerialType::I24 => 3, 205 | SerialType::I32 => 4, 206 | SerialType::I48 => 6, 207 | SerialType::I64 => 8, 208 | SerialType::F64 => 8, 209 | SerialType::Const0 => 0, 210 | SerialType::Const1 => 0, 211 | SerialType::Reserved => unimplemented!("reserved"), 212 | SerialType::Blob(n) => ((n - 12) / 2).try_into().unwrap(), 213 | SerialType::Text(n) => ((n - 13) / 2).try_into().unwrap(), 214 | } 215 | } 216 | } 217 | 218 | #[derive(Debug, Clone, PartialEq)] 219 | pub struct RawText<'a>(&'a [u8]); 220 | 221 | impl<'a> RawText<'a> { 222 | pub fn new(v: &'a [u8]) -> Self { 223 | RawText(v) 224 | } 225 | 226 | pub fn decode(&self, text_encoding: TextEncoding) -> String { 227 | match text_encoding { 228 | TextEncoding::Utf8 => String::from_utf8_lossy(self.0).to_string(), 229 | TextEncoding::Utf16Le => unimplemented!("utf16 not supported yet"), 230 | TextEncoding::Utf16Be => unimplemented!("utf16 not supported 
yet"), 231 | } 232 | } 233 | } 234 | 235 | impl<'a> From<&'a str> for RawText<'a> { 236 | fn from(value: &'a str) -> Self { 237 | RawText(value.as_bytes()) 238 | } 239 | } 240 | 241 | #[derive(Debug, Clone, PartialEq)] 242 | pub enum Payload<'a> { 243 | I8(i8), 244 | I16(i16), 245 | I32(i32), 246 | I64(i64), 247 | F64(f64), 248 | Blob(&'a [u8]), 249 | Text(RawText<'a>), 250 | } 251 | 252 | impl<'a> From<&'a str> for Payload<'a> { 253 | fn from(value: &'a str) -> Self { 254 | Payload::Text(value.into()) 255 | } 256 | } 257 | 258 | impl<'a> From<&'a [u8]> for Payload<'a> { 259 | fn from(value: &'a [u8]) -> Self { 260 | Payload::Blob(value) 261 | } 262 | } 263 | 264 | impl<'a> From for Payload<'a> { 265 | fn from(value: i8) -> Self { 266 | Payload::I8(value) 267 | } 268 | } 269 | 270 | impl<'a> From for Payload<'a> { 271 | fn from(value: i16) -> Self { 272 | Payload::I16(value) 273 | } 274 | } 275 | 276 | impl<'a> From for Payload<'a> { 277 | fn from(value: i32) -> Self { 278 | Payload::I32(value) 279 | } 280 | } 281 | 282 | impl<'a> From for Payload<'a> { 283 | fn from(value: i64) -> Self { 284 | Payload::I64(value) 285 | } 286 | } 287 | 288 | impl<'a> From for Payload<'a> { 289 | fn from(value: f64) -> Self { 290 | Payload::F64(value) 291 | } 292 | } 293 | 294 | #[cfg(test)] 295 | impl<'a> Eq for Payload<'a> {} 296 | -------------------------------------------------------------------------------- /src/parser.rs: -------------------------------------------------------------------------------- 1 | use crate::be_i48; 2 | use be_i48::be_i48; 3 | use nom::branch::alt; 4 | use nom::bytes::complete::tag; 5 | use nom::bytes::complete::take; 6 | use nom::combinator::{complete, map, map_parser, map_res}; 7 | use nom::multi::{count, many0}; 8 | use nom::number::complete::{be_f64, be_i16, be_i24, be_i32, be_i64, be_i8, be_u16, be_u32, be_u8}; 9 | use nom::sequence::{pair, Tuple}; 10 | use nom::IResult; 11 | 12 | use crate::model::*; 13 | use crate::varint::be_u64_varint; 14 | 
15 | const HEADER_SIZE: usize = 100; 16 | 17 | /// Goes through the whole input page-by-page 18 | /// NOTE: you should use specific parsers or Reader to parse file lazily 19 | pub fn database(i: &[u8]) -> IResult<&[u8], Database> { 20 | let (i, header) = db_header(i)?; 21 | 22 | let page_size = header.page_size.real_size(); 23 | 24 | let root_page = map_parser(take(page_size - HEADER_SIZE), page_generic(HEADER_SIZE)); 25 | let pages = complete(many0(map_parser(take(page_size), page_generic(0)))); 26 | 27 | let (i, (root_page, mut pages)) = complete(pair(root_page, pages))(i)?; 28 | 29 | pages.insert(0, root_page); 30 | 31 | Ok((i, Database { header, pages })) 32 | } 33 | 34 | /// File header parser. Page size and text encoding are required for the rest to work correctly. 35 | pub fn db_header(i: &[u8]) -> IResult<&[u8], DbHeader> { 36 | let (i, _) = tag("SQLite format 3\0")(i)?; 37 | let (i, page_size) = map(be_u16, PageSize)(i)?; 38 | let (i, (write_version, read_version)) = (be_u8, be_u8).parse(i)?; 39 | let (i, _reserved) = be_u8(i)?; 40 | let (i, (max_payload_fraction, min_payload_fraction, leaf_payload_fraction)) = 41 | (be_u8, be_u8, be_u8).parse(i)?; 42 | let (i, file_change_counter) = be_u32(i)?; 43 | let (i, db_size) = be_u32(i)?; 44 | let (i, (first_freelist_page_no, total_freelist_pages)) = (be_u32, be_u32).parse(i)?; 45 | let (i, (schema_cookie, schema_format_no)) = (be_u32, be_u32).parse(i)?; 46 | let (i, default_page_cache_size) = be_u32(i)?; 47 | let (i, no_largest_root_b_tree) = be_u32(i)?; 48 | let (i, db_text_encoding) = map_res(be_u32, |x| x.try_into())(i)?; 49 | let (i, user_version) = be_u32(i)?; 50 | let (i, incremental_vacuum_mode) = be_u32(i)?; 51 | let (i, application_id) = be_u32(i)?; 52 | let (i, _reserved) = count(be_u8, 20)(i)?; 53 | let (i, (version_valid_for_no, sqlite_version_number)) = (be_u32, be_u32).parse(i)?; 54 | 55 | Ok(( 56 | i, 57 | DbHeader { 58 | page_size, 59 | write_version, 60 | read_version, 61 | max_payload_fraction, 
62 | min_payload_fraction, 63 | leaf_payload_fraction, 64 | file_change_counter, 65 | db_size, 66 | first_freelist_page_no, 67 | total_freelist_pages, 68 | schema_cookie, 69 | schema_format_no, 70 | default_page_cache_size, 71 | no_largest_root_b_tree, 72 | db_text_encoding, 73 | user_version, 74 | incremental_vacuum_mode, 75 | application_id, 76 | version_valid_for_no, 77 | sqlite_version_number, 78 | }, 79 | )) 80 | } 81 | 82 | /// The page number 0, which comes right after the header. Input assumed to contain the header. 83 | pub fn root_page(i: &[u8]) -> IResult<&[u8], Page> { 84 | let shrunk_page = &i[HEADER_SIZE..]; 85 | page_generic(HEADER_SIZE)(shrunk_page) 86 | } 87 | 88 | /// All the rest of pages, pageno >0. 89 | pub fn page(i: &[u8]) -> IResult<&[u8], Page> { 90 | page_generic(0)(i) 91 | } 92 | 93 | // todo: fix const generic thing, hack to pass through parameters 94 | fn page_generic(page_start_offset: usize) -> impl FnMut(&[u8]) -> IResult<&[u8], Page> { 95 | move |i| { 96 | alt(( 97 | map( 98 | interior_index_b_tree_page(page_start_offset), 99 | Page::InteriorIndex, 100 | ), 101 | map(leaf_index_b_tree_page(page_start_offset), Page::LeafIndex), 102 | map( 103 | interior_table_b_tree_page(page_start_offset), 104 | Page::InteriorTable, 105 | ), 106 | map(leaf_table_b_tree_page(page_start_offset), Page::LeafTable), 107 | ))(i) 108 | } 109 | } 110 | 111 | fn interior_page_header(i: &[u8]) -> IResult<&[u8], InteriorPageHeader> { 112 | let (i, first_freeblock_offset) = map(be_u16, |u| Some(u).filter(|&p| p != 0x0u16))(i)?; 113 | let (i, no_cells) = be_u16(i)?; 114 | let (i, cell_content_offset) = map(be_u16, CellOffset)(i)?; 115 | let (i, no_fragmented_bytes) = be_u8(i)?; 116 | let (i, rightmost_pointer) = be_u32(i)?; 117 | 118 | Ok(( 119 | i, 120 | InteriorPageHeader { 121 | first_freeblock_offset, 122 | no_cells, 123 | cell_content_offset, 124 | no_fragmented_bytes, 125 | rightmost_pointer, 126 | }, 127 | )) 128 | } 129 | 130 | fn leaf_page_header(i: 
&[u8]) -> IResult<&[u8], LeafPageHeader> { 131 | let (i, first_freeblock_offset) = map(be_u16, |u| Some(u).filter(|&p| p != 0x0u16))(i)?; 132 | let (i, no_cells) = be_u16(i)?; 133 | let (i, cell_content_offset) = map(be_u16, CellOffset)(i)?; 134 | let (i, no_fragmented_bytes) = be_u8(i)?; 135 | 136 | Ok(( 137 | i, 138 | LeafPageHeader { 139 | first_freeblock_offset, 140 | no_cells, 141 | cell_content_offset, 142 | no_fragmented_bytes, 143 | }, 144 | )) 145 | } 146 | 147 | fn interior_index_b_tree_page( 148 | page_start_offset: usize, 149 | ) -> impl FnMut(&[u8]) -> IResult<&[u8], InteriorIndexPage> { 150 | move |i| { 151 | let (ii, _) = tag([0x02u8])(i)?; 152 | let (ii, header) = interior_page_header(ii)?; 153 | let (ii, cell_pointers) = count(be_u16, header.no_cells.into())(ii)?; 154 | 155 | let mut cells = Vec::with_capacity(cell_pointers.len()); 156 | for &ptr in cell_pointers.iter() { 157 | let cell_offset = ptr as usize - page_start_offset; 158 | let (_, cell) = interior_index_cell(&i[cell_offset..])?; 159 | cells.push(cell); 160 | } 161 | 162 | Ok(( 163 | ii, 164 | InteriorIndexPage { 165 | header, 166 | cell_pointers, 167 | cells, 168 | }, 169 | )) 170 | } 171 | } 172 | 173 | /// Expects to get exactly as many bytes in input as it will consume 174 | fn column_types(i: &[u8]) -> IResult<&[u8], Vec> { 175 | // many0 as header might actually be empty 176 | complete(many0(map(be_u64_varint, SerialType::from)))(i) 177 | } 178 | 179 | fn text_payload(size: usize) -> impl FnMut(&[u8]) -> IResult<&[u8], Option> { 180 | move |i| map(take(size), |x: &[u8]| Some(Payload::Text(RawText::new(x))))(i) 181 | } 182 | 183 | fn blob_payload(size: usize) -> impl FnMut(&[u8]) -> IResult<&[u8], Option> { 184 | move |i| map(take(size), |x: &[u8]| Some(Payload::Blob(x)))(i) 185 | } 186 | 187 | fn column_values<'a, 'b>( 188 | serial_types: &'b [SerialType], 189 | ) -> impl FnMut(&'a [u8]) -> IResult<&'a [u8], Vec>> + 'b { 190 | move |i| { 191 | let mut i: &[u8] = i; 192 | let mut 
res = Vec::with_capacity(serial_types.len()); 193 | for serial_type in serial_types { 194 | let (ii, v) = match serial_type { 195 | SerialType::Null => Ok((i, None)), 196 | SerialType::I8 => map(be_i8, |x| Some(Payload::I8(x)))(i), 197 | SerialType::I16 => map(be_i16, |x| Some(Payload::I16(x)))(i), 198 | SerialType::I24 => map(be_i24, |x| Some(Payload::I32(x)))(i), 199 | SerialType::I32 => map(be_i32, |x| Some(Payload::I32(x)))(i), 200 | SerialType::I48 => map(be_i48, |x| Some(Payload::I64(x)))(i), 201 | SerialType::I64 => map(be_i64, |x| Some(Payload::I64(x)))(i), 202 | SerialType::F64 => map(be_f64, |x| Some(Payload::F64(x)))(i), 203 | SerialType::Const0 => Ok((i, Some(Payload::I8(0)))), 204 | SerialType::Const1 => Ok((i, Some(Payload::I8(0)))), 205 | SerialType::Reserved => unimplemented!("reserved"), 206 | SerialType::Blob(_) if serial_type.size() == 0 => Ok((i, None)), 207 | SerialType::Blob(_) => blob_payload(serial_type.size())(i), 208 | SerialType::Text(_) if serial_type.size() == 0 => Ok((i, None)), 209 | SerialType::Text(_) => text_payload(serial_type.size())(i), 210 | }?; 211 | i = ii; 212 | dbg!(v.clone()); 213 | res.push(v); 214 | } 215 | 216 | Ok((i, res)) 217 | } 218 | } 219 | 220 | fn index_cell_payload(i: &[u8]) -> IResult<&[u8], IndexCellPayload> { 221 | let (i, header_size) = be_u64_varint(i)?; 222 | let (_, column_types) = column_types(&i[0..header_size as usize - 1])?; 223 | let (i, column_values) = column_values(&column_types)(&i[header_size as usize - 1..])?; 224 | let (i, rowid) = be_u64_varint(i)?; 225 | 226 | Ok(( 227 | i, 228 | IndexCellPayload { 229 | header_size, 230 | column_types, 231 | column_values, 232 | rowid, 233 | }, 234 | )) 235 | } 236 | 237 | fn interior_index_cell(i: &[u8]) -> IResult<&[u8], InteriorIndexCell> { 238 | let (i, left_child_page_no) = be_u32(i)?; 239 | let (i, payload_size) = be_u64_varint(i)?; 240 | let (i, payload) = index_cell_payload(i)?; 241 | 242 | Ok(( 243 | i, 244 | InteriorIndexCell { 245 | 
left_child_page_no, 246 | payload_size, 247 | payload, 248 | overflow_page_no: None, 249 | }, 250 | )) 251 | } 252 | 253 | fn interior_table_b_tree_page( 254 | page_start_offset: usize, 255 | ) -> impl FnMut(&[u8]) -> IResult<&[u8], InteriorTablePage> { 256 | move |i| { 257 | let (ii, _) = tag([0x05u8])(i)?; 258 | let (ii, header) = interior_page_header(ii)?; 259 | let (ii, cell_pointers) = count(be_u16, header.no_cells.into())(ii)?; 260 | 261 | let mut cells = Vec::with_capacity(cell_pointers.len()); 262 | for &ptr in cell_pointers.iter() { 263 | let cell_offset = ptr as usize - page_start_offset; 264 | let (_, cell) = interior_table_cell(&i[cell_offset..])?; 265 | cells.push(cell); 266 | } 267 | 268 | Ok(( 269 | ii, 270 | InteriorTablePage { 271 | header, 272 | cell_pointers, 273 | cells, 274 | }, 275 | )) 276 | } 277 | } 278 | 279 | fn interior_table_cell(i: &[u8]) -> IResult<&[u8], InteriorTableCell> { 280 | let (i, left_child_page_no) = be_u32(i)?; 281 | let (i, integer_key) = be_u64_varint(i)?; 282 | 283 | Ok(( 284 | i, 285 | InteriorTableCell { 286 | left_child_page_no, 287 | integer_key, 288 | }, 289 | )) 290 | } 291 | 292 | fn leaf_index_b_tree_page( 293 | page_start_offset: usize, 294 | ) -> impl FnMut(&[u8]) -> IResult<&[u8], LeafIndexPage> { 295 | move |i| { 296 | let (ii, _) = tag([0x0au8])(i)?; 297 | let (ii, header) = leaf_page_header(ii)?; 298 | let (ii, cell_pointers) = count(be_u16, header.no_cells.into())(ii)?; 299 | 300 | let mut cells = Vec::with_capacity(cell_pointers.len()); 301 | for &ptr in cell_pointers.iter() { 302 | let cell_offset = ptr as usize - page_start_offset; 303 | let (_, cell) = leaf_index_cell(&i[cell_offset..])?; 304 | cells.push(cell); 305 | } 306 | 307 | Ok(( 308 | ii, 309 | LeafIndexPage { 310 | header, 311 | cell_pointers, 312 | cells, 313 | }, 314 | )) 315 | } 316 | } 317 | 318 | fn leaf_index_cell(i: &[u8]) -> IResult<&[u8], LeafIndexCell> { 319 | let (i, payload_size) = be_u64_varint(i)?; 320 | let (i, payload) = 
index_cell_payload(i)?; 321 | 322 | Ok(( 323 | i, 324 | LeafIndexCell { 325 | payload_size, 326 | payload, 327 | overflow_page_no: None, 328 | }, 329 | )) 330 | } 331 | 332 | fn leaf_table_b_tree_page( 333 | page_start_offset: usize, 334 | ) -> impl FnMut(&[u8]) -> IResult<&[u8], LeafTablePage> { 335 | move |i| { 336 | let (ii, _) = tag([0x0du8])(i)?; 337 | let (ii, header) = leaf_page_header(ii)?; 338 | let (ii, cell_pointers) = count(be_u16, header.no_cells.into())(ii)?; 339 | 340 | let mut cells = Vec::with_capacity(cell_pointers.len()); 341 | for &ptr in cell_pointers.iter() { 342 | let cell_offset = ptr as usize - page_start_offset; 343 | let (_, cell) = leaf_table_cell(&i[cell_offset..])?; 344 | cells.push(cell); 345 | } 346 | 347 | Ok(( 348 | ii, 349 | LeafTablePage { 350 | header, 351 | cell_pointers, 352 | cells, 353 | }, 354 | )) 355 | } 356 | } 357 | 358 | fn table_cell_payload(i: &[u8]) -> IResult<&[u8], TableCellPayload> { 359 | let (i, header_size) = be_u64_varint(i)?; 360 | let (_, column_types) = column_types(&i[0..header_size as usize - 1])?; 361 | let (i, column_values) = column_values(&column_types)(&i[header_size as usize - 1..])?; 362 | 363 | Ok(( 364 | i, 365 | TableCellPayload { 366 | header_size, 367 | column_types, 368 | column_values, 369 | }, 370 | )) 371 | } 372 | 373 | fn leaf_table_cell(i: &[u8]) -> IResult<&[u8], LeafTableCell> { 374 | let (i, payload_size) = be_u64_varint(i)?; 375 | let (i, rowid) = be_u64_varint(i)?; 376 | let (i, payload) = table_cell_payload(i)?; 377 | 378 | Ok(( 379 | i, 380 | LeafTableCell { 381 | payload_size, 382 | rowid, 383 | payload, 384 | overflow_page_no: None, 385 | }, 386 | )) 387 | } 388 | -------------------------------------------------------------------------------- /src/varint.rs: -------------------------------------------------------------------------------- 1 | use nom::error::{ErrorKind, ParseError}; 2 | use nom::Err; 3 | use nom::IResult; 4 | 5 | /// Big-endian unsigned varint (huffman coding) 
implementation. 6 | /// 7 | /// Most-significant bit is used as a flag if next byte should taken. 8 | /// It is discarded and the rest are concatenated into resulting integer. 9 | pub fn be_u64_varint(i: &[u8]) -> IResult<&[u8], u64> { 10 | let mut res = 0; 11 | // to guard from overflow 12 | let max_slice = &i[0..(i.len().min(5))]; 13 | for (id, &b) in max_slice.iter().enumerate() { 14 | let b = b as u64; 15 | res = (res << 7) | (b & 0b0111_1111); 16 | 17 | if b >> 7 == 0 { 18 | return Ok((&i[id + 1..], res)); 19 | } 20 | } 21 | 22 | Err(Err::Error(ParseError::from_error_kind( 23 | i, 24 | ErrorKind::MapOpt, 25 | ))) 26 | } 27 | 28 | #[cfg(test)] 29 | mod tests { 30 | use crate::varint::be_u64_varint; 31 | 32 | #[test] 33 | fn parse_1_byte() { 34 | let varint = [0b0000_1111]; 35 | let (i, res) = be_u64_varint(&varint).unwrap(); 36 | 37 | assert!(i.is_empty()); 38 | assert_eq!(res, 0b0000_1111); 39 | } 40 | 41 | #[test] 42 | fn parse_2_byte() { 43 | let varint = [0b1000_1111, 0b0000_1011]; 44 | let (i, res) = be_u64_varint(&varint).unwrap(); 45 | 46 | assert!(i.is_empty()); 47 | assert_eq!(res, 0b1111_000_1011); 48 | } 49 | 50 | #[test] 51 | fn parse_3_byte() { 52 | let varint = [0b1000_1111, 0b1000_1101, 0b0000_1011]; 53 | let (i, res) = be_u64_varint(&varint).unwrap(); 54 | 55 | assert!(i.is_empty()); 56 | assert_eq!(res, 0b1111_000_1101_000_1011); 57 | } 58 | 59 | #[test] 60 | fn parse_4_byte() { 61 | let varint = [0b1000_1111, 0b1000_0111, 0b1000_1101, 0b0000_1011]; 62 | let (i, res) = be_u64_varint(&varint).unwrap(); 63 | 64 | assert!(i.is_empty()); 65 | assert_eq!(res, 0b1111_000_0111_000_1101_000_1011); 66 | } 67 | 68 | #[test] 69 | fn parse_5_byte() { 70 | let varint = [ 71 | 0b1000_1111, 72 | 0b1000_1110, 73 | 0b1000_0111, 74 | 0b1000_1101, 75 | 0b0000_1011, 76 | ]; 77 | let (i, res) = be_u64_varint(&varint).unwrap(); 78 | 79 | assert!(i.is_empty()); 80 | assert_eq!(res, 0b1111_000_1110_000_0111_000_1101_000_1011); 81 | } 82 | 83 | #[test] 84 | fn 
ignore_rest() { 85 | let varint = [ 86 | 0b1000_1111, 87 | 0b1000_1110, 88 | 0b1000_0111, 89 | 0b1000_1101, 90 | 0b0000_1011, 91 | 0b0, 92 | ]; 93 | let (i, _) = be_u64_varint(&varint).unwrap(); 94 | 95 | assert_eq!(i.len(), 1); 96 | } 97 | } 98 | --------------------------------------------------------------------------------