├── Cargo.toml ├── LICENSE ├── README.md ├── run-tests.sh ├── src ├── lib.rs └── storage.rs └── tests └── storage_spec.rs /Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "yakv" 3 | version = "0.2.0" 4 | authors = ["Konstantin Knizhnik "] 5 | edition = "2018" 6 | license = "MIT OR Apache-2.0" 7 | description = "Simple persistent key-value storage based on B-Tree" 8 | repository = "https://github.com/knizhnik/yakv.git" 9 | readme = "README.md" 10 | 11 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html 12 | 13 | [dependencies] 14 | crc32c = "0.6.0" 15 | fs2 = "0.4.3" 16 | anyhow = "1.0" 17 | 18 | [dev-dependencies] 19 | rand = "0.8.3" 20 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 
47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | **YAKV** is a very simple persistent key-value storage implemented in Rust 2 | using a "traditional" architecture: B-Tree, buffer cache, ACID transactions, write-ahead log. 3 | **YAKV** implements a simple MURSIW (multiple-readers-single-writer) access pattern 4 | and is primarily oriented toward embedded applications.
5 | 6 | It has minimal dependencies on other crates and contains just 2k lines of code. 7 | The storage API is very simple: `put/remove` methods for updating information 8 | and `get/iter/range` methods for retrieving it. 9 | `put` performs an update or insert (upsert): if the key is not present in the storage, it is inserted; 10 | otherwise the associated value is updated. Bulk versions of `put/remove` are available which accept iterators 11 | of pairs/keys. Bulk updates are atomic: either all operations succeed or all are rejected. 12 | 13 | Iteration can be done using a bidirectional (_double-ended_) iterator and standard Rust ranges, which 14 | make it easy to specify ranges with open/inclusive/exclusive boundaries. Iterators are not atomic: 15 | during iteration you see the most recent committed updates of the storage. Moreover, 16 | if a concurrent update deletes the key at the current iterator position, iteration will stop before processing all results. 17 | 18 | Another way of grouping operations is to explicitly start a transaction. 19 | A transaction has exclusive access to the database and can perform both update and lookup operations. 20 | At the end the transaction should be explicitly committed or aborted; if it was not committed before leaving the scope, 21 | it is implicitly aborted. 22 | 23 | **YAKV** supports multi-threaded access to the storage. All threads can share a single storage instance (wrap it in an `Arc`). 24 | The storage implementation is thread-safe and all methods are immutable (they take `&self`), so you can call them concurrently from different threads. 25 | But only one thread can update the database at any moment: other readers and writers will be blocked until the end of the transaction. 26 | Readers can work in parallel. 27 | 28 | **YAKV** requires keys and values to be vectors of bytes. If you want to store other types, you need to serialize them first. 29 | If you need to preserve the natural comparison order of the underlying type, you will have to use a proper serializer. 30 | For example, for unsigned integer types you need to use _big-endian_ encoding (i.e. `key.to_be_bytes()`) 31 | so that byte-wise comparison of the vectors produces the same result as comparison of the two numbers. 32 | For signed or floating point types writing such a serializer may require more effort. 33 | 34 | **YAKV** optionally maintains a write-ahead log (WAL) to provide ACID guarantees. 35 | Maintaining the WAL requires `fsync` system calls to force persisting data to non-volatile media. 36 | This adds a significant performance penalty, especially for small transactions (e.g. inserting just one pair). 37 | But without the WAL the database can be corrupted in case of abnormal program termination or power failure. 38 | To disable the WAL just pass `None` instead of the WAL file path.
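As a quick illustration of the points above (sharing one storage instance between threads via `Arc`, big-endian integer keys and the optional WAL), here is a minimal sketch; it assumes the `Storage::open`/`put`/`get` API shown in the full example below, and the paths and worker count are just placeholders:

```
use std::sync::Arc;
use std::thread;

// Pass None instead of Some(log_path) to run without a WAL.
let store = Arc::new(Storage::open(data_path, Some(log_path), StorageConfig::default())?);

let mut workers = Vec::new();
for id in 0..4u64 {
    let store = Arc::clone(&store);
    workers.push(thread::spawn(move || {
        // Big-endian encoding makes byte-wise key order match numeric order.
        let key = id.to_be_bytes().to_vec();
        let value = format!("value-{}", id).into_bytes();
        store.put(key, value).unwrap();
    }));
}
for w in workers {
    w.join().unwrap();
}

// Any thread (here the main one) can read the data back.
let key = 2u64.to_be_bytes().to_vec();
if let Some(value) = store.get(&key)? {
    println!("key={:?}, value={:?}", &key, &value);
}
```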
39 | 40 | Below is an example of **YAKV** usage: 41 | 42 | ``` 43 | let store = Storage::open(data_path, Some(log_path), StorageConfig::default())?; 44 | 45 | // Simple insert/update: 46 | let key = b"Some key".to_vec(); 47 | let value = b"Some value".to_vec(); 48 | store.put(key, value)?; 49 | 50 | // Simple remove: 51 | let key = b"Some key".to_vec(); 52 | store.remove(key)?; 53 | 54 | // Bulk update 55 | store.put_all( 56 | &mut iter::repeat_with(|| { 57 | let key = rand.gen::<[u8; KEY_LEN]>().to_vec(); 58 | let value = rand.gen::<[u8; VALUE_LEN]>().to_vec(); 59 | Ok((key, value)) 60 | } ).take(TRANSACTION_SIZE))?; 61 | 62 | // Bulk delete 63 | store.remove_all( 64 | &mut iter::repeat_with(|| Ok(rand.gen::<[u8; 8]>().to_vec())) 65 | .take(TRANSACTION_SIZE))?; 66 | 67 | // Explicit transaction: 68 | { 69 | let trans = store.start_transaction(); 70 | trans.put(&1u64.to_be_bytes().to_vec(), &PAYLOAD)?; 71 | trans.put(&2u64.to_be_bytes().to_vec(), &PAYLOAD)?; 72 | trans.remove(&2u64.to_be_bytes().to_vec())?; 73 | trans.commit()?; 74 | } 75 | 76 | // Simple lookup 77 | let key = b"Some key".to_vec(); 78 | if let Some(value) = store.get(&key)? { 79 | println!("key={:?}, value={:?}", &key, &value); 80 | } 81 | 82 | // Iterate through all records: 83 | for entry in store.iter() { 84 | let kv = entry?; 85 | println!("key={:?}, value={:?}", &kv.0, &kv.1); 86 | } 87 | 88 | // Range iterator: 89 | let from_key = b"AAA".to_vec(); 90 | let till_key = b"ZZZ".to_vec(); 91 | for entry in store.range(from_key..till_key) { 92 | let kv = entry?; 93 | println!("key={:?}, value={:?}", &kv.0, &kv.1); 94 | } 95 | 96 | // Backward iteration: 97 | let till_key = b"XYZ".to_vec(); 98 | let mut it = store.range(..=till_key); 99 | while let Some(entry) = it.next_back() { 100 | let kv = entry?; 101 | println!("key={:?}, value={:?}", &kv.0, &kv.1); 102 | } 103 | 104 | // Close storage 105 | store.close()?; 106 | ``` 107 | 108 | Performance comparison: 109 | 110 | Below are results (msec: smaller is better) of two benchmarks. 111 | 112 | SwayDB benchmark: insertion of 1M records with 8-byte keys and 8-byte values. 113 | 114 | | db | seq | rnd | 115 | | ------- | ----- | ----- | 116 | | SwayDB | 5526 | 14849 | 117 | | LevelDB | 1107 | 7969 | 118 | | yakv | 594 | 1263 | 119 | 120 | 121 | LMDB benchmark: insert+read of 1M records with 4-byte keys and 100-byte values. 122 | 123 | | db | seq-write | rnd-write | seq-read | rnd-read | 124 | | --------- | --------- | --------- | -------- | -------- | 125 | | Chronicle | 836 | 894 | 613 | 634 | 126 | | LevelDB | 1962 | 2089 | 2223 | 2476 | 127 | | LMDB | 144 | 896 | 108 | 634 | 128 | | MapDB | 8876 | 9304 | 9676 | 9707 | 129 | | MVStore | 1328 | 1675 | 7757 | 8420 | 130 | | RocksDB | 1800 | 1850 | 1814 | 2067 | 131 | | Xodus | 674 | 13981 | 3486 | 4978 | 132 | | kv | 5626 | 7546 | 742 | 1943 | 133 | | yakv | 1079 | 1617 | 549 | 1020 | 134 | 135 | 136 | Performance dependency on transaction size (LMDB vs. YAKV, or COW vs. WAL).
137 | This benchmark inserts 1M random keys (as in the LMDB benchmark), 138 | but the inserts are grouped into transactions (time in msec): 139 | 140 | | tx size | yakv | LMDB | 141 | | ------- | ------ | ------ | 142 | | 1000000 | 1543 | 1367 | 143 | | 100000 | 3914 | 3022 | 144 | | 10000 | 16384 | 8139 | 145 | | 1000 | 30944 | 16881 | 146 | | 100 | 85268 | 70775 | 147 | | 10 | 192179 | 229538 | 148 | 149 | So for very large transactions LMDB is slightly faster, for very small transactions YAKV is faster, 150 | and for medium-sized transactions LMDB is about two times faster than YAKV. 151 | -------------------------------------------------------------------------------- /run-tests.sh: -------------------------------------------------------------------------------- 1 | cargo +nightly test --release -v -- --nocapture --test-threads=1 2 | -------------------------------------------------------------------------------- /src/lib.rs: -------------------------------------------------------------------------------- 1 | pub mod storage; 2 | -------------------------------------------------------------------------------- /src/storage.rs: -------------------------------------------------------------------------------- 1 | use anyhow::{ensure, Result}; 2 | use crc32c::*; 3 | use fs2::FileExt; 4 | use std::cmp::Ordering; 5 | use std::convert::TryInto; 6 | use std::fs::{File, OpenOptions}; 7 | use std::iter; 8 | use std::ops::Bound::*; 9 | use std::ops::{Bound, RangeBounds}; 10 | use std::os::unix::prelude::FileExt as UnixFileExt; 11 | use std::path::Path; 12 | use std::sync::{Condvar, Mutex, RwLock, RwLockWriteGuard}; 13 | 14 | type PageId = u32; // address of page in the file 15 | type BufferId = u32; // index of page in buffer cache 16 | type LSN = u64; // logical serial number: monotonically increasing counter of database state changes 17 | type ItemPointer = usize; // offset within page; actually only 16 bits are enough, but use usize to avoid type casts when used as an index 18 | 19 | /// 20 | /// Storage key type. If you want to use some other type as a key, you will have to serialize it (for example using serde). 21 | /// Since vectors are compared using byte-by-byte comparison, you need to take this into account during serialization if 22 | /// you need to preserve the order of the original type. For example, unsigned integer types should be serialized as big-endian (most significant byte first). 23 | /// 24 | pub type Key = Vec<u8>; 25 | 26 | /// 27 | /// Storage value type. All other types should be serialized to a vector of bytes.
28 | /// 29 | pub type Value = Vec<u8>; 30 | 31 | const PAGE_SIZE: usize = 8192; 32 | const MAGIC: u32 = 0xBACE2021; 33 | const VERSION: u32 = 1; 34 | const METADATA_SIZE: usize = 7 * 4; 35 | const PAGE_HEADER_SIZE: usize = 2; // now page header contains just the number of items in the page 36 | const MAX_VALUE_LEN: usize = PAGE_SIZE / 4; // assume that a page may fit at least 3 items 37 | const MAX_KEY_LEN: usize = u8::MAX as usize; // should fit in one byte 38 | const N_BUSY_EVENTS: usize = 8; // number of condition variables used to wait for read completion 39 | 40 | // Flags for page state 41 | const PAGE_RAW: u16 = 1; // buffer content is uninitialized 42 | const PAGE_BUSY: u16 = 2; // buffer is being loaded from the disk 43 | const PAGE_DIRTY: u16 = 4; // buffer was updated 44 | const PAGE_WAIT: u16 = 8; // some thread waits until buffer is loaded 45 | const PAGE_SYNCED: u16 = 16; // dirty page was saved to the log 46 | 47 | enum LookupOp<'a> { 48 | First, 49 | Last, 50 | Next, 51 | Prev, 52 | GreaterOrEqual(&'a Key), 53 | } 54 | 55 | #[derive(PartialEq, Copy, Clone, Debug)] 56 | pub enum DatabaseState { 57 | InRecovery, 58 | Opened, 59 | Closed, 60 | Corrupted, 61 | } 62 | 63 | /// 64 | /// Storage configuration parameters 65 | /// 66 | #[derive(Copy, Clone, Debug)] 67 | pub struct StorageConfig { 68 | /// Buffer pool size (pages) 69 | pub cache_size: usize, 70 | /// Maximal size of WAL. When it is reached, the database file is synced and the WAL is rotated 71 | /// (writing starts from the beginning) 72 | pub checkpoint_interval: u64, 73 | /// Threshold for flushing dirty pages to WAL (to reduce commit time) 74 | pub wal_flush_threshold: BufferId, 75 | } 76 | 77 | impl StorageConfig { 78 | pub fn default() -> StorageConfig { 79 | StorageConfig { 80 | cache_size: 128 * 1024, // 1Gb 81 | checkpoint_interval: 1u64 * 1024 * 1024 * 1024, // 1Gb 82 | wal_flush_threshold: BufferId::MAX, 83 | } 84 | } 85 | } 86 | 87 | #[derive(PartialEq)] 88 | enum AccessMode { 89 | ReadOnly, 90 | WriteOnly, 91 | } 92 | 93 | /// 94 | /// Status of transaction 95 | /// 96 | #[derive(PartialEq)] 97 | pub enum TransactionStatus { 98 | InProgress, 99 | Committed, 100 | Aborted, 101 | } 102 | 103 | /// 104 | /// Explicitly started transaction. Storage can be updated in autocommit mode 105 | /// or using an explicitly started transaction. 106 | /// 107 | pub struct Transaction<'a> { 108 | pub status: TransactionStatus, 109 | storage: &'a Storage, 110 | db: RwLockWriteGuard<'a, Database>, 111 | } 112 | 113 | /// 114 | /// Status of automatic database recovery after open. 115 | /// In case of a start after normal shutdown all fields should be zero.
116 | /// 117 | #[derive(Clone, Copy, Default)] 118 | pub struct RecoveryStatus { 119 | /// number of recovered transactions 120 | pub recovered_transactions: u64, 121 | /// size of WAL at the moment of recovery 122 | pub wal_size: u64, 123 | /// position of last recovery transaction in WAL 124 | pub recovery_end: u64, 125 | } 126 | 127 | /// 128 | /// Database info 129 | /// 130 | #[derive(Clone, Copy, Debug)] 131 | pub struct DatabaseInfo { 132 | /// Height of B-Tree 133 | pub tree_height: usize, 134 | /// Size of database file 135 | pub db_size: u64, 136 | /// Total size of used pages 137 | pub db_used: u64, 138 | /// Current WAL size 139 | pub log_size: u64, 140 | /// number of committed transactions in this session 141 | pub n_committed_transactions: u64, 142 | /// number of aborted transactions in this session 143 | pub n_aborted_transactions: u64, 144 | /// State of the database 145 | pub state: DatabaseState, 146 | } 147 | 148 | /// 149 | /// Buffer cache info 150 | /// 151 | #[derive(Clone, Copy, Debug)] 152 | pub struct CacheInfo { 153 | /// Number of pinned pages in buffer cache 154 | pub pinned: usize, 155 | /// Number of dirty pages in buffer cache 156 | pub dirtied: usize, 157 | /// Total number of pages in buffer cache 158 | pub used: usize, 159 | } 160 | 161 | /// 162 | /// Abstract storage bidirectional iterator 163 | /// 164 | pub struct StorageIterator<'a> { 165 | storage: &'a Storage, 166 | trans: Option<&'a Transaction<'a>>, 167 | from: Bound, 168 | till: Bound, 169 | left: TreePath, 170 | right: TreePath, 171 | } 172 | 173 | // 174 | // Position in the page saved during tree traversal 175 | // 176 | struct PagePos { 177 | pid: PageId, 178 | pos: usize, 179 | } 180 | 181 | // 182 | // Path in B-Tree to the current iterator's element 183 | // 184 | struct TreePath { 185 | curr: Option<(Key, Value)>, // current (key,value) pair if any 186 | result: Option>, 187 | stack: Vec, // stack of positions in B-Tree 188 | lsn: LSN, // LSN of last operation 189 | } 190 | 191 | impl TreePath { 192 | fn new() -> TreePath { 193 | TreePath { 194 | curr: None, 195 | result: None, 196 | stack: Vec::new(), 197 | lsn: 0, 198 | } 199 | } 200 | } 201 | 202 | impl<'a> StorageIterator<'a> { 203 | fn next_locked(&mut self, db: &Database) -> Option< as Iterator>::Item> { 204 | if self.left.stack.len() == 0 { 205 | match &self.from { 206 | Bound::Included(key) => { 207 | self.storage 208 | .lookup(db, LookupOp::GreaterOrEqual(key), &mut self.left) 209 | } 210 | Bound::Excluded(key) => { 211 | self.storage 212 | .lookup(db, LookupOp::GreaterOrEqual(key), &mut self.left); 213 | if let Some((curr_key, _value)) = &self.left.curr { 214 | if curr_key == key { 215 | self.storage.lookup(db, LookupOp::Next, &mut self.left); 216 | } 217 | } 218 | } 219 | Bound::Unbounded => self.storage.lookup(db, LookupOp::First, &mut self.left), 220 | } 221 | } else { 222 | self.storage.lookup(db, LookupOp::Next, &mut self.left); 223 | } 224 | if let Some((curr_key, _value)) = &self.left.curr { 225 | match &self.till { 226 | Bound::Included(key) => { 227 | if curr_key > key { 228 | return None; 229 | } 230 | } 231 | Bound::Excluded(key) => { 232 | if curr_key >= key { 233 | return None; 234 | } 235 | } 236 | Bound::Unbounded => {} 237 | } 238 | } 239 | self.left.result.take() 240 | } 241 | 242 | fn next_back_locked( 243 | &mut self, 244 | db: &Database, 245 | ) -> Option< as Iterator>::Item> { 246 | if self.right.stack.len() == 0 { 247 | match &self.till { 248 | Bound::Included(key) => { 249 | self.storage 250 | 
.lookup(db, LookupOp::GreaterOrEqual(key), &mut self.right); 251 | if let Some((curr_key, _value)) = &self.right.curr { 252 | if curr_key > key { 253 | self.storage.lookup(db, LookupOp::Prev, &mut self.right); 254 | } 255 | } else { 256 | self.storage.lookup(db, LookupOp::Last, &mut self.right); 257 | } 258 | } 259 | Bound::Excluded(key) => { 260 | self.storage 261 | .lookup(db, LookupOp::GreaterOrEqual(key), &mut self.right); 262 | if let Some((curr_key, _value)) = &self.right.curr { 263 | if curr_key >= key { 264 | self.storage.lookup(db, LookupOp::Prev, &mut self.right); 265 | } 266 | } else { 267 | self.storage.lookup(db, LookupOp::Last, &mut self.right); 268 | } 269 | } 270 | Bound::Unbounded => self.storage.lookup(db, LookupOp::Last, &mut self.right), 271 | } 272 | } else { 273 | self.storage.lookup(db, LookupOp::Prev, &mut self.right); 274 | } 275 | if let Some((curr_key, _value)) = &self.right.curr { 276 | match &self.from { 277 | Bound::Included(key) => { 278 | if curr_key < key { 279 | return None; 280 | } 281 | } 282 | Bound::Excluded(key) => { 283 | if curr_key <= key { 284 | return None; 285 | } 286 | } 287 | Bound::Unbounded => {} 288 | } 289 | } 290 | self.right.result.take() 291 | } 292 | } 293 | 294 | impl<'a> Iterator for StorageIterator<'a> { 295 | type Item = Result<(Key, Value)>; 296 | 297 | fn next(&mut self) -> Option { 298 | if let Some(trans) = self.trans { 299 | assert!(trans.status == TransactionStatus::InProgress); 300 | self.next_locked(&trans.db) 301 | } else { 302 | let db = self.storage.db.read().unwrap(); 303 | self.next_locked(&db) 304 | } 305 | } 306 | } 307 | 308 | impl<'a> DoubleEndedIterator for StorageIterator<'a> { 309 | fn next_back(&mut self) -> Option { 310 | if let Some(trans) = self.trans { 311 | assert!(trans.status == TransactionStatus::InProgress); 312 | self.next_back_locked(&trans.db) 313 | } else { 314 | let db = self.storage.db.read().unwrap(); 315 | self.next_back_locked(&db) 316 | } 317 | } 318 | } 319 | 320 | /// 321 | /// Persistent key-value storage implementation 322 | /// Update operations are atomic, select operations are non-atomic and observe most recent database state. 323 | /// 324 | pub struct Storage { 325 | db: RwLock, 326 | buf_mgr: Mutex, 327 | busy_events: [Condvar; N_BUSY_EVENTS], 328 | pool: Vec>, 329 | conf: StorageConfig, 330 | file: File, 331 | log: Option, 332 | } 333 | 334 | // 335 | // Page header in buffer manager 336 | // 337 | #[derive(Clone, Copy, Default)] 338 | struct PageHeader { 339 | pid: PageId, 340 | collision: BufferId, // collision chain 341 | // LRU l2-list 342 | next: BufferId, 343 | prev: BufferId, 344 | access_count: u16, 345 | state: u16, // bitmask of PAGE_RAW, PAGE_DIRTY, ... 
346 | } 347 | 348 | impl PageHeader { 349 | fn new() -> PageHeader { 350 | Default::default() 351 | } 352 | } 353 | 354 | // 355 | // Database metadata 356 | // 357 | #[derive(Copy, Clone)] 358 | struct Metadata { 359 | magic: u32, // storage magic 360 | version: u32, // storage format version 361 | free: PageId, // L1 list of free pages 362 | size: PageId, // size of database (pages) 363 | used: PageId, // number of used database pages 364 | root: PageId, // B-Tree root page 365 | height: u32, // height of B-Tree 366 | } 367 | 368 | impl Metadata { 369 | fn pack(self) -> [u8; METADATA_SIZE] { 370 | unsafe { std::mem::transmute::(self) } 371 | } 372 | fn unpack(buf: &[u8]) -> Metadata { 373 | unsafe { 374 | std::mem::transmute::<[u8; METADATA_SIZE], Metadata>( 375 | buf[0..METADATA_SIZE].try_into().unwrap(), 376 | ) 377 | } 378 | } 379 | } 380 | 381 | // 382 | // Database shared state 383 | // 384 | struct Database { 385 | meta: Metadata, // cached metadata (stored in root page) 386 | meta_updated: bool, // whether metadata was updated 387 | lsn: LSN, // database modification counter 388 | n_aborted_txns: LSN, // number of aborted transactions 389 | state: DatabaseState, // database state 390 | wal_pos: u64, // current position in log file 391 | tx_crc: u32, // accumulated CRC of the current transaction 392 | tx_size: usize, // current transaction size 393 | recovery: RecoveryStatus, // status of recovery 394 | } 395 | 396 | impl Database { 397 | fn get_info(&self) -> DatabaseInfo { 398 | DatabaseInfo { 399 | db_size: self.meta.size as u64 * PAGE_SIZE as u64, 400 | db_used: self.meta.used as u64 * PAGE_SIZE as u64, 401 | tree_height: self.meta.height as usize, 402 | log_size: self.wal_pos, 403 | state: self.state, 404 | n_committed_transactions: self.lsn, 405 | n_aborted_transactions: self.n_aborted_txns, 406 | } 407 | } 408 | } 409 | 410 | // 411 | // Buffer manager is using L2-list for LRU cache eviction policy, 412 | // L1 lists for free and dirty pages. 413 | // All modified pages are pinned till the end of transaction. 414 | // Indexes are used instead of pointers to reduce memory footprint and bypass Rust ownership/visibility rules. 415 | // Page with index 0 is reserved in buffer manager for root page. It is not included in any list, so 0 is treated as terminator. 
416 | // 417 | struct BufferManager { 418 | // LRU l2-list 419 | head: BufferId, 420 | tail: BufferId, 421 | 422 | free_pages: BufferId, // L1-list of free pages 423 | dirty_pages: BufferId, // L2-list of dirty pages 424 | next_sync: BufferId, // next page to be written to WAL 425 | 426 | used: BufferId, // used part of page pool 427 | pinned: BufferId, // amount of pinned pages 428 | dirtied: BufferId, // amount of dirty pages 429 | cached: BufferId, // amount of cached pages 430 | 431 | hash_table: Vec, // array containing indexes of collision chains 432 | pages: Vec, // page data 433 | } 434 | 435 | // 436 | // Wrapper class for accessing page data 437 | // 438 | struct PageData { 439 | data: [u8; PAGE_SIZE], 440 | } 441 | 442 | impl PageData { 443 | fn new() -> PageData { 444 | PageData { 445 | data: [0u8; PAGE_SIZE], 446 | } 447 | } 448 | } 449 | 450 | impl PageData { 451 | fn get_offs(&self, ip: ItemPointer) -> usize { 452 | self.get_u16(PAGE_HEADER_SIZE + ip * 2) as usize 453 | } 454 | 455 | fn set_offs(&mut self, ip: ItemPointer, offs: usize) { 456 | self.set_u16(PAGE_HEADER_SIZE + ip * 2, offs as u16) 457 | } 458 | 459 | fn get_child(&self, ip: ItemPointer) -> PageId { 460 | let offs = self.get_offs(ip); 461 | let key_len = self.data[offs] as usize; 462 | self.get_u32(offs + key_len + 1) 463 | } 464 | 465 | fn get_key(&self, ip: ItemPointer) -> Key { 466 | let offs = self.get_offs(ip); 467 | let key_len = self.data[offs] as usize; 468 | self.data[offs + 1..offs + 1 + key_len].to_vec() 469 | } 470 | 471 | fn get_last_key(&self) -> Key { 472 | let n_items = self.get_n_items(); 473 | self.get_key(n_items - 1) 474 | } 475 | 476 | fn get_item(&self, ip: ItemPointer) -> (Key, Value) { 477 | let (item_offs, item_len) = self.get_item_offs_len(ip); 478 | let key_len = self.data[item_offs] as usize; 479 | ( 480 | self.data[item_offs + 1..item_offs + 1 + key_len].to_vec(), 481 | self.data[item_offs + 1 + key_len..item_offs + item_len].to_vec(), 482 | ) 483 | } 484 | 485 | fn get_item_offs_len(&self, ip: ItemPointer) -> (usize, usize) { 486 | let offs = self.get_offs(ip); 487 | let next_offs = if ip == 0 { 488 | PAGE_SIZE 489 | } else { 490 | self.get_offs(ip - 1) 491 | }; 492 | debug_assert!(next_offs > offs); 493 | (offs, next_offs - offs) 494 | } 495 | 496 | fn set_u16(&mut self, offs: usize, data: u16) { 497 | self.copy(offs, &data.to_be_bytes()); 498 | } 499 | 500 | fn set_u32(&mut self, offs: usize, data: u32) { 501 | self.copy(offs, &data.to_be_bytes()); 502 | } 503 | 504 | fn get_u16(&self, offs: usize) -> u16 { 505 | u16::from_be_bytes(self.data[offs..offs + 2].try_into().unwrap()) 506 | } 507 | 508 | fn get_u32(&self, offs: usize) -> u32 { 509 | u32::from_be_bytes(self.data[offs..offs + 4].try_into().unwrap()) 510 | } 511 | 512 | fn get_n_items(&self) -> ItemPointer { 513 | self.get_u16(0) as ItemPointer 514 | } 515 | 516 | fn get_size(&self) -> ItemPointer { 517 | let n_items = self.get_n_items(); 518 | if n_items == 0 { 519 | 0 520 | } else { 521 | PAGE_SIZE - self.get_offs(n_items - 1) 522 | } 523 | } 524 | 525 | fn set_n_items(&mut self, n_items: ItemPointer) { 526 | self.set_u16(0, n_items as u16) 527 | } 528 | 529 | fn copy(&mut self, offs: usize, data: &[u8]) { 530 | let len = data.len(); 531 | self.data[offs..offs + len].copy_from_slice(&data); 532 | } 533 | 534 | fn compare_key(&self, ip: ItemPointer, key: &Key) -> Ordering { 535 | let offs = self.get_offs(ip); 536 | let key_len = self.data[offs] as usize; 537 | if key_len == 0 { 538 | // special handling of +inf in 
right-most internal nodes 539 | Ordering::Less 540 | } else { 541 | key[..].cmp(&self.data[offs + 1..offs + 1 + key_len]) 542 | } 543 | } 544 | 545 | fn remove_key(&mut self, ip: ItemPointer, leaf: bool) { 546 | let n_items = self.get_n_items(); 547 | let size = self.get_size(); 548 | let (item_offs, item_len) = self.get_item_offs_len(ip); 549 | for i in ip + 1..n_items { 550 | self.set_offs(i - 1, self.get_offs(i) + item_len); 551 | } 552 | let items_origin = PAGE_SIZE - size; 553 | if !leaf && n_items > 1 && ip + 1 == n_items { 554 | // If we are removing last child of internal page then copy it's key to the previous item 555 | let prev_item_offs = item_offs + item_len; 556 | let key_len = self.data[item_offs] as usize; 557 | let prev_key_len = self.data[prev_item_offs] as usize; 558 | let new_offs = prev_item_offs + prev_key_len - key_len; 559 | self.set_offs(ip - 1, new_offs); 560 | self.data 561 | .copy_within(item_offs..item_offs + prev_key_len + 1, new_offs); 562 | } else { 563 | self.data 564 | .copy_within(items_origin..item_offs, items_origin + item_len); 565 | } 566 | self.set_n_items(n_items - 1); 567 | } 568 | 569 | // 570 | // Insert item on the page is there is enough free space, otherwise return false 571 | // 572 | fn insert_item(&mut self, ip: ItemPointer, key: &Key, value: &[u8]) -> bool { 573 | let n_items = self.get_n_items(); 574 | let size = self.get_size(); 575 | let key_len = key.len(); 576 | let item_len = 1 + key_len + value.len(); 577 | if (n_items + 1) * 2 + size + item_len <= PAGE_SIZE - PAGE_HEADER_SIZE { 578 | // fit in page 579 | for i in (ip..n_items).rev() { 580 | self.set_offs(i + 1, self.get_offs(i) - item_len); 581 | } 582 | let item_offs = if ip != 0 { 583 | self.get_offs(ip - 1) - item_len 584 | } else { 585 | PAGE_SIZE - item_len 586 | }; 587 | self.set_offs(ip, item_offs); 588 | let items_origin = PAGE_SIZE - size; 589 | self.data 590 | .copy_within(items_origin..item_offs + item_len, items_origin - item_len); 591 | self.data[item_offs] = key_len as u8; 592 | self.data[item_offs + 1..item_offs + 1 + key_len].copy_from_slice(&key); 593 | self.data[item_offs + 1 + key_len..item_offs + item_len].copy_from_slice(&value); 594 | self.set_n_items(n_items + 1); 595 | true 596 | } else { 597 | false 598 | } 599 | } 600 | 601 | // 602 | // Split page into two approximately equal parts. Smallest keys are moved to the new page, 603 | // largest - left on original page. 604 | // Returns split position 605 | // 606 | fn split(&mut self, new_page: &mut PageData, ip: ItemPointer) -> ItemPointer { 607 | let n_items = self.get_n_items(); 608 | let size = self.get_size(); 609 | let mut r = n_items; 610 | 611 | if ip == r { 612 | // Optimization for insert of sequential keys: move all data to new page, 613 | // leaving original page empty. It will cause complete filling of B-Tree pages. 614 | r -= 1; 615 | } else { 616 | // Divide page in two approximately equal parts. 
617 | let margin = PAGE_SIZE - size / 2; 618 | let mut l: ItemPointer = 0; 619 | while l < r { 620 | let m = (l + r) >> 1; 621 | if self.get_offs(m) > margin { 622 | // items are allocated from right to left 623 | l = m + 1; 624 | } else { 625 | r = m; 626 | } 627 | } 628 | debug_assert!(l == r); 629 | } 630 | // Move first r+1 elements to the new page 631 | let moved_size = PAGE_SIZE - self.get_offs(r); 632 | 633 | // copy item pointers 634 | new_page.data[PAGE_HEADER_SIZE..PAGE_HEADER_SIZE + (r + 1) * 2] 635 | .copy_from_slice(&self.data[PAGE_HEADER_SIZE..PAGE_HEADER_SIZE + (r + 1) * 2]); 636 | // copy items 637 | let dst = PAGE_SIZE - moved_size; 638 | new_page.data[dst..].copy_from_slice(&self.data[dst..]); 639 | 640 | // Adjust item pointers on old page 641 | for i in r + 1..n_items { 642 | self.set_offs(i - r - 1, self.get_offs(i) + moved_size); 643 | } 644 | let src = PAGE_SIZE - size; 645 | self.data.copy_within(src..dst, src + moved_size); 646 | new_page.set_n_items(r + 1); 647 | self.set_n_items(n_items - r - 1); 648 | r 649 | } 650 | } 651 | 652 | impl BufferManager { 653 | // 654 | // Link buffer to the head of LRU list (make it acceptable for eviction) 655 | // 656 | fn unpin(&mut self, id: BufferId) { 657 | debug_assert!(self.pages[id as usize].access_count == 1); 658 | self.pages[id as usize].access_count = 0; 659 | self.pages[id as usize].next = self.head; 660 | self.pages[id as usize].prev = 0; 661 | self.pinned -= 1; 662 | if self.head != 0 { 663 | self.pages[self.head as usize].prev = id; 664 | } else { 665 | self.tail = id; 666 | } 667 | self.head = id; 668 | } 669 | 670 | // 671 | // Unlink buffer from LRU list and so pin it in memory (protect from eviction) 672 | // 673 | fn pin(&mut self, id: BufferId) { 674 | debug_assert!(self.pages[id as usize].access_count == 0); 675 | let next = self.pages[id as usize].next; 676 | let prev = self.pages[id as usize].prev; 677 | if prev == 0 { 678 | self.head = next; 679 | } else { 680 | self.pages[prev as usize].next = next; 681 | } 682 | if next == 0 { 683 | self.tail = prev; 684 | } else { 685 | self.pages[next as usize].prev = prev; 686 | } 687 | self.pinned += 1; 688 | } 689 | 690 | // 691 | // Insert page in hash table 692 | // 693 | fn insert(&mut self, id: BufferId) { 694 | let h = self.pages[id as usize].pid as usize % self.hash_table.len(); 695 | self.pages[id as usize].collision = self.hash_table[h]; 696 | self.hash_table[h] = id; 697 | } 698 | 699 | // 700 | // Remove page from hash table 701 | // 702 | fn remove(&mut self, id: BufferId) { 703 | let h = self.pages[id as usize].pid as usize % self.hash_table.len(); 704 | let mut p = self.hash_table[h]; 705 | if p == id { 706 | self.hash_table[h] = self.pages[id as usize].collision; 707 | } else { 708 | while self.pages[p as usize].collision != id { 709 | p = self.pages[p as usize].collision; 710 | } 711 | self.pages[p as usize].collision = self.pages[id as usize].collision; 712 | } 713 | } 714 | 715 | // 716 | // Throw away buffer from cache (used by transaction rollback) 717 | // 718 | fn throw_buffer(&mut self, id: BufferId) { 719 | self.remove(id); 720 | self.pages[id as usize].next = self.free_pages; 721 | self.free_pages = id; 722 | self.cached -= 1; 723 | } 724 | 725 | // 726 | // If buffer is not yet marked as dirty then mark it as dirty and pin until the end of transaction 727 | // 728 | fn modify_buffer( 729 | &mut self, 730 | id: BufferId, 731 | wal_flush_threshold: BufferId, 732 | ) -> Result> { 733 | debug_assert!(self.pages[id as usize].access_count > 0); 
734 | let mut next_sync: Option<(BufferId, PageId)> = None; 735 | if (self.pages[id as usize].state & PAGE_DIRTY) == 0 { 736 | self.pages[id as usize].access_count += 1; // pin dirty page in memory 737 | self.pages[id as usize].state = PAGE_DIRTY; 738 | self.dirtied += 1; 739 | if self.dirtied > wal_flush_threshold { 740 | let mut sync = self.next_sync; 741 | while sync != 0 { 742 | assert_eq!(self.pages[sync as usize].state, PAGE_DIRTY); 743 | if self.pages[sync as usize].access_count == 1 { 744 | self.pages[sync as usize].state |= PAGE_SYNCED; 745 | self.next_sync = self.pages[sync as usize].prev; 746 | let pid = self.pages[sync as usize].pid; 747 | next_sync = Some((sync, pid)); 748 | break; 749 | } 750 | sync = self.pages[sync as usize].prev; 751 | } 752 | } 753 | } else { 754 | // we have to write page to the log once again 755 | self.pages[id as usize].state &= !PAGE_SYNCED; 756 | 757 | let prev = self.pages[id as usize].prev; 758 | 759 | // Move page to the beginning of L2 list 760 | if prev == 0 { 761 | // already first page: do nothing 762 | return Ok(None); 763 | } 764 | 765 | // If this page was scheduled for flush, then use previous page instead 766 | if self.next_sync == id { 767 | self.next_sync = prev; 768 | } 769 | 770 | // unlink page 771 | let next = self.pages[id as usize].next; 772 | self.pages[prev as usize].next = next; 773 | if next != 0 { 774 | self.pages[next as usize].prev = prev; 775 | } 776 | } 777 | // link to the beginning of dirty list 778 | if self.dirty_pages != 0 { 779 | self.pages[self.dirty_pages as usize].prev = id; 780 | } 781 | if self.next_sync == 0 { 782 | self.next_sync = id; 783 | } 784 | self.pages[id as usize].next = self.dirty_pages; 785 | self.pages[id as usize].prev = 0; 786 | self.dirty_pages = id; 787 | Ok(next_sync) 788 | } 789 | 790 | // 791 | // Decrement buffer's access counter and release buffer if it is last reference 792 | // 793 | fn release_buffer(&mut self, id: BufferId) { 794 | debug_assert!(self.pages[id as usize].access_count > 0); 795 | if self.pages[id as usize].access_count == 1 { 796 | debug_assert!((self.pages[id as usize].state & PAGE_DIRTY) == 0); 797 | self.unpin(id); 798 | } else { 799 | self.pages[id as usize].access_count -= 1; 800 | } 801 | } 802 | 803 | // 804 | // Find buffer with specified page or allocate new buffer 805 | // 806 | fn get_buffer(&mut self, pid: PageId) -> Result { 807 | let hash = pid as usize % self.hash_table.len(); 808 | let mut h = self.hash_table[hash]; 809 | while h != 0 { 810 | if self.pages[h as usize].pid == pid { 811 | let access_count = self.pages[h as usize].access_count; 812 | debug_assert!(access_count < u16::MAX - 1); 813 | if access_count == 0 { 814 | self.pin(h); 815 | } 816 | self.pages[h as usize].access_count = access_count + 1; 817 | return Ok(h); 818 | } 819 | h = self.pages[h as usize].collision; 820 | } 821 | // page not found in cache 822 | h = self.free_pages; 823 | if h != 0 { 824 | // has some free pages 825 | self.free_pages = self.pages[h as usize].next; 826 | self.cached += 1; 827 | self.pinned += 1; 828 | } else { 829 | h = self.used; 830 | if (h as usize) < self.hash_table.len() { 831 | self.used += 1; 832 | self.cached += 1; 833 | self.pinned += 1; 834 | } else { 835 | // Replace least recently used page 836 | let victim = self.tail; 837 | ensure!(victim != 0); 838 | debug_assert!(self.pages[victim as usize].access_count == 0); 839 | debug_assert!((self.pages[victim as usize].state & PAGE_DIRTY) == 0); 840 | self.pin(victim); 841 | self.remove(victim); 842 | h = 
victim; 843 | } 844 | } 845 | self.pages[h as usize].access_count = 1; 846 | self.pages[h as usize].pid = pid; 847 | self.pages[h as usize].state = PAGE_RAW; 848 | self.insert(h); 849 | Ok(h) 850 | } 851 | } 852 | 853 | struct PageGuard<'a> { 854 | buf: BufferId, 855 | pid: PageId, 856 | storage: &'a Storage, 857 | } 858 | 859 | impl<'a> Drop for PageGuard<'a> { 860 | fn drop(&mut self) { 861 | self.storage.release_page(self.buf); 862 | } 863 | } 864 | 865 | // 866 | // Storage internal methods implementations 867 | // 868 | impl Storage { 869 | // 870 | // Unpin page (called by PageGuard) 871 | // 872 | fn release_page(&self, buf: BufferId) { 873 | let mut bm = self.buf_mgr.lock().unwrap(); 874 | bm.release_buffer(buf); 875 | } 876 | 877 | // 878 | // Allocate new page in storage and get buffer for it 879 | // 880 | fn new_page(&self, db: &mut Database) -> Result> { 881 | let free = db.meta.free; 882 | let buf; 883 | let mut bm = self.buf_mgr.lock().unwrap(); 884 | if free != 0 { 885 | buf = bm.get_buffer(free)?; 886 | let mut page = self.pool[buf as usize].write().unwrap(); 887 | if (bm.pages[buf as usize].state & PAGE_RAW) != 0 { 888 | self.file 889 | .read_exact_at(&mut page.data, free as u64 * PAGE_SIZE as u64)?; 890 | } 891 | db.meta.free = page.get_u32(0); 892 | page.data.fill(0u8); 893 | } else { 894 | // extend storage 895 | buf = bm.get_buffer(db.meta.size)?; 896 | db.meta.size += 1; 897 | let mut page = self.pool[buf as usize].write().unwrap(); 898 | page.data.fill(0u8); 899 | } 900 | db.meta.used += 1; 901 | db.meta_updated = true; 902 | self.modify_buffer(db, &mut bm, buf)?; 903 | 904 | Ok(PageGuard { 905 | buf, 906 | pid: bm.pages[buf as usize].pid, 907 | storage: &self, 908 | }) 909 | } 910 | 911 | // 912 | // Read page in buffer and return PageGuard with pinned buffer. 
913 | // Buffer will be automatically released on exiting from scope 914 | // 915 | fn get_page(&self, pid: PageId, mode: AccessMode) -> Result> { 916 | let mut bm = self.buf_mgr.lock().unwrap(); 917 | let buf = bm.get_buffer(pid)?; 918 | if (bm.pages[buf as usize].state & PAGE_BUSY) != 0 { 919 | // Some other thread is loading buffer: just wait until it done 920 | bm.pages[buf as usize].state |= PAGE_WAIT; 921 | loop { 922 | debug_assert!((bm.pages[buf as usize].state & PAGE_WAIT) != 0); 923 | bm = self.busy_events[buf as usize % N_BUSY_EVENTS] 924 | .wait(bm) 925 | .unwrap(); 926 | if (bm.pages[buf as usize].state & PAGE_BUSY) == 0 { 927 | break; 928 | } 929 | } 930 | } else if (bm.pages[buf as usize].state & PAGE_RAW) != 0 { 931 | if mode != AccessMode::WriteOnly { 932 | // Read buffer if not in write-only mode 933 | bm.pages[buf as usize].state = PAGE_BUSY; 934 | drop(bm); // read page without holding lock 935 | { 936 | let mut page = self.pool[buf as usize].write().unwrap(); 937 | self.file 938 | .read_exact_at(&mut page.data, pid as u64 * PAGE_SIZE as u64)?; 939 | } 940 | bm = self.buf_mgr.lock().unwrap(); 941 | if (bm.pages[buf as usize].state & PAGE_WAIT) != 0 { 942 | // Somebody is waiting for us 943 | self.busy_events[buf as usize % N_BUSY_EVENTS].notify_all(); 944 | } 945 | } 946 | bm.pages[buf as usize].state = 0; 947 | } 948 | if mode != AccessMode::ReadOnly { 949 | bm.modify_buffer(buf, BufferId::MAX)?; 950 | } 951 | Ok(PageGuard { 952 | buf, 953 | pid, 954 | storage: &self, 955 | }) 956 | } 957 | 958 | // 959 | // Mark buffer as modified, pin it in memory and if it is needed, 960 | // write least recently modified page to WAL 961 | // 962 | fn modify_buffer( 963 | &self, 964 | db: &mut Database, 965 | bm: &mut BufferManager, 966 | buf: BufferId, 967 | ) -> Result<()> { 968 | if let Some((sync_buf, sync_pid)) = bm.modify_buffer(buf, self.conf.wal_flush_threshold)? 
{ 969 | assert_eq!(bm.pages[sync_buf as usize].state, PAGE_DIRTY | PAGE_SYNCED); 970 | self.write_page_to_wal(db, sync_buf, sync_pid)?; 971 | } 972 | Ok(()) 973 | } 974 | 975 | // 976 | // Mark page as dirty and pin it in-memory until end of transaction 977 | // 978 | fn modify_page(&self, db: &mut Database, buf: BufferId) -> Result<()> { 979 | let mut bm = self.buf_mgr.lock().unwrap(); 980 | self.modify_buffer(db, &mut bm, buf) 981 | } 982 | 983 | pub fn start_transaction(&self) -> Transaction<'_> { 984 | Transaction { 985 | status: TransactionStatus::InProgress, 986 | storage: self, 987 | db: self.db.write().unwrap(), 988 | } 989 | } 990 | 991 | fn write_page_to_wal(&self, db: &mut Database, buf: BufferId, pid: PageId) -> Result<()> { 992 | if let Some(log) = &self.log { 993 | let mut tx_buf = [0u8; PAGE_SIZE + 4]; 994 | let page = self.pool[buf as usize].read().unwrap(); 995 | tx_buf[0..4].copy_from_slice(&pid.to_be_bytes()); 996 | tx_buf[4..].copy_from_slice(&page.data); 997 | db.tx_crc = crc32c_append(db.tx_crc, &tx_buf); 998 | log.write_all_at(&tx_buf, db.wal_pos)?; 999 | db.wal_pos += (4 + PAGE_SIZE) as u64; 1000 | db.tx_size += 4 + PAGE_SIZE; 1001 | } 1002 | Ok(()) 1003 | } 1004 | 1005 | fn commit(&self, db: &mut Database) -> Result<()> { 1006 | let mut bm = self.buf_mgr.lock().unwrap(); 1007 | 1008 | if db.meta_updated { 1009 | let meta = db.meta.pack(); 1010 | let mut page = self.pool[0].write().unwrap(); 1011 | page.data[0..METADATA_SIZE].copy_from_slice(&meta); 1012 | } 1013 | if let Some(log) = &self.log { 1014 | // Write dirty pages to log file 1015 | let mut dirty = bm.dirty_pages; 1016 | while dirty != 0 && (bm.pages[dirty as usize].state & PAGE_SYNCED) == 0 { 1017 | assert_eq!(bm.pages[dirty as usize].state, PAGE_DIRTY); 1018 | self.write_page_to_wal(db, dirty, bm.pages[dirty as usize].pid)?; 1019 | dirty = bm.pages[dirty as usize].next; 1020 | } 1021 | if bm.dirty_pages != 0 { 1022 | let mut buf = [0u8; METADATA_SIZE + 8]; 1023 | { 1024 | let page = self.pool[0].read().unwrap(); 1025 | buf[4..4 + METADATA_SIZE].copy_from_slice(&page.data[0..METADATA_SIZE]); 1026 | } 1027 | let crc = crc32c_append(db.tx_crc, &buf[..4 + METADATA_SIZE]); 1028 | buf[4 + METADATA_SIZE..].copy_from_slice(&crc.to_be_bytes()); 1029 | log.write_all_at(&buf, db.wal_pos)?; 1030 | db.wal_pos += (8 + METADATA_SIZE) as u64; 1031 | log.sync_all()?; 1032 | db.lsn += 1; 1033 | db.tx_crc = 0; 1034 | db.tx_size = 0; 1035 | 1036 | // Write pages to the data file 1037 | self.flush_buffers(&mut bm, db.meta_updated)?; 1038 | 1039 | if db.wal_pos >= self.conf.checkpoint_interval { 1040 | // Sync data file and restart from the beginning of WAL. 1041 | // So not truncate WAL to avoid file extension overhead. 1042 | self.file.sync_all()?; 1043 | db.wal_pos = 0; 1044 | } 1045 | } 1046 | } else { 1047 | // No WAL mode: just write dirty pages to the disk 1048 | if self.flush_buffers(&mut bm, db.meta_updated)? { 1049 | db.lsn += 1; 1050 | } 1051 | } 1052 | db.meta_updated = false; 1053 | Ok(()) 1054 | } 1055 | 1056 | // 1057 | // Flush dirty pages to the disk. Return true if database is changed. 
1058 | // 1059 | fn flush_buffers(&self, bm: &mut BufferManager, save_meta: bool) -> Result { 1060 | let mut dirty = bm.dirty_pages; 1061 | if save_meta { 1062 | assert!(dirty != 0); // if we changed meta, then we should change or create at least one page 1063 | let page = self.pool[0].read().unwrap(); 1064 | self.file.write_all_at(&page.data, 0)?; 1065 | } 1066 | while dirty != 0 { 1067 | let pid = bm.pages[dirty as usize].pid; 1068 | let file_offs = pid as u64 * PAGE_SIZE as u64; 1069 | let page = self.pool[dirty as usize].read().unwrap(); 1070 | let next = bm.pages[dirty as usize].next; 1071 | self.file.write_all_at(&page.data, file_offs)?; 1072 | debug_assert!((bm.pages[dirty as usize].state & PAGE_DIRTY) != 0); 1073 | bm.pages[dirty as usize].state = 0; 1074 | bm.unpin(dirty); 1075 | dirty = next; 1076 | } 1077 | if bm.dirty_pages != 0 { 1078 | bm.dirty_pages = 0; 1079 | bm.dirtied = 0; 1080 | bm.next_sync = 0; 1081 | Ok(true) 1082 | } else { 1083 | Ok(false) 1084 | } 1085 | } 1086 | 1087 | // 1088 | // Rollback current transaction 1089 | // 1090 | fn rollback(&self, db: &mut Database) -> Result<()> { 1091 | let mut bm = self.buf_mgr.lock().unwrap(); 1092 | let mut dirty = bm.dirty_pages; 1093 | // Just throw away all dirty pages from buffer cache to force reloading of original pages 1094 | while dirty != 0 { 1095 | debug_assert!((bm.pages[dirty as usize].state & PAGE_DIRTY) != 0); 1096 | debug_assert!(bm.pages[dirty as usize].access_count == 1); 1097 | let next = bm.pages[dirty as usize].next; 1098 | bm.throw_buffer(dirty); 1099 | dirty = next; 1100 | } 1101 | bm.dirty_pages = 0; 1102 | bm.dirtied = 0; 1103 | bm.next_sync = 0; 1104 | db.wal_pos -= db.tx_size as u64; 1105 | db.tx_crc = 0; 1106 | db.tx_size = 0; 1107 | 1108 | if db.meta_updated { 1109 | // reread metadata from disk 1110 | let mut page = self.pool[0].write().unwrap(); 1111 | self.file.read_exact_at(&mut page.data, 0)?; 1112 | db.meta = Metadata::unpack(&page.data); 1113 | db.meta_updated = false; 1114 | } 1115 | db.n_aborted_txns += 1; 1116 | Ok(()) 1117 | } 1118 | 1119 | /// 1120 | /// Open database storage. If storage file doesn't exist, then it is created. 1121 | /// If path to transaction log is not specified, then WAL (write-ahead-log) is not used. 1122 | /// It will significantly increase performance but can cause database corruption in case of power failure or system crash. 
1123 | /// 1124 | pub fn open(db_path: &Path, log_path: Option<&Path>, conf: StorageConfig) -> Result { 1125 | let mut buf = [0u8; PAGE_SIZE]; 1126 | let (file, meta) = if let Ok(file) = OpenOptions::new().write(true).read(true).open(db_path) 1127 | { 1128 | // open existed file 1129 | file.try_lock_exclusive()?; 1130 | file.read_exact_at(&mut buf, 0)?; 1131 | let meta = Metadata::unpack(&buf); 1132 | ensure!(meta.magic == MAGIC && meta.version == VERSION && meta.size >= 1); 1133 | (file, meta) 1134 | } else { 1135 | let file = OpenOptions::new() 1136 | .write(true) 1137 | .read(true) 1138 | .create(true) 1139 | .open(db_path)?; 1140 | file.try_lock_exclusive()?; 1141 | // create new file 1142 | let meta = Metadata { 1143 | magic: MAGIC, 1144 | version: VERSION, 1145 | free: 0, 1146 | size: 1, 1147 | used: 1, 1148 | root: 0, 1149 | height: 0, 1150 | }; 1151 | let metadata = meta.pack(); 1152 | buf[0..METADATA_SIZE].copy_from_slice(&metadata); 1153 | file.write_all_at(&mut buf, 0)?; 1154 | (file, meta) 1155 | }; 1156 | let log = if let Some(path) = log_path { 1157 | let log = OpenOptions::new() 1158 | .write(true) 1159 | .read(true) 1160 | .create(true) 1161 | .open(path)?; 1162 | log.try_lock_exclusive()?; 1163 | Some(log) 1164 | } else { 1165 | None 1166 | }; 1167 | let storage = Storage { 1168 | busy_events: [(); N_BUSY_EVENTS].map(|_| Condvar::new()), 1169 | buf_mgr: Mutex::new(BufferManager { 1170 | head: 0, 1171 | tail: 0, 1172 | free_pages: 0, 1173 | dirty_pages: 0, 1174 | next_sync: 0, 1175 | used: 1, // pinned root page 1176 | cached: 1, 1177 | pinned: 1, 1178 | dirtied: 0, 1179 | hash_table: vec![0; conf.cache_size], 1180 | pages: vec![PageHeader::new(); conf.cache_size], 1181 | }), 1182 | pool: iter::repeat_with(|| RwLock::new(PageData::new())) 1183 | .take(conf.cache_size) 1184 | .collect(), 1185 | file, 1186 | log, 1187 | conf, 1188 | db: RwLock::new(Database { 1189 | lsn: 0, 1190 | n_aborted_txns: 0, 1191 | meta, 1192 | meta_updated: false, 1193 | recovery: RecoveryStatus { 1194 | ..Default::default() 1195 | }, 1196 | state: DatabaseState::InRecovery, 1197 | wal_pos: 0, 1198 | tx_crc: 0, 1199 | tx_size: 0, 1200 | }), 1201 | }; 1202 | storage.recovery()?; 1203 | Ok(storage) 1204 | } 1205 | 1206 | // 1207 | // Recover database from WAL (if any) 1208 | // 1209 | fn recovery(&self) -> Result<()> { 1210 | let mut db = self.db.write().unwrap(); 1211 | if let Some(log) = &self.log { 1212 | let mut buf = [0u8; 4]; 1213 | let mut crc = 0u32; 1214 | let mut wal_pos = 0u64; 1215 | db.recovery.wal_size = log.metadata()?.len(); 1216 | loop { 1217 | let len = log.read_at(&mut buf, wal_pos)?; 1218 | if len != 4 { 1219 | // end of log 1220 | break; 1221 | } 1222 | wal_pos += 4; 1223 | let pid = PageId::from_be_bytes(buf); 1224 | crc = crc32c_append(crc, &buf); 1225 | if pid != 0 { 1226 | let pin = self.get_page(pid, AccessMode::WriteOnly)?; 1227 | let mut page = self.pool[pin.buf as usize].write().unwrap(); 1228 | let len = log.read_at(&mut page.data, wal_pos)?; 1229 | if len != PAGE_SIZE { 1230 | break; 1231 | } 1232 | wal_pos += len as u64; 1233 | crc = crc32c_append(crc, &page.data); 1234 | } else { 1235 | let mut meta_buf = [0u8; METADATA_SIZE]; 1236 | let len = log.read_at(&mut meta_buf, wal_pos)?; 1237 | if len != METADATA_SIZE { 1238 | break; 1239 | } 1240 | wal_pos += len as u64; 1241 | crc = crc32c_append(crc, &meta_buf); 1242 | let len = log.read_at(&mut buf, wal_pos)?; 1243 | if len != 4 { 1244 | break; 1245 | } 1246 | wal_pos += 4; 1247 | if u32::from_be_bytes(buf) != crc { 1248 | 
// CRC mismatch 1249 | break; 1250 | } 1251 | { 1252 | let mut page = self.pool[0].write().unwrap(); 1253 | page.data[0..METADATA_SIZE].copy_from_slice(&meta_buf); 1254 | db.meta_updated = true; 1255 | } 1256 | let mut bm = self.buf_mgr.lock().unwrap(); 1257 | self.flush_buffers(&mut bm, true)?; 1258 | db.meta_updated = false; 1259 | db.recovery.recovered_transactions += 1; 1260 | db.recovery.recovery_end = wal_pos; 1261 | crc = 0u32; 1262 | } 1263 | } 1264 | self.rollback(&mut db)?; 1265 | 1266 | // reset WAL 1267 | self.file.sync_all()?; 1268 | db.wal_pos = 0; 1269 | log.set_len(0)?; // truncate log 1270 | } 1271 | // reread metadata 1272 | let mut page = self.pool[0].write().unwrap(); 1273 | self.file.read_exact_at(&mut page.data, 0)?; 1274 | db.meta = Metadata::unpack(&page.data); 1275 | 1276 | db.state = DatabaseState::Opened; 1277 | 1278 | Ok(()) 1279 | } 1280 | 1281 | // 1282 | // Bulk update 1283 | // 1284 | fn do_updates( 1285 | &self, 1286 | db: &mut Database, 1287 | to_upsert: &mut dyn Iterator>, 1288 | to_remove: &mut dyn Iterator>, 1289 | ) -> Result<()> { 1290 | for pair in to_upsert { 1291 | let kv = pair?; 1292 | self.do_upsert(db, &kv.0, &kv.1)?; 1293 | } 1294 | for key in to_remove { 1295 | self.do_remove(db, &key?)?; 1296 | } 1297 | Ok(()) 1298 | } 1299 | 1300 | // 1301 | // Allocate new B-Tree leaf page with single (key,value) element 1302 | // 1303 | fn btree_allocate_leaf_page( 1304 | &self, 1305 | db: &mut Database, 1306 | key: &Key, 1307 | value: &Value, 1308 | ) -> Result { 1309 | let pin = self.new_page(db)?; 1310 | let mut page = self.pool[pin.buf as usize].write().unwrap(); 1311 | page.set_n_items(0); 1312 | page.insert_item(0, key, value); 1313 | Ok(pin.pid) 1314 | } 1315 | 1316 | // 1317 | // Allocate new B-Tree internal page referencing two children 1318 | // 1319 | fn btree_allocate_internal_page( 1320 | &self, 1321 | db: &mut Database, 1322 | key: &Key, 1323 | left_child: PageId, 1324 | right_child: PageId, 1325 | ) -> Result { 1326 | let pin = self.new_page(db)?; 1327 | let mut page = self.pool[pin.buf as usize].write().unwrap(); 1328 | page.set_n_items(0); 1329 | debug_assert!(left_child != 0); 1330 | debug_assert!(right_child != 0); 1331 | page.insert_item(0, key, &left_child.to_be_bytes().to_vec()); 1332 | page.insert_item(1, &vec![], &right_child.to_be_bytes().to_vec()); 1333 | Ok(pin.pid) 1334 | } 1335 | 1336 | // 1337 | // Insert item at the specified position in B-Tree page. 1338 | // If B-Tree pages is full then split it, evenly distribute items between pages: smaller items moved to new page, larger items left on original page. 1339 | // Value of largest key on new page and its identifiers are returned in case of overflow. 1340 | // 1341 | fn btree_insert_in_page( 1342 | &self, 1343 | db: &mut Database, 1344 | page: &mut PageData, 1345 | ip: ItemPointer, 1346 | key: &Key, 1347 | value: &Value, 1348 | ) -> Result> { 1349 | if !page.insert_item(ip, key, value) { 1350 | // page is full then divide page 1351 | let pin = self.new_page(db)?; 1352 | let mut new_page = self.pool[pin.buf as usize].write().unwrap(); 1353 | let split = page.split(&mut new_page, ip); 1354 | let ok = if ip > split { 1355 | page.insert_item(ip - split - 1, key, value) 1356 | } else { 1357 | new_page.insert_item(ip, key, value) 1358 | }; 1359 | ensure!(ok); 1360 | Ok(Some((new_page.get_last_key(), pin.pid))) 1361 | } else { 1362 | Ok(None) 1363 | } 1364 | } 1365 | 1366 | // 1367 | // Remove key from B-Tree. Recursively traverse B-Tree and return true in case of underflow. 
1368 | // Right now we do not redistribute nodes between pages or merge pages, underflow is reported only if page becomes empty. 1369 | // If key is not found, then nothing is performed and no error is reported. 1370 | // 1371 | fn btree_remove(&self, db: &mut Database, pid: PageId, key: &Key, height: u32) -> Result { 1372 | let pin = self.get_page(pid, AccessMode::ReadOnly)?; 1373 | let mut page = self.pool[pin.buf as usize].write().unwrap(); 1374 | let mut l: ItemPointer = 0; 1375 | let n = page.get_n_items(); 1376 | let mut r = n; 1377 | while l < r { 1378 | let m = (l + r) >> 1; 1379 | if page.compare_key(m, key) == Ordering::Greater { 1380 | l = m + 1; 1381 | } else { 1382 | r = m; 1383 | } 1384 | } 1385 | debug_assert!(l == r); 1386 | if height == 1 { 1387 | // leaf page 1388 | if r < n && page.compare_key(r, key) == Ordering::Equal { 1389 | self.modify_page(db, pin.buf)?; 1390 | page.remove_key(r, true); 1391 | } 1392 | } else { 1393 | // recurse to next level 1394 | debug_assert!(r < n); 1395 | let underflow = self.btree_remove(db, page.get_child(r), key, height - 1)?; 1396 | if underflow { 1397 | self.modify_page(db, pin.buf)?; 1398 | page.remove_key(r, false); 1399 | } 1400 | } 1401 | if page.get_n_items() == 0 { 1402 | // free page 1403 | page.set_u32(0, db.meta.free); 1404 | db.meta.free = pid; 1405 | db.meta.used -= 1; 1406 | db.meta_updated = true; 1407 | Ok(true) 1408 | } else { 1409 | Ok(false) 1410 | } 1411 | } 1412 | 1413 | // 1414 | // Insert item in B-Tree. Recursively traverse B-Tree and return position of new page in case of overflow. 1415 | // 1416 | fn btree_insert( 1417 | &self, 1418 | db: &mut Database, 1419 | pid: PageId, 1420 | key: &Key, 1421 | value: &Value, 1422 | height: u32, 1423 | ) -> Result> { 1424 | let pin = self.get_page(pid, AccessMode::ReadOnly)?; 1425 | let mut page = self.pool[pin.buf as usize].write().unwrap(); 1426 | let mut l: ItemPointer = 0; 1427 | let n = page.get_n_items(); 1428 | let mut r = n; 1429 | while l < r { 1430 | let m = (l + r) >> 1; 1431 | if page.compare_key(m, key) == Ordering::Greater { 1432 | l = m + 1; 1433 | } else { 1434 | r = m; 1435 | } 1436 | } 1437 | debug_assert!(l == r); 1438 | if height == 1 { 1439 | // leaf page 1440 | self.modify_page(db, pin.buf)?; 1441 | if r < n && page.compare_key(r, key) == Ordering::Equal { 1442 | // replace old value with new one: just remove old one and reinsert new key-value pair 1443 | page.remove_key(r, true); 1444 | } 1445 | self.btree_insert_in_page(db, &mut page, r, key, value) 1446 | } else { 1447 | // recurse to next level 1448 | debug_assert!(r < n); 1449 | let overflow = self.btree_insert(db, page.get_child(r), key, value, height - 1)?; 1450 | if let Some((key, child)) = overflow { 1451 | // insert new page before original 1452 | self.modify_page(db, pin.buf)?; 1453 | debug_assert!(child != 0); 1454 | self.btree_insert_in_page(db, &mut page, r, &key, &child.to_be_bytes().to_vec()) 1455 | } else { 1456 | Ok(None) 1457 | } 1458 | } 1459 | } 1460 | 1461 | // 1462 | // Insert or update key in the storage 1463 | // 1464 | fn do_upsert(&self, db: &mut Database, key: &Key, value: &Value) -> Result<()> { 1465 | ensure!(key.len() != 0 && key.len() <= MAX_KEY_LEN && value.len() <= MAX_VALUE_LEN); 1466 | if db.meta.root == 0 { 1467 | db.meta.root = self.btree_allocate_leaf_page(db, key, value)?; 1468 | db.meta.height = 1; 1469 | db.meta_updated = true; 1470 | } else if let Some((key, page)) = 1471 | self.btree_insert(db, db.meta.root, key, value, db.meta.height)? 
1472 | { 1473 | // overflow 1474 | db.meta.root = self.btree_allocate_internal_page(db, &key, page, db.meta.root)?; 1475 | db.meta.height += 1; 1476 | db.meta_updated = true; 1477 | } 1478 | Ok(()) 1479 | } 1480 | 1481 | // 1482 | // Remove key from the storage. Does nothing if the key does not exist. 1483 | // 1484 | fn do_remove(&self, db: &mut Database, key: &Key) -> Result<()> { 1485 | if db.meta.root != 0 { 1486 | let underflow = self.btree_remove(db, db.meta.root, key, db.meta.height)?; 1487 | if underflow { 1488 | db.meta.height = 0; 1489 | db.meta.root = 0; 1490 | db.meta_updated = true; 1491 | } 1492 | } 1493 | Ok(()) 1494 | } 1495 | 1496 | // 1497 | // Perform lookup operation and fill `path` structure with located item and path to it in the tree. 1498 | // If item is not found, then make path empty and current element `None`. 1499 | // 1500 | fn do_lookup( 1501 | &self, 1502 | db: &Database, 1503 | op: LookupOp, 1504 | path: &mut TreePath, 1505 | ) -> Result> { 1506 | ensure!(db.state == DatabaseState::Opened); 1507 | match op { 1508 | LookupOp::First => { 1509 | // Locate left-most element in the tree 1510 | let mut pid = db.meta.root; 1511 | if pid != 0 { 1512 | let mut level = db.meta.height; 1513 | loop { 1514 | let pin = self.get_page(pid, AccessMode::ReadOnly)?; 1515 | let page = self.pool[pin.buf as usize].read().unwrap(); 1516 | path.stack.push(PagePos { pid, pos: 0 }); 1517 | level -= 1; 1518 | if level == 0 { 1519 | path.curr = Some(page.get_item(0)); 1520 | path.lsn = db.lsn; 1521 | break; 1522 | } else { 1523 | pid = page.get_child(0) 1524 | } 1525 | } 1526 | } 1527 | } 1528 | LookupOp::Last => { 1529 | // Locate right-most element in the tree 1530 | let mut pid = db.meta.root; 1531 | if pid != 0 { 1532 | let mut level = db.meta.height; 1533 | loop { 1534 | let pin = self.get_page(pid, AccessMode::ReadOnly)?; 1535 | let page = self.pool[pin.buf as usize].read().unwrap(); 1536 | let pos = page.get_n_items() - 1; 1537 | level -= 1; 1538 | path.stack.push(PagePos { pid, pos }); 1539 | if level == 0 { 1540 | path.curr = Some(page.get_item(pos)); 1541 | path.lsn = db.lsn; 1542 | break; 1543 | } else { 1544 | pid = page.get_child(pos) 1545 | } 1546 | } 1547 | } 1548 | } 1549 | LookupOp::Next => { 1550 | if path.lsn == db.lsn || self.reconstruct_path(path, db)? { 1551 | self.move_forward(path, db.meta.height)?; 1552 | } 1553 | } 1554 | LookupOp::Prev => { 1555 | if path.lsn == db.lsn || self.reconstruct_path(path, db)? { 1556 | self.move_backward(path, db.meta.height)?; 1557 | } 1558 | } 1559 | LookupOp::GreaterOrEqual(key) => { 1560 | if db.meta.root != 0 && self.find(db.meta.root, path, &key, db.meta.height)? { 1561 | path.lsn = db.lsn; 1562 | } 1563 | } 1564 | } 1565 | Ok(path.curr.clone()) 1566 | } 1567 | 1568 | // 1569 | // Perform lookup in the database. Position the path at the located element, or reset the path if no element is found or the end of the set is reached. 1570 | // 1571 | fn lookup(&self, db: &Database, op: LookupOp, path: &mut TreePath) { 1572 | let result = self.do_lookup(db, op, path); 1573 | if result.is_err() { 1574 | path.curr = None; 1575 | } 1576 | path.result = result.transpose(); 1577 | } 1578 | 1579 | // 1580 | // Locate greater or equal key. 1581 | // Returns true and initializes path to this element if such key is found, 1582 | // resets path and returns false otherwise.
1583 | // 1584 | fn find(&self, pid: PageId, path: &mut TreePath, key: &Key, height: u32) -> Result { 1585 | let pin = self.get_page(pid, AccessMode::ReadOnly)?; 1586 | let page = self.pool[pin.buf as usize].read().unwrap(); 1587 | let n = page.get_n_items(); 1588 | let mut l: ItemPointer = 0; 1589 | let mut r = n; 1590 | 1591 | while l < r { 1592 | let m = (l + r) >> 1; 1593 | if page.compare_key(m, key) == Ordering::Greater { 1594 | l = m + 1; 1595 | } else { 1596 | r = m; 1597 | } 1598 | } 1599 | debug_assert!(l == r); 1600 | path.stack.push(PagePos { pid, pos: r }); 1601 | if height == 1 { 1602 | // leaf page 1603 | if r < n { 1604 | path.curr = Some(page.get_item(r)); 1605 | Ok(true) 1606 | } else { 1607 | path.curr = None; 1608 | path.stack.pop(); 1609 | Ok(false) 1610 | } 1611 | } else { 1612 | debug_assert!(r < n); 1613 | loop { 1614 | debug_assert!(page.get_child(r) != 0); 1615 | if self.find(page.get_child(r), path, key, height - 1)? { 1616 | return Ok(true); 1617 | } 1618 | path.stack.pop(); 1619 | r += 1; 1620 | if r < n { 1621 | path.stack.push(PagePos { pid, pos: r }); 1622 | } else { 1623 | break; 1624 | } 1625 | } 1626 | Ok(false) 1627 | } 1628 | } 1629 | 1630 | // 1631 | // If storage is updated between iterations then try to reconstruct path by locating current element. 1632 | // Returns true is such element is found and path is successfully reconstructed, false otherwise. 1633 | // 1634 | fn reconstruct_path(&self, path: &mut TreePath, db: &Database) -> Result { 1635 | path.stack.clear(); 1636 | if let Some((key, _value)) = &path.curr.clone() { 1637 | if self.find(db.meta.root, path, &key, db.meta.height)? { 1638 | if let Some((ge_key, _value)) = &path.curr { 1639 | if ge_key == key { 1640 | path.lsn = db.lsn; 1641 | return Ok(true); 1642 | } 1643 | } 1644 | } 1645 | } 1646 | path.curr = None; 1647 | Ok(false) 1648 | } 1649 | 1650 | // 1651 | // Move cursor forward 1652 | // 1653 | fn move_forward(&self, path: &mut TreePath, height: u32) -> Result<()> { 1654 | let mut inc: usize = 1; 1655 | path.curr = None; 1656 | while !path.stack.is_empty() { 1657 | let top = path.stack.pop().unwrap(); 1658 | let pin = self.get_page(top.pid, AccessMode::ReadOnly)?; 1659 | let page = self.pool[pin.buf as usize].read().unwrap(); 1660 | let n_items = page.get_n_items(); 1661 | let pos = top.pos + inc; 1662 | if pos < n_items { 1663 | path.stack.push(PagePos { 1664 | pid: top.pid, 1665 | pos: pos, 1666 | }); 1667 | if path.stack.len() == height as usize { 1668 | let item = page.get_item(pos); 1669 | path.curr = Some(item); 1670 | break; 1671 | } 1672 | // We have to use this trick with `inc` variable on the way down because 1673 | // Rust will detect overflow if we path -1 as pos 1674 | debug_assert!(page.get_child(pos) != 0); 1675 | path.stack.push(PagePos { 1676 | pid: page.get_child(pos), 1677 | pos: 0, 1678 | }); 1679 | inc = 0; 1680 | } else { 1681 | // traverse up 1682 | inc = 1; 1683 | } 1684 | } 1685 | Ok(()) 1686 | } 1687 | 1688 | // 1689 | // Move cursor backward 1690 | // 1691 | fn move_backward(&self, path: &mut TreePath, height: u32) -> Result<()> { 1692 | path.curr = None; 1693 | while !path.stack.is_empty() { 1694 | let top = path.stack.pop().unwrap(); 1695 | let pin = self.get_page(top.pid, AccessMode::ReadOnly)?; 1696 | let page = self.pool[pin.buf as usize].read().unwrap(); 1697 | let pos = if top.pos == usize::MAX { 1698 | page.get_n_items() 1699 | } else { 1700 | top.pos 1701 | }; 1702 | if pos != 0 { 1703 | path.stack.push(PagePos { 1704 | pid: top.pid, 1705 | pos: 
pos - 1, 1706 | }); 1707 | if path.stack.len() == height as usize { 1708 | let item = page.get_item(pos - 1); 1709 | path.curr = Some(item); 1710 | break; 1711 | } 1712 | path.stack.push(PagePos { 1713 | pid: page.get_child(pos - 1), 1714 | pos: usize::MAX, // start from last element of the page 1715 | }); 1716 | } 1717 | } 1718 | Ok(()) 1719 | } 1720 | 1721 | fn traverse(&self, pid: PageId, prev_key: &mut Key, height: u32) -> Result { 1722 | let pin = self.get_page(pid, AccessMode::ReadOnly)?; 1723 | let page = self.pool[pin.buf as usize].read().unwrap(); 1724 | let n_items = page.get_n_items(); 1725 | let mut count = 0u64; 1726 | if height == 1 { 1727 | for i in 0..n_items { 1728 | ensure!(page.compare_key(i, prev_key) == Ordering::Less); 1729 | *prev_key = page.get_key(i); 1730 | } 1731 | count += n_items as u64; 1732 | } else { 1733 | for i in 0..n_items { 1734 | count += self.traverse(page.get_child(i), prev_key, height - 1)?; 1735 | let ord = page.compare_key(i, prev_key); 1736 | ensure!(ord == Ordering::Less || ord == Ordering::Equal); 1737 | } 1738 | } 1739 | Ok(count) 1740 | } 1741 | } 1742 | 1743 | // 1744 | // Implementation of public methods. 1745 | // I had problems extracting them into a separate trait. 1746 | // 1747 | impl Storage { 1748 | /// 1749 | /// Perform atomic updates: insert or update `to_upsert` tuples 1750 | /// and remove `to_remove` tuples (if they exist). 1751 | /// If all operations are successful, then the transaction is committed, otherwise it is rolled back. 1752 | /// If the commit or rollback itself returns an error, then the database is switched to the corrupted state and no further access to the database is possible. 1753 | /// You will have to close and reopen it. 1754 | /// 1755 | pub fn update( 1756 | &self, 1757 | to_upsert: &mut dyn Iterator>, 1758 | to_remove: &mut dyn Iterator>, 1759 | ) -> Result<()> { 1760 | let mut db = self.db.write().unwrap(); // prevent concurrent access to the database during update operations (MURSIW) 1761 | ensure!(db.state == DatabaseState::Opened); 1762 | let mut result = self.do_updates(&mut db, to_upsert, to_remove); 1763 | if result.is_ok() { 1764 | result = self.commit(&mut db); 1765 | if !result.is_ok() { 1766 | db.state = DatabaseState::Corrupted; 1767 | } 1768 | } else { 1769 | if !self.rollback(&mut db).is_ok() { 1770 | db.state = DatabaseState::Corrupted; 1771 | } 1772 | } 1773 | result 1774 | } 1775 | 1776 | /// 1777 | /// Traverse B-Tree, check B-Tree invariants and return total number of keys in B-Tree 1778 | /// 1779 | pub fn verify(&self) -> Result { 1780 | let db = self.db.read().unwrap(); 1781 | ensure!(db.state == DatabaseState::Opened); 1782 | if db.meta.root != 0 { 1783 | let mut prev_key = Vec::new(); 1784 | self.traverse(db.meta.root, &mut prev_key, db.meta.height) 1785 | } else { 1786 | Ok(0) 1787 | } 1788 | } 1789 | 1790 | /// 1791 | /// Put (key,value) pair in the storage; if such key already exists, the associated value is updated 1792 | /// 1793 | pub fn put(&self, key: Key, value: Value) -> Result<()> { 1794 | self.update(&mut iter::once(Ok((key, value))), &mut iter::empty()) 1795 | } 1796 | 1797 | /// 1798 | /// Store value for u32 key 1799 | /// 1800 | pub fn put_u32(&self, key: u32, value: Value) -> Result<()> { 1801 | self.put(key.to_be_bytes().to_vec(), value) 1802 | } 1803 | 1804 | /// 1805 | /// Store value for u64 key 1806 | /// 1807 | pub fn put_u64(&self, key: u64, value: Value) -> Result<()> { 1808 | self.put(key.to_be_bytes().to_vec(), value) 1809 | } 1810 | 1811 | /// 1812 | /// Put (key,value) pairs in the
storage, existing keys are updated 1813 | /// 1814 | pub fn put_all(&self, pairs: &mut dyn Iterator>) -> Result<()> { 1815 | self.update(pairs, &mut iter::empty()) 1816 | } 1817 | 1818 | /// 1819 | /// Remove key from the storage, do nothing if not found 1820 | /// 1821 | pub fn remove(&self, key: Key) -> Result<()> { 1822 | self.update(&mut iter::empty(), &mut iter::once(Ok(key))) 1823 | } 1824 | 1825 | /// 1826 | /// Remove u32 key from the storage, do nothing if not found 1827 | /// 1828 | pub fn remove_u32(&self, key: u32) -> Result<()> { 1829 | self.remove(key.to_be_bytes().to_vec()) 1830 | } 1831 | 1832 | /// 1833 | /// Remove u64 key from the storage, do nothing if not found 1834 | /// 1835 | pub fn remove_u64(&self, key: u64) -> Result<()> { 1836 | self.remove(key.to_be_bytes().to_vec()) 1837 | } 1838 | 1839 | /// 1840 | /// Remove keys from the storage, do nothing if not found 1841 | /// 1842 | pub fn remove_all(&self, keys: &mut dyn Iterator>) -> Result<()> { 1843 | self.update(&mut iter::empty(), keys) 1844 | } 1845 | 1846 | /// 1847 | /// Iterator through pairs in key ascending order. 1848 | /// Byte-wise comparison is used, so it is up to the serializer to enforce proper ordering, 1849 | /// for example for unsigned integer types big-endian encoding should be used. 1850 | /// 1851 | pub fn iter(&self) -> StorageIterator<'_> { 1852 | self.range(..) 1853 | } 1854 | 1855 | /// 1856 | /// Lookup key in the storage. 1857 | /// 1858 | pub fn get(&self, key: &Key) -> Result> { 1859 | let mut iter = self.range((Included(key), Included(key))); 1860 | Ok(iter.next().transpose()?.map(|kv| kv.1)) 1861 | } 1862 | 1863 | /// 1864 | /// Lookup u32 key in the storage. 1865 | /// 1866 | pub fn get_u32(&self, key: u32) -> Result> { 1867 | self.get(&key.to_be_bytes().to_vec()) 1868 | } 1869 | 1870 | /// 1871 | /// Lookup u64 key in the storage. 1872 | /// 1873 | pub fn get_u64(&self, key: u64) -> Result> { 1874 | self.get(&key.to_be_bytes().to_vec()) 1875 | } 1876 | 1877 | /// 1878 | /// Returns bidirectional iterator 1879 | /// 1880 | pub fn range>(&self, range: R) -> StorageIterator<'_> { 1881 | StorageIterator { 1882 | storage: &self, 1883 | trans: None, 1884 | from: range.start_bound().cloned(), 1885 | till: range.end_bound().cloned(), 1886 | left: TreePath::new(), 1887 | right: TreePath::new(), 1888 | } 1889 | } 1890 | 1891 | /// 1892 | /// Close storage. Close data and WAL files and truncate WAL file. 1893 | /// 1894 | pub fn close(&self) -> Result<()> { 1895 | if let Ok(mut db) = self.db.write() { 1896 | // avoid poisoned lock 1897 | if db.state == DatabaseState::Opened { 1898 | let mut delayed_commit = false; 1899 | if let Ok(bm) = self.buf_mgr.lock() { 1900 | // avoid poisoned mutex 1901 | if bm.dirty_pages != 0 { 1902 | delayed_commit = true; 1903 | } 1904 | } 1905 | if delayed_commit { 1906 | self.commit(&mut db)?; 1907 | } 1908 | // Sync data file and truncate log in case of normal shutdown 1909 | self.file.sync_all()?; 1910 | if let Some(log) = &self.log { 1911 | log.set_len(0)?; // truncate WAL 1912 | } 1913 | db.state = DatabaseState::Closed; 1914 | } 1915 | } 1916 | Ok(()) 1917 | } 1918 | 1919 | /// 1920 | /// Shutdown storage. Unlike close, it doesn't commit delayed transactions, flush the data file, or truncate the WAL.
1921 | /// 1922 | pub fn shutdown(&self) -> Result<()> { 1923 | let mut db = self.db.write().unwrap(); 1924 | ensure!(db.state == DatabaseState::Opened); 1925 | db.state = DatabaseState::Closed; 1926 | Ok(()) 1927 | } 1928 | 1929 | /// 1930 | /// Get recovery status 1931 | /// 1932 | pub fn get_recovery_status(&self) -> RecoveryStatus { 1933 | let db = self.db.read().unwrap(); 1934 | db.recovery 1935 | } 1936 | 1937 | /// 1938 | /// Get database info 1939 | /// 1940 | pub fn get_database_info(&self) -> DatabaseInfo { 1941 | let db = self.db.read().unwrap(); 1942 | db.get_info() 1943 | } 1944 | 1945 | /// 1946 | /// Get cache info 1947 | /// 1948 | pub fn get_cache_info(&self) -> CacheInfo { 1949 | let bm = self.buf_mgr.lock().unwrap(); 1950 | CacheInfo { 1951 | used: bm.cached as usize, 1952 | pinned: bm.pinned as usize, 1953 | dirtied: bm.dirtied as usize, 1954 | } 1955 | } 1956 | } 1957 | 1958 | impl Drop for Storage { 1959 | fn drop(&mut self) { 1960 | self.close().unwrap(); 1961 | } 1962 | } 1963 | 1964 | impl<'a> Transaction<'_> { 1965 | /// 1966 | /// Commit transaction 1967 | /// 1968 | pub fn commit(&mut self) -> Result<()> { 1969 | ensure!(self.status == TransactionStatus::InProgress); 1970 | self.storage.commit(&mut self.db)?; 1971 | self.status = TransactionStatus::Committed; 1972 | Ok(()) 1973 | } 1974 | 1975 | /// 1976 | /// Delay commit of transaction 1977 | /// 1978 | pub fn delay(&mut self) -> Result<()> { 1979 | ensure!(self.status == TransactionStatus::InProgress); 1980 | self.db.lsn += 1; 1981 | // mark transaction as committed to prevent implicit rollback by destructor 1982 | self.status = TransactionStatus::Committed; 1983 | Ok(()) 1984 | } 1985 | 1986 | /// 1987 | /// Rollback transaction undoing all changes 1988 | /// 1989 | pub fn rollback(&mut self) -> Result<()> { 1990 | ensure!(self.status == TransactionStatus::InProgress); 1991 | self.storage.rollback(&mut self.db)?; 1992 | self.status = TransactionStatus::Aborted; 1993 | Ok(()) 1994 | } 1995 | 1996 | /// 1997 | /// Insert new key in the storage or update existing key as part of this transaction. 1998 | /// 1999 | pub fn put(&mut self, key: &Key, value: &Value) -> Result<()> { 2000 | ensure!(self.status == TransactionStatus::InProgress); 2001 | self.storage.do_upsert(&mut self.db, key, value)?; 2002 | Ok(()) 2003 | } 2004 | 2005 | /// 2006 | /// Store value for u32 key 2007 | /// 2008 | pub fn put_u32(&mut self, key: u32, value: &Value) -> Result<()> { 2009 | self.put(&key.to_be_bytes().to_vec(), value) 2010 | } 2011 | 2012 | /// 2013 | /// Store value for u64 key 2014 | /// 2015 | pub fn put_u64(&mut self, key: u64, value: &Value) -> Result<()> { 2016 | self.put(&key.to_be_bytes().to_vec(), value) 2017 | } 2018 | 2019 | /// 2020 | /// Remove key from storage as part of this transaction. 2021 | /// Does nothing if the key does not exist.
2022 | /// 2023 | pub fn remove(&mut self, key: &Key) -> Result<()> { 2024 | ensure!(self.status == TransactionStatus::InProgress); 2025 | self.storage.do_remove(&mut self.db, key)?; 2026 | Ok(()) 2027 | } 2028 | 2029 | /// 2030 | /// Remove u32 key from the storage, do nothing if not found 2031 | /// 2032 | pub fn remove_u32(&mut self, key: u32) -> Result<()> { 2033 | self.remove(&key.to_be_bytes().to_vec()) 2034 | } 2035 | 2036 | /// 2037 | /// Remove u64 key from the storage, do nothing if not found 2038 | /// 2039 | pub fn remove_u64(&mut self, key: u64) -> Result<()> { 2040 | self.remove(&key.to_be_bytes().to_vec()) 2041 | } 2042 | 2043 | /// 2044 | /// Iterator through pairs in key ascending order. 2045 | /// Byte-wise comparison is used, to it is up to serializer to enforce proper ordering, 2046 | /// for example for unsigned integer type big-endian encoding should be used. 2047 | /// 2048 | pub fn iter(&self) -> StorageIterator<'_> { 2049 | self.range(..) 2050 | } 2051 | 2052 | /// 2053 | /// Lookup key in the storage. 2054 | /// 2055 | pub fn get(&self, key: &Key) -> Result> { 2056 | let mut iter = self.range((Included(key), Included(key))); 2057 | Ok(iter.next().transpose()?.map(|kv| kv.1)) 2058 | } 2059 | 2060 | /// 2061 | /// Lookup u32 key in the storage. 2062 | /// 2063 | pub fn get_u32(&self, key: u32) -> Result> { 2064 | self.get(&key.to_be_bytes().to_vec()) 2065 | } 2066 | 2067 | /// 2068 | /// Lookup u64 key in the storage. 2069 | /// 2070 | pub fn get_u64(&self, key: u64) -> Result> { 2071 | self.get(&key.to_be_bytes().to_vec()) 2072 | } 2073 | 2074 | /// 2075 | /// Returns bidirectional iterator 2076 | /// 2077 | pub fn range>(&self, range: R) -> StorageIterator<'_> { 2078 | StorageIterator { 2079 | storage: self.storage, 2080 | trans: Some(&self), 2081 | from: range.start_bound().cloned(), 2082 | till: range.end_bound().cloned(), 2083 | left: TreePath::new(), 2084 | right: TreePath::new(), 2085 | } 2086 | } 2087 | /// 2088 | /// Traverse B-Tree, check B-Tree invariants and return total number of keys in B-Tree 2089 | /// 2090 | pub fn verify(&self) -> Result { 2091 | ensure!(self.status == TransactionStatus::InProgress); 2092 | if self.db.meta.root != 0 { 2093 | let mut prev_key = Vec::new(); 2094 | self.storage 2095 | .traverse(self.db.meta.root, &mut prev_key, self.db.meta.height) 2096 | } else { 2097 | Ok(0) 2098 | } 2099 | } 2100 | 2101 | /// 2102 | /// Get database info 2103 | /// 2104 | pub fn get_database_info(&self) -> DatabaseInfo { 2105 | self.db.get_info() 2106 | } 2107 | 2108 | /// 2109 | /// Get cache info 2110 | /// 2111 | pub fn get_cache_info(&self) -> CacheInfo { 2112 | self.storage.get_cache_info() 2113 | } 2114 | } 2115 | 2116 | impl<'a> Drop for Transaction<'a> { 2117 | fn drop(&mut self) { 2118 | if self.status == TransactionStatus::InProgress { 2119 | self.storage.rollback(&mut self.db).unwrap(); 2120 | } 2121 | } 2122 | } 2123 | -------------------------------------------------------------------------------- /tests/storage_spec.rs: -------------------------------------------------------------------------------- 1 | use anyhow::{bail, Result}; 2 | use rand::rngs::StdRng; 3 | use rand::{Rng, SeedableRng}; 4 | use std::convert::TryInto; 5 | use std::iter; 6 | use std::path::Path; 7 | use std::sync::Arc; 8 | use std::thread; 9 | use std::time::Instant; 10 | 11 | use yakv::storage::*; 12 | 13 | const RAND_SEED: u64 = 2021; 14 | const N_RECORDS_LARGE: usize = 1000000; 15 | const N_RECORDS_SMALL: usize = 10000; 16 | 17 | #[test] 18 | fn test_basic_ops() 
{ 19 | let store = open_store("test1.dbs", Some("test1.log")); 20 | { 21 | let mut trans = store.start_transaction(); 22 | trans.put(&v(b"1"), &v(b"one")).unwrap(); 23 | trans.put(&v(b"2"), &v(b"two")).unwrap(); 24 | trans.put(&v(b"3"), &v(b"three")).unwrap(); 25 | trans.put(&v(b"4"), &v(b"four")).unwrap(); 26 | trans.put(&v(b"5"), &v(b"five")).unwrap(); 27 | assert_eq!(trans.get(&v(b"1")).unwrap().unwrap(), v(b"one")); 28 | trans.commit().unwrap(); 29 | } 30 | assert_eq!(store.get(&v(b"1")).unwrap().unwrap(), v(b"one")); 31 | 32 | let mut b = b'1'; 33 | for kv in store.iter().flatten() { 34 | assert_eq!(kv.0, vec![b]); 35 | b += 1; 36 | } 37 | assert_eq!(b, b'6'); 38 | 39 | assert_eq!( 40 | store 41 | .range(..v(b"3")) 42 | .flatten() 43 | .collect::>(), 44 | [(v(b"1"), v(b"one")), (v(b"2"), v(b"two"))] 45 | ); 46 | assert_eq!( 47 | store 48 | .range(v(b"3")..v(b"4")) 49 | .flatten() 50 | .collect::>(), 51 | [(v(b"3"), v(b"three"))] 52 | ); 53 | assert_eq!( 54 | store 55 | .range(v(b"1")..=v(b"2")) 56 | .flatten() 57 | .collect::>(), 58 | [(v(b"1"), v(b"one")), (v(b"2"), v(b"two"))] 59 | ); 60 | assert_eq!( 61 | store 62 | .range(v(b"5")..) 63 | .flatten() 64 | .collect::>(), 65 | [(v(b"5"), v(b"five"))] 66 | ); 67 | 68 | { 69 | let mut it = store.iter(); 70 | assert_eq!(it.next().unwrap().unwrap(), (v(b"1"), v(b"one"))); 71 | assert_eq!(it.next().unwrap().unwrap(), (v(b"2"), v(b"two"))); 72 | assert_eq!(it.next_back().unwrap().unwrap(), (v(b"5"), v(b"five"))); 73 | assert_eq!(it.next_back().unwrap().unwrap(), (v(b"4"), v(b"four"))); 74 | } 75 | { 76 | let mut it = store.range(..v(b"4")); 77 | assert_eq!(it.next_back().unwrap().unwrap(), (v(b"3"), v(b"three"))); 78 | assert_eq!(it.next_back().unwrap().unwrap(), (v(b"2"), v(b"two"))); 79 | assert_eq!(it.next().unwrap().unwrap(), (v(b"1"), v(b"one"))); 80 | assert_eq!(it.next().unwrap().unwrap(), (v(b"2"), v(b"two"))); 81 | } 82 | { 83 | let mut it = store.range(v(b"1")..=v(b"2")); 84 | assert_eq!(it.next().unwrap().unwrap(), (v(b"1"), v(b"one"))); 85 | assert_eq!(it.next().unwrap().unwrap(), (v(b"2"), v(b"two"))); 86 | assert!(it.next().is_none()); 87 | assert_eq!(it.next_back().unwrap().unwrap(), (v(b"2"), v(b"two"))); 88 | assert_eq!(it.next_back().unwrap().unwrap(), (v(b"1"), v(b"one"))); 89 | assert!(it.next_back().is_none()); 90 | } 91 | store.put(v(b"2"), v(b"two-two")).unwrap(); 92 | assert_eq!(store.get(&v(b"1")).unwrap().unwrap(), v(b"one")); 93 | assert_eq!(store.get(&v(b"2")).unwrap().unwrap(), v(b"two-two")); 94 | assert_eq!(store.get(&v(b"3")).unwrap().unwrap(), v(b"three")); 95 | 96 | store.remove(v(b"3")).unwrap(); 97 | assert_eq!( 98 | store 99 | .range(v(b"2")..v(b"5")) 100 | .flatten() 101 | .collect::>(), 102 | [(v(b"2"), v(b"two-two")), (v(b"4"), v(b"four"))] 103 | ); 104 | } 105 | 106 | fn seq_benchmark( 107 | db_path: &str, 108 | log_path: Option<&str>, 109 | n_records: usize, 110 | transaction_size: usize, 111 | ) -> Result<()> { 112 | let payload1: Vec = vec![1u8; 100]; 113 | let payload2: Vec = vec![2u8; 100]; 114 | { 115 | let store = open_store(db_path, log_path); 116 | let mut key: u64 = 0; 117 | 118 | let mut now = Instant::now(); 119 | for _ in 0..n_records / transaction_size { 120 | store.put_all( 121 | &mut iter::repeat_with(|| { 122 | key += 1; 123 | Ok((pack(key), payload1.clone())) 124 | }) 125 | .take(transaction_size), 126 | )?; 127 | } 128 | println!( 129 | "Elapsed time for {} inserts: {:?}", 130 | n_records, 131 | now.elapsed() 132 | ); 133 | 134 | now = Instant::now(); 135 | for i in 1..=n_records { 
136 | let key = (i as u64).to_be_bytes().to_vec(); 137 | assert_eq!(store.get(&key).unwrap().unwrap(), payload1); 138 | } 139 | println!( 140 | "Elapsed time for {} hot lookups: {:?}", 141 | n_records, 142 | now.elapsed() 143 | ); 144 | 145 | now = Instant::now(); 146 | key = 0; 147 | for _ in 0..n_records / transaction_size { 148 | let mut trans = store.start_transaction(); 149 | for _ in 0..transaction_size { 150 | key += 1; 151 | trans.put(&pack(key), &payload2)?; 152 | } 153 | trans.commit()?; 154 | } 155 | println!( 156 | "Elapsed time for {} updates: {:?}", 157 | n_records, 158 | now.elapsed() 159 | ); 160 | 161 | for i in 1..=n_records { 162 | let key = (i as u64).to_be_bytes().to_vec(); 163 | assert_eq!(store.get(&key).unwrap().unwrap(), payload2); 164 | } 165 | } 166 | { 167 | // reopen database 168 | let store = reopen_store(db_path, log_path); 169 | 170 | let mut now = Instant::now(); 171 | for i in 1..=n_records { 172 | let key = (i as u64).to_be_bytes().to_vec(); 173 | assert_eq!(store.get(&key).unwrap().unwrap(), payload2); 174 | } 175 | println!( 176 | "Elapsed time for {} cold lookups: {:?}", 177 | n_records, 178 | now.elapsed() 179 | ); 180 | 181 | now = Instant::now(); 182 | let mut key = 0; 183 | for _ in 0..n_records / transaction_size { 184 | store.remove_all( 185 | &mut iter::repeat_with(|| { 186 | key += 1; 187 | Ok(pack(key)) 188 | }) 189 | .take(transaction_size), 190 | )?; 191 | } 192 | println!( 193 | "Elapsed time for {} removes: {:?}", 194 | n_records, 195 | now.elapsed() 196 | ); 197 | } 198 | Ok(()) 199 | } 200 | 201 | fn rnd_benchmark( 202 | db_path: &str, 203 | log_path: Option<&str>, 204 | n_records: usize, 205 | transaction_size: usize, 206 | ) -> Result<()> { 207 | let payload1: Vec = vec![1u8; 100]; 208 | let payload2: Vec = vec![2u8; 100]; 209 | { 210 | let store = open_store(db_path, log_path); 211 | 212 | let mut rand = StdRng::seed_from_u64(RAND_SEED); 213 | let mut now = Instant::now(); 214 | for _ in 0..n_records / transaction_size { 215 | store.put_all( 216 | &mut iter::repeat_with(|| Ok((rand.gen::<[u8; 8]>().to_vec(), payload1.clone()))) 217 | .take(transaction_size), 218 | )?; 219 | } 220 | println!( 221 | "Elapsed time for {} inserts: {:?}", 222 | n_records, 223 | now.elapsed() 224 | ); 225 | 226 | now = Instant::now(); 227 | rand = StdRng::seed_from_u64(RAND_SEED); 228 | for _ in 0..n_records { 229 | let key = rand.gen::<[u8; 8]>().to_vec(); 230 | assert_eq!(store.get(&key).unwrap().unwrap(), payload1); 231 | } 232 | println!( 233 | "Elapsed time for {} hot lookups: {:?}", 234 | n_records, 235 | now.elapsed() 236 | ); 237 | 238 | now = Instant::now(); 239 | rand = StdRng::seed_from_u64(RAND_SEED); 240 | for _ in 0..n_records / transaction_size { 241 | let mut trans = store.start_transaction(); 242 | for _ in 0..transaction_size { 243 | trans.put(&rand.gen::<[u8; 8]>().to_vec(), &payload2)?; 244 | } 245 | trans.commit()?; 246 | } 247 | println!( 248 | "Elapsed time for {} updates: {:?}", 249 | n_records, 250 | now.elapsed() 251 | ); 252 | 253 | rand = StdRng::seed_from_u64(RAND_SEED); 254 | for _ in 0..n_records { 255 | let key = rand.gen::<[u8; 8]>().to_vec(); 256 | assert_eq!(store.get(&key).unwrap().unwrap(), payload2); 257 | } 258 | } 259 | { 260 | // reopen database 261 | let store = reopen_store(db_path, log_path); 262 | 263 | let mut now = Instant::now(); 264 | let mut rand = StdRng::seed_from_u64(RAND_SEED); 265 | for _ in 1..=n_records { 266 | let key = rand.gen::<[u8; 8]>().to_vec(); 267 | 
assert_eq!(store.get(&key).unwrap().unwrap(), payload2); 268 | } 269 | println!( 270 | "Elapsed time for {} cold lookups: {:?}", 271 | n_records, 272 | now.elapsed() 273 | ); 274 | 275 | now = Instant::now(); 276 | rand = StdRng::seed_from_u64(RAND_SEED); 277 | for _ in 0..n_records / transaction_size { 278 | store.remove_all( 279 | &mut iter::repeat_with(|| Ok(rand.gen::<[u8; 8]>().to_vec())) 280 | .take(transaction_size), 281 | )?; 282 | } 283 | println!( 284 | "Elapsed time for {} removes: {:?}", 285 | n_records, 286 | now.elapsed() 287 | ); 288 | } 289 | Ok(()) 290 | } 291 | 292 | #[test] 293 | fn seq_benchmark_wal_large_trans() { 294 | assert!(seq_benchmark("test2.dbs", Some("test2.log"), N_RECORDS_LARGE, 1000,).is_ok()); 295 | } 296 | 297 | #[test] 298 | fn seq_benchmark_wal_small_trans() { 299 | assert!(seq_benchmark("test3.dbs", Some("test3.log"), N_RECORDS_SMALL, 1,).is_ok()); 300 | } 301 | 302 | #[test] 303 | fn seq_benchmark_nowal_large_trans() { 304 | assert!(seq_benchmark("test4.dbs", None, N_RECORDS_LARGE, 1000,).is_ok()); 305 | } 306 | 307 | #[test] 308 | fn seq_benchmark_nowal_small_trans() { 309 | assert!(seq_benchmark("test5.dbs", None, N_RECORDS_LARGE, 1,).is_ok()); 310 | } 311 | 312 | #[test] 313 | fn rnd_benchmark_wal_large_trans() { 314 | assert!(rnd_benchmark("test6.dbs", Some("test6.log"), N_RECORDS_LARGE, 1000,).is_ok()); 315 | } 316 | 317 | #[test] 318 | fn rnd_benchmark_wal_small_trans() { 319 | assert!(rnd_benchmark("test7.dbs", Some("test7.log"), N_RECORDS_SMALL, 1,).is_ok()); 320 | } 321 | 322 | #[test] 323 | fn rnd_benchmark_nowal_large_trans() { 324 | assert!(rnd_benchmark("test8.dbs", None, N_RECORDS_LARGE, 1000,).is_ok()); 325 | } 326 | 327 | #[test] 328 | fn rnd_benchmark_nowal_small_trans() { 329 | assert!(rnd_benchmark("test9.dbs", None, N_RECORDS_LARGE, 1,).is_ok()); 330 | } 331 | 332 | #[test] 333 | fn test_acid() { 334 | let store = open_store("test10.dbs", Some("test10.log")); 335 | 336 | assert!(store 337 | .put_all(&mut (0..100).map(|key| { 338 | if key == 50 { 339 | bail!("Simulate failure") 340 | } else { 341 | Ok((pack(key), v(b"hello world!"))) 342 | } 343 | })) 344 | .is_err()); 345 | 346 | assert_eq!(store.iter().count(), 0); 347 | 348 | assert!(store 349 | .put_all(&mut (0..100).map(|key| { Ok((pack(key), v(b"hello world!"))) })) 350 | .is_ok()); 351 | 352 | assert!(store 353 | .put_all(&mut (0..100).map(|key| { 354 | if key == 50 { 355 | bail!("Simulate failure") 356 | } else { 357 | Ok((pack(key), v(b"good bye!"))) 358 | } 359 | })) 360 | .is_err()); 361 | 362 | assert_eq!( 363 | store 364 | .iter() 365 | .flatten() 366 | .map(|kv| assert_eq!(kv.1, v(b"hello world!"))) 367 | .count(), 368 | 100 369 | ); 370 | 371 | assert!(store 372 | .remove_all(&mut (0..100).map(|key| { 373 | if key == 50 { 374 | bail!("Simulate failure") 375 | } else { 376 | Ok(pack(key)) 377 | } 378 | })) 379 | .is_err()); 380 | 381 | assert_eq!(store.iter().count(), 100); 382 | } 383 | 384 | #[test] 385 | fn test_recovery() { 386 | let data_path = Path::new("test11.dbs"); 387 | let log_path = Path::new("test11.log"); 388 | const N_KEYS: u64 = 100000; 389 | { 390 | let _ = std::fs::remove_file(&data_path); 391 | let _ = std::fs::remove_file(&log_path); 392 | let mut cfg = StorageConfig::default(); 393 | cfg.wal_flush_threshold = 1; 394 | let store = Storage::open(data_path, Some(log_path), cfg).unwrap(); 395 | { 396 | let mut trans = store.start_transaction(); 397 | for key in 0..N_KEYS { 398 | trans.put(&pack(key), &v(b"first")).unwrap(); 399 | } 400 | 
trans.commit().unwrap(); 401 | } 402 | { 403 | let mut trans = store.start_transaction(); 404 | for key in 0..N_KEYS { 405 | trans.put(&pack(key), &v(b"two")).unwrap(); 406 | } 407 | trans.commit().unwrap(); 408 | } 409 | { 410 | let mut trans = store.start_transaction(); 411 | for key in 0..N_KEYS { 412 | trans.put(&pack(key), &v(b"three")).unwrap(); 413 | } 414 | // transaction shoud be implicitly aborted 415 | } 416 | store.shutdown().unwrap(); // do not truncate WAL 417 | } 418 | { 419 | let store = Storage::open(data_path, Some(log_path), StorageConfig::default()).unwrap(); 420 | let recovery = store.get_recovery_status(); 421 | assert_eq!(recovery.recovered_transactions, 2); 422 | assert!(recovery.wal_size > recovery.recovery_end); 423 | for key in 0..N_KEYS { 424 | assert_eq!(store.get(&pack(key)).unwrap().unwrap(), v(b"two")); 425 | } 426 | } 427 | } 428 | 429 | fn do_inserts(s: Arc, tid: u32, n_records: u32) -> Result<()> { 430 | let tid_bytes = tid.to_be_bytes(); 431 | for id in 0..n_records { 432 | let mut key: Vec = Vec::new(); 433 | key.extend_from_slice(&id.to_be_bytes()); 434 | key.extend_from_slice(&tid_bytes); 435 | s.put(key, tid_bytes.to_vec())?; 436 | } 437 | Ok(()) 438 | } 439 | 440 | fn do_selects(s: Arc, n_records: usize) { 441 | while s.iter().count() != n_records {} 442 | } 443 | 444 | #[test] 445 | fn test_parallel_access() { 446 | let store = Arc::new(open_store("test1.dbs", None)); 447 | let n_writers = 10u32; 448 | let n_records = 10000u32; 449 | let mut threads = Vec::new(); 450 | for i in 0..n_writers { 451 | let s = store.clone(); 452 | threads.push(thread::spawn(move || { 453 | do_inserts(s, i, n_records).unwrap(); 454 | })); 455 | } 456 | let s = store.clone(); 457 | threads.push(thread::spawn(move || { 458 | do_selects(s, n_records as usize * n_writers as usize); 459 | })); 460 | for t in threads { 461 | t.join().expect("Thread crashed"); 462 | } 463 | let mut id = 0u32; 464 | let mut tid = 0u32; 465 | for entry in store.iter() { 466 | let pair = entry.unwrap(); 467 | let key = pair.0; 468 | let value = pair.1; 469 | let curr_id = u32::from_be_bytes(key[0..4].try_into().unwrap()); 470 | let curr_tid = u32::from_be_bytes(key[4..8].try_into().unwrap()); 471 | let curr_value = u32::from_be_bytes(value.try_into().unwrap()); 472 | assert_eq!(curr_id, id); 473 | assert_eq!(curr_tid, tid); 474 | assert_eq!(curr_value, tid); 475 | tid += 1; 476 | if tid == n_writers { 477 | tid = 0; 478 | id += 1; 479 | } 480 | } 481 | assert_eq!(id, n_records); 482 | } 483 | 484 | fn v(b: &[u8]) -> Key { 485 | b.to_vec() 486 | } 487 | 488 | fn pack(key: u64) -> Vec { 489 | key.to_be_bytes().to_vec() 490 | } 491 | 492 | fn open_store(data_file: &str, log_file: Option<&str>) -> Storage { 493 | let data_path = Path::new(data_file); 494 | let log_path = log_file.map(|wal| Path::new(wal)); 495 | let _ = std::fs::remove_file(&data_path); 496 | if let Some(log) = log_path { 497 | let _ = std::fs::remove_file(&log); 498 | } 499 | Storage::open(data_path, log_path, StorageConfig::default()).unwrap() 500 | } 501 | 502 | fn reopen_store(data_file: &str, log_file: Option<&str>) -> Storage { 503 | let data_path = Path::new(data_file); 504 | let log_path = log_file.map(|wal| Path::new(wal)); 505 | Storage::open(data_path, log_path, StorageConfig::default()).unwrap() 506 | } 507 | --------------------------------------------------------------------------------
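
A minimal usage sketch of the public API exercised above. This is not a file from the repository: it is an illustrative example assuming the `Storage`/`Transaction` interface shown in src/storage.rs and tests/storage_spec.rs; the file names "example.dbs" and "example.log" and the `main` wrapper are hypothetical.

use std::path::Path;
use yakv::storage::*;

fn main() -> anyhow::Result<()> {
    // Open (or create) a database file with a write-ahead log for durability.
    let store = Storage::open(
        Path::new("example.dbs"),         // hypothetical data file name
        Some(Path::new("example.log")),   // hypothetical WAL file name
        StorageConfig::default(),
    )?;

    // Autocommit operations: each call runs as its own transaction.
    store.put(b"1".to_vec(), b"one".to_vec())?;
    assert_eq!(store.get(&b"1".to_vec())?, Some(b"one".to_vec()));

    // Explicit transaction: changes take effect only after commit();
    // dropping an uncommitted transaction rolls it back.
    let mut trans = store.start_transaction();
    trans.put(&b"2".to_vec(), &b"two".to_vec())?;
    trans.commit()?;

    // Bidirectional range iteration in byte-wise ascending key order.
    for kv in store.range(b"1".to_vec()..).flatten() {
        println!("{:?} -> {:?}", kv.0, kv.1);
    }

    // Sync the data file and truncate the WAL on normal shutdown.
    store.close()?;
    Ok(())
}

Keys and values are plain byte vectors, so (as the tests do with `to_be_bytes`) big-endian encoding should be used for integer keys to keep the byte-wise ordering meaningful.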