├── Cargo.toml ├── LICENSE ├── README.md ├── run-tests.sh ├── src ├── lib.rs └── storage.rs └── tests └── storage_spec.rs /Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "yakv" 3 | version = "0.2.0" 4 | authors = ["Konstantin Knizhnik "] 5 | edition = "2018" 6 | license = "MIT OR Apache-2.0" 7 | description = "Simple persistent key-value storage based on B-Tree" 8 | repository = "https://github.com/knizhnik/yakv.git" 9 | readme = "README.md" 10 | 11 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html 12 | 13 | [dependencies] 14 | crc32c = "0.6.0" 15 | fs2 = "0.4.3" 16 | anyhow = "1.0" 17 | 18 | [dev-dependencies] 19 | rand = "0.8.3" 20 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 
47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | **YAKV** is a very simple persistent key-value storage implemented in Rust 2 | using a "traditional" architecture: B-Tree, buffer cache, ACID transactions, write-ahead log. 3 | **YAKV** implements a simple MURSIW (multiple-readers-single-writer) access pattern 4 | and is primarily oriented toward embedded applications.
5 | 6 | It has minimal dependencies on other crates and contains just 2k lines of code. 7 | The storage API is very simple: `put/remove` methods for updating information 8 | and `get/iter/range` methods for retrieving it. 9 | `put` performs an update or insert (upsert): if the key is not present in the storage, it is inserted; 10 | otherwise the associated value is updated. Bulk versions of `put/remove` are available which accept iterators 11 | of pairs/keys. Bulk updates are atomic: either all operations succeed or all are rejected. 12 | 13 | Iteration can be done using a bidirectional (_double-ended_) iterator and standard Rust ranges, which 14 | make it easy to specify ranges with open/inclusive/exclusive boundaries. Iterators are not atomic: 15 | during iteration you see the most recent committed updates of the storage. Moreover, 16 | if a concurrent update deletes the key at the current iterator position, iteration will stop before processing all results. 17 | 18 | Another way of grouping operations is to explicitly start a transaction. 19 | A transaction has exclusive access to the database and can perform both update and lookup operations. 20 | At the end the transaction should be explicitly committed or aborted; if it was not committed before leaving the scope, 21 | it is implicitly aborted. 22 | 23 | **YAKV** supports multi-threaded access to the storage. All threads can share a single storage instance (wrap it in an `Arc`). 24 | The storage implementation is thread-safe and all methods are immutable (they take `&self`), so you can call them concurrently from different threads. 25 | But only one thread can update the database at any moment: other readers and writers will be blocked until the end of the transaction. 26 | Readers can work in parallel. 27 | 28 | **YAKV** requires keys and values to be vectors of bytes. If you want to store other types, you need to serialize them first. 29 | If you need to preserve the natural comparison order of the underlying type, you will have to use a proper serializer. 30 | For example, for unsigned integer types you need to use _big-endian_ encoding (i.e. `key.to_be_bytes()`) 31 | so that byte-wise comparison of the vectors produces the same result as comparison of the two numbers. 32 | For signed or floating point types writing such a serializer may require more effort. 33 | 34 | **YAKV** optionally maintains a write-ahead log (WAL) to provide ACID guarantees. 35 | Maintaining the WAL requires `fsync` system calls to force persisting data to non-volatile media. 36 | This adds a significant performance penalty, especially for small transactions (e.g. inserting just one pair). 37 | But without the WAL the database can be corrupted in case of abnormal program termination or power failure. 38 | To disable the WAL just pass `None` instead of the WAL file path.
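As a quick illustration of the points above (sharing one storage instance between threads via `Arc`, big-endian integer keys and the optional WAL), here is a minimal sketch; it assumes the `Storage::open`/`put`/`get` API shown in the full example below, and the paths and worker count are just placeholders:

```
use std::sync::Arc;
use std::thread;

// Pass None instead of Some(log_path) to run without a WAL.
let store = Arc::new(Storage::open(data_path, Some(log_path), StorageConfig::default())?);

let mut workers = Vec::new();
for id in 0..4u64 {
    let store = Arc::clone(&store);
    workers.push(thread::spawn(move || {
        // Big-endian encoding makes byte-wise key order match numeric order.
        let key = id.to_be_bytes().to_vec();
        let value = format!("value-{}", id).into_bytes();
        store.put(key, value).unwrap();
    }));
}
for w in workers {
    w.join().unwrap();
}

// Any thread (here the main one) can read the data back.
let key = 2u64.to_be_bytes().to_vec();
if let Some(value) = store.get(&key)? {
    println!("key={:?}, value={:?}", &key, &value);
}
```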
39 | 40 | Below is an example of **YAKV** usage: 41 | 42 | ``` 43 | let store = Storage::open(data_path, Some(log_path), StorageConfig::default())?; 44 | 45 | // Simple insert/update: 46 | let key = b"Some key".to_vec(); 47 | let value = b"Some value".to_vec(); 48 | store.put(key, value)?; 49 | 50 | // Simple remove: 51 | let key = b"Some key".to_vec(); 52 | store.remove(key)?; 53 | 54 | // Bulk update 55 | store.put_all( 56 | &mut iter::repeat_with(|| { 57 | let key = rand.gen::<[u8; KEY_LEN]>().to_vec(); 58 | let value = rand.gen::<[u8; VALUE_LEN]>().to_vec(); 59 | Ok((key, value)) 60 | } ).take(TRANSACTION_SIZE))?; 61 | 62 | // Bulk delete 63 | store.remove_all( 64 | &mut iter::repeat_with(|| Ok(rand.gen::<[u8; 8]>().to_vec())) 65 | .take(TRANSACTION_SIZE))?; 66 | 67 | // Explicit transaction: 68 | { 69 | let trans = store.start_transaction(); 70 | trans.put(&1u64.to_be_bytes().to_vec(), &PAYLOAD)?; 71 | trans.put(&2u64.to_be_bytes().to_vec(), &PAYLOAD)?; 72 | trans.remove(&2u64.to_be_bytes().to_vec())?; 73 | trans.commit()?; 74 | } 75 | 76 | // Simple lookup 77 | let key = b"Some key".to_vec(); 78 | if let Some(value) = store.get(&key)? { 79 | println!("key={:?}, value={:?}", &key, &value); 80 | } 81 | 82 | // Iterate through all records: 83 | for entry in store.iter() { 84 | let kv = entry?; 85 | println!("key={:?}, value={:?}", &kv.0, &kv.1); 86 | } 87 | 88 | // Range iterator: 89 | let from_key = b"AAA".to_vec(); 90 | let till_key = b"ZZZ".to_vec(); 91 | for entry in store.range(from_key..till_key) { 92 | let kv = entry?; 93 | println!("key={:?}, value={:?}", &kv.0, &kv.1); 94 | } 95 | 96 | // Backward iteration: 97 | let till_key = b"XYZ".to_vec(); 98 | let mut it = store.range(..=till_key); 99 | while let Some(entry) = it.next_back() { 100 | let kv = entry?; 101 | println!("key={:?}, value={:?}", &kv.0, &kv.1); 102 | } 103 | 104 | // Close storage 105 | store.close()?; 106 | ``` 107 | 108 | Performance comparison: 109 | 110 | Below are results (msec: smaller is better) of two benchmarks. 111 | 112 | SwayDB benchmark: insertion of 1M records with 8-byte keys and 8-byte values. 113 | 114 | | db | seq | rnd | 115 | | ------- | ----- | ----- | 116 | | SwayDB | 5526 | 14849 | 117 | | LevelDB | 1107 | 7969 | 118 | | yakv | 594 | 1263 | 119 | 120 | 121 | LMDB benchmark: insert+read of 1M records with 4-byte keys and 100-byte values. 122 | 123 | | db | seq-write | rnd-write | seq-read | rnd-read | 124 | | --------- | --------- | --------- | -------- | -------- | 125 | | Chronicle | 836 | 894 | 613 | 634 | 126 | | LevelDB | 1962 | 2089 | 2223 | 2476 | 127 | | LMDB | 144 | 896 | 108 | 634 | 128 | | MapDB | 8876 | 9304 | 9676 | 9707 | 129 | | MVStore | 1328 | 1675 | 7757 | 8420 | 130 | | RocksDB | 1800 | 1850 | 1814 | 2067 | 131 | | Xodus | 674 | 13981 | 3486 | 4978 | 132 | | kv | 5626 | 7546 | 742 | 1943 | 133 | | yakv | 1079 | 1617 | 549 | 1020 | 134 | 135 | 136 | Performance dependency on transaction size (LMDB vs. YAKV, or COW vs. WAL).
137 | This benchmark inserts 1M random keys (as in the LMDB benchmark), 138 | but the inserts are grouped into transactions (time in msec): 139 | 140 | | tx size | yakv | LMDB | 141 | | ------- | ------ | ------ | 142 | | 1000000 | 1543 | 1367 | 143 | | 100000 | 3914 | 3022 | 144 | | 10000 | 16384 | 8139 | 145 | | 1000 | 30944 | 16881 | 146 | | 100 | 85268 | 70775 | 147 | | 10 | 192179 | 229538 | 148 | 149 | So for very large transactions LMDB is slightly faster, for very small transactions YAKV is faster, 150 | and for medium-sized transactions LMDB is about two times faster than YAKV. 151 | -------------------------------------------------------------------------------- /run-tests.sh: -------------------------------------------------------------------------------- 1 | cargo +nightly test --release -v -- --nocapture --test-threads=1 2 | -------------------------------------------------------------------------------- /src/lib.rs: -------------------------------------------------------------------------------- 1 | pub mod storage; 2 | -------------------------------------------------------------------------------- /src/storage.rs: -------------------------------------------------------------------------------- 1 | use anyhow::{ensure, Result}; 2 | use crc32c::*; 3 | use fs2::FileExt; 4 | use std::cmp::Ordering; 5 | use std::convert::TryInto; 6 | use std::fs::{File, OpenOptions}; 7 | use std::iter; 8 | use std::ops::Bound::*; 9 | use std::ops::{Bound, RangeBounds}; 10 | use std::os::unix::prelude::FileExt as UnixFileExt; 11 | use std::path::Path; 12 | use std::sync::{Condvar, Mutex, RwLock, RwLockWriteGuard}; 13 | 14 | type PageId = u32; // address of page in the file 15 | type BufferId = u32; // index of page in buffer cache 16 | type LSN = u64; // logical serial number: monotonically increasing counter of database state changes 17 | type ItemPointer = usize; // offset within page; actually only 16 bits are enough, but use usize to avoid type casts when used as an index 18 | 19 | /// 20 | /// Storage key type. If you want to use some other type as a key, you will have to serialize it (for example using serde). 21 | /// Since vectors are compared using byte-by-byte comparison, you need to take this into account during serialization if 22 | /// you need to preserve the order of the original type. For example, unsigned integer types should be serialized as big-endian (most significant byte first). 23 | /// 24 | pub type Key = Vec<u8>; 25 | 26 | /// 27 | /// Storage value type. All other types should be serialized to a vector of bytes.
28 | /// 29 | pub type Value = Vec<u8>; 30 | 31 | const PAGE_SIZE: usize = 8192; 32 | const MAGIC: u32 = 0xBACE2021; 33 | const VERSION: u32 = 1; 34 | const METADATA_SIZE: usize = 7 * 4; 35 | const PAGE_HEADER_SIZE: usize = 2; // now page header contains just the number of items in the page 36 | const MAX_VALUE_LEN: usize = PAGE_SIZE / 4; // assume that a page may fit at least 3 items 37 | const MAX_KEY_LEN: usize = u8::MAX as usize; // should fit in one byte 38 | const N_BUSY_EVENTS: usize = 8; // number of condition variables used to wait for read completion 39 | 40 | // Flags for page state 41 | const PAGE_RAW: u16 = 1; // buffer content is uninitialized 42 | const PAGE_BUSY: u16 = 2; // buffer is being loaded from the disk 43 | const PAGE_DIRTY: u16 = 4; // buffer was updated 44 | const PAGE_WAIT: u16 = 8; // some thread waits until buffer is loaded 45 | const PAGE_SYNCED: u16 = 16; // dirty page was saved to the log 46 | 47 | enum LookupOp<'a> { 48 | First, 49 | Last, 50 | Next, 51 | Prev, 52 | GreaterOrEqual(&'a Key), 53 | } 54 | 55 | #[derive(PartialEq, Copy, Clone, Debug)] 56 | pub enum DatabaseState { 57 | InRecovery, 58 | Opened, 59 | Closed, 60 | Corrupted, 61 | } 62 | 63 | /// 64 | /// Storage configuration parameters 65 | /// 66 | #[derive(Copy, Clone, Debug)] 67 | pub struct StorageConfig { 68 | /// Buffer pool size (pages) 69 | pub cache_size: usize, 70 | /// Maximal size of WAL. When it is reached, the database file is synced and the WAL is rotated 71 | /// (writing starts from the beginning) 72 | pub checkpoint_interval: u64, 73 | /// Threshold for flushing dirty pages to WAL (to reduce commit time) 74 | pub wal_flush_threshold: BufferId, 75 | } 76 | 77 | impl StorageConfig { 78 | pub fn default() -> StorageConfig { 79 | StorageConfig { 80 | cache_size: 128 * 1024, // 1Gb 81 | checkpoint_interval: 1u64 * 1024 * 1024 * 1024, // 1Gb 82 | wal_flush_threshold: BufferId::MAX, 83 | } 84 | } 85 | } 86 | 87 | #[derive(PartialEq)] 88 | enum AccessMode { 89 | ReadOnly, 90 | WriteOnly, 91 | } 92 | 93 | /// 94 | /// Status of transaction 95 | /// 96 | #[derive(PartialEq)] 97 | pub enum TransactionStatus { 98 | InProgress, 99 | Committed, 100 | Aborted, 101 | } 102 | 103 | /// 104 | /// Explicitly started transaction. Storage can be updated in autocommit mode 105 | /// or using an explicitly started transaction. 106 | /// 107 | pub struct Transaction<'a> { 108 | pub status: TransactionStatus, 109 | storage: &'a Storage, 110 | db: RwLockWriteGuard<'a, Database>, 111 | } 112 | 113 | /// 114 | /// Status of automatic database recovery after open. 115 | /// In case of a start after normal shutdown all fields should be zero.
116 | /// 117 | #[derive(Clone, Copy, Default)] 118 | pub struct RecoveryStatus { 119 | /// number of recovered transactions 120 | pub recovered_transactions: u64, 121 | /// size of WAL at the moment of recovery 122 | pub wal_size: u64, 123 | /// position of last recovery transaction in WAL 124 | pub recovery_end: u64, 125 | } 126 | 127 | /// 128 | /// Database info 129 | /// 130 | #[derive(Clone, Copy, Debug)] 131 | pub struct DatabaseInfo { 132 | /// Height of B-Tree 133 | pub tree_height: usize, 134 | /// Size of database file 135 | pub db_size: u64, 136 | /// Total size of used pages 137 | pub db_used: u64, 138 | /// Current WAL size 139 | pub log_size: u64, 140 | /// number of committed transactions in this session 141 | pub n_committed_transactions: u64, 142 | /// number of aborted transactions in this session 143 | pub n_aborted_transactions: u64, 144 | /// State of the database 145 | pub state: DatabaseState, 146 | } 147 | 148 | /// 149 | /// Buffer cache info 150 | /// 151 | #[derive(Clone, Copy, Debug)] 152 | pub struct CacheInfo { 153 | /// Number of pinned pages in buffer cache 154 | pub pinned: usize, 155 | /// Number of dirty pages in buffer cache 156 | pub dirtied: usize, 157 | /// Total number of pages in buffer cache 158 | pub used: usize, 159 | } 160 | 161 | /// 162 | /// Abstract storage bidirectional iterator 163 | /// 164 | pub struct StorageIterator<'a> { 165 | storage: &'a Storage, 166 | trans: Option<&'a Transaction<'a>>, 167 | from: Bound, 168 | till: Bound, 169 | left: TreePath, 170 | right: TreePath, 171 | } 172 | 173 | // 174 | // Position in the page saved during tree traversal 175 | // 176 | struct PagePos { 177 | pid: PageId, 178 | pos: usize, 179 | } 180 | 181 | // 182 | // Path in B-Tree to the current iterator's element 183 | // 184 | struct TreePath { 185 | curr: Option<(Key, Value)>, // current (key,value) pair if any 186 | result: Option>, 187 | stack: Vec, // stack of positions in B-Tree 188 | lsn: LSN, // LSN of last operation 189 | } 190 | 191 | impl TreePath { 192 | fn new() -> TreePath { 193 | TreePath { 194 | curr: None, 195 | result: None, 196 | stack: Vec::new(), 197 | lsn: 0, 198 | } 199 | } 200 | } 201 | 202 | impl<'a> StorageIterator<'a> { 203 | fn next_locked(&mut self, db: &Database) -> Option< as Iterator>::Item> { 204 | if self.left.stack.len() == 0 { 205 | match &self.from { 206 | Bound::Included(key) => { 207 | self.storage 208 | .lookup(db, LookupOp::GreaterOrEqual(key), &mut self.left) 209 | } 210 | Bound::Excluded(key) => { 211 | self.storage 212 | .lookup(db, LookupOp::GreaterOrEqual(key), &mut self.left); 213 | if let Some((curr_key, _value)) = &self.left.curr { 214 | if curr_key == key { 215 | self.storage.lookup(db, LookupOp::Next, &mut self.left); 216 | } 217 | } 218 | } 219 | Bound::Unbounded => self.storage.lookup(db, LookupOp::First, &mut self.left), 220 | } 221 | } else { 222 | self.storage.lookup(db, LookupOp::Next, &mut self.left); 223 | } 224 | if let Some((curr_key, _value)) = &self.left.curr { 225 | match &self.till { 226 | Bound::Included(key) => { 227 | if curr_key > key { 228 | return None; 229 | } 230 | } 231 | Bound::Excluded(key) => { 232 | if curr_key >= key { 233 | return None; 234 | } 235 | } 236 | Bound::Unbounded => {} 237 | } 238 | } 239 | self.left.result.take() 240 | } 241 | 242 | fn next_back_locked( 243 | &mut self, 244 | db: &Database, 245 | ) -> Option< as Iterator>::Item> { 246 | if self.right.stack.len() == 0 { 247 | match &self.till { 248 | Bound::Included(key) => { 249 | self.storage 250 | 
.lookup(db, LookupOp::GreaterOrEqual(key), &mut self.right); 251 | if let Some((curr_key, _value)) = &self.right.curr { 252 | if curr_key > key { 253 | self.storage.lookup(db, LookupOp::Prev, &mut self.right); 254 | } 255 | } else { 256 | self.storage.lookup(db, LookupOp::Last, &mut self.right); 257 | } 258 | } 259 | Bound::Excluded(key) => { 260 | self.storage 261 | .lookup(db, LookupOp::GreaterOrEqual(key), &mut self.right); 262 | if let Some((curr_key, _value)) = &self.right.curr { 263 | if curr_key >= key { 264 | self.storage.lookup(db, LookupOp::Prev, &mut self.right); 265 | } 266 | } else { 267 | self.storage.lookup(db, LookupOp::Last, &mut self.right); 268 | } 269 | } 270 | Bound::Unbounded => self.storage.lookup(db, LookupOp::Last, &mut self.right), 271 | } 272 | } else { 273 | self.storage.lookup(db, LookupOp::Prev, &mut self.right); 274 | } 275 | if let Some((curr_key, _value)) = &self.right.curr { 276 | match &self.from { 277 | Bound::Included(key) => { 278 | if curr_key < key { 279 | return None; 280 | } 281 | } 282 | Bound::Excluded(key) => { 283 | if curr_key <= key { 284 | return None; 285 | } 286 | } 287 | Bound::Unbounded => {} 288 | } 289 | } 290 | self.right.result.take() 291 | } 292 | } 293 | 294 | impl<'a> Iterator for StorageIterator<'a> { 295 | type Item = Result<(Key, Value)>; 296 | 297 | fn next(&mut self) -> Option { 298 | if let Some(trans) = self.trans { 299 | assert!(trans.status == TransactionStatus::InProgress); 300 | self.next_locked(&trans.db) 301 | } else { 302 | let db = self.storage.db.read().unwrap(); 303 | self.next_locked(&db) 304 | } 305 | } 306 | } 307 | 308 | impl<'a> DoubleEndedIterator for StorageIterator<'a> { 309 | fn next_back(&mut self) -> Option { 310 | if let Some(trans) = self.trans { 311 | assert!(trans.status == TransactionStatus::InProgress); 312 | self.next_back_locked(&trans.db) 313 | } else { 314 | let db = self.storage.db.read().unwrap(); 315 | self.next_back_locked(&db) 316 | } 317 | } 318 | } 319 | 320 | /// 321 | /// Persistent key-value storage implementation 322 | /// Update operations are atomic, select operations are non-atomic and observe most recent database state. 323 | /// 324 | pub struct Storage { 325 | db: RwLock, 326 | buf_mgr: Mutex, 327 | busy_events: [Condvar; N_BUSY_EVENTS], 328 | pool: Vec>, 329 | conf: StorageConfig, 330 | file: File, 331 | log: Option, 332 | } 333 | 334 | // 335 | // Page header in buffer manager 336 | // 337 | #[derive(Clone, Copy, Default)] 338 | struct PageHeader { 339 | pid: PageId, 340 | collision: BufferId, // collision chain 341 | // LRU l2-list 342 | next: BufferId, 343 | prev: BufferId, 344 | access_count: u16, 345 | state: u16, // bitmask of PAGE_RAW, PAGE_DIRTY, ... 
346 | } 347 | 348 | impl PageHeader { 349 | fn new() -> PageHeader { 350 | Default::default() 351 | } 352 | } 353 | 354 | // 355 | // Database metadata 356 | // 357 | #[derive(Copy, Clone)] 358 | struct Metadata { 359 | magic: u32, // storage magic 360 | version: u32, // storage format version 361 | free: PageId, // L1 list of free pages 362 | size: PageId, // size of database (pages) 363 | used: PageId, // number of used database pages 364 | root: PageId, // B-Tree root page 365 | height: u32, // height of B-Tree 366 | } 367 | 368 | impl Metadata { 369 | fn pack(self) -> [u8; METADATA_SIZE] { 370 | unsafe { std::mem::transmute::(self) } 371 | } 372 | fn unpack(buf: &[u8]) -> Metadata { 373 | unsafe { 374 | std::mem::transmute::<[u8; METADATA_SIZE], Metadata>( 375 | buf[0..METADATA_SIZE].try_into().unwrap(), 376 | ) 377 | } 378 | } 379 | } 380 | 381 | // 382 | // Database shared state 383 | // 384 | struct Database { 385 | meta: Metadata, // cached metadata (stored in root page) 386 | meta_updated: bool, // whether metadata was updated 387 | lsn: LSN, // database modification counter 388 | n_aborted_txns: LSN, // number of aborted transactions 389 | state: DatabaseState, // database state 390 | wal_pos: u64, // current position in log file 391 | tx_crc: u32, // accumulated CRC of the current transaction 392 | tx_size: usize, // current transaction size 393 | recovery: RecoveryStatus, // status of recovery 394 | } 395 | 396 | impl Database { 397 | fn get_info(&self) -> DatabaseInfo { 398 | DatabaseInfo { 399 | db_size: self.meta.size as u64 * PAGE_SIZE as u64, 400 | db_used: self.meta.used as u64 * PAGE_SIZE as u64, 401 | tree_height: self.meta.height as usize, 402 | log_size: self.wal_pos, 403 | state: self.state, 404 | n_committed_transactions: self.lsn, 405 | n_aborted_transactions: self.n_aborted_txns, 406 | } 407 | } 408 | } 409 | 410 | // 411 | // Buffer manager is using L2-list for LRU cache eviction policy, 412 | // L1 lists for free and dirty pages. 413 | // All modified pages are pinned till the end of transaction. 414 | // Indexes are used instead of pointers to reduce memory footprint and bypass Rust ownership/visibility rules. 415 | // Page with index 0 is reserved in buffer manager for root page. It is not included in any list, so 0 is treated as terminator. 
416 | // 417 | struct BufferManager { 418 | // LRU l2-list 419 | head: BufferId, 420 | tail: BufferId, 421 | 422 | free_pages: BufferId, // L1-list of free pages 423 | dirty_pages: BufferId, // L2-list of dirty pages 424 | next_sync: BufferId, // next page to be written to WAL 425 | 426 | used: BufferId, // used part of page pool 427 | pinned: BufferId, // amount of pinned pages 428 | dirtied: BufferId, // amount of dirty pages 429 | cached: BufferId, // amount of cached pages 430 | 431 | hash_table: Vec, // array containing indexes of collision chains 432 | pages: Vec, // page data 433 | } 434 | 435 | // 436 | // Wrapper class for accessing page data 437 | // 438 | struct PageData { 439 | data: [u8; PAGE_SIZE], 440 | } 441 | 442 | impl PageData { 443 | fn new() -> PageData { 444 | PageData { 445 | data: [0u8; PAGE_SIZE], 446 | } 447 | } 448 | } 449 | 450 | impl PageData { 451 | fn get_offs(&self, ip: ItemPointer) -> usize { 452 | self.get_u16(PAGE_HEADER_SIZE + ip * 2) as usize 453 | } 454 | 455 | fn set_offs(&mut self, ip: ItemPointer, offs: usize) { 456 | self.set_u16(PAGE_HEADER_SIZE + ip * 2, offs as u16) 457 | } 458 | 459 | fn get_child(&self, ip: ItemPointer) -> PageId { 460 | let offs = self.get_offs(ip); 461 | let key_len = self.data[offs] as usize; 462 | self.get_u32(offs + key_len + 1) 463 | } 464 | 465 | fn get_key(&self, ip: ItemPointer) -> Key { 466 | let offs = self.get_offs(ip); 467 | let key_len = self.data[offs] as usize; 468 | self.data[offs + 1..offs + 1 + key_len].to_vec() 469 | } 470 | 471 | fn get_last_key(&self) -> Key { 472 | let n_items = self.get_n_items(); 473 | self.get_key(n_items - 1) 474 | } 475 | 476 | fn get_item(&self, ip: ItemPointer) -> (Key, Value) { 477 | let (item_offs, item_len) = self.get_item_offs_len(ip); 478 | let key_len = self.data[item_offs] as usize; 479 | ( 480 | self.data[item_offs + 1..item_offs + 1 + key_len].to_vec(), 481 | self.data[item_offs + 1 + key_len..item_offs + item_len].to_vec(), 482 | ) 483 | } 484 | 485 | fn get_item_offs_len(&self, ip: ItemPointer) -> (usize, usize) { 486 | let offs = self.get_offs(ip); 487 | let next_offs = if ip == 0 { 488 | PAGE_SIZE 489 | } else { 490 | self.get_offs(ip - 1) 491 | }; 492 | debug_assert!(next_offs > offs); 493 | (offs, next_offs - offs) 494 | } 495 | 496 | fn set_u16(&mut self, offs: usize, data: u16) { 497 | self.copy(offs, &data.to_be_bytes()); 498 | } 499 | 500 | fn set_u32(&mut self, offs: usize, data: u32) { 501 | self.copy(offs, &data.to_be_bytes()); 502 | } 503 | 504 | fn get_u16(&self, offs: usize) -> u16 { 505 | u16::from_be_bytes(self.data[offs..offs + 2].try_into().unwrap()) 506 | } 507 | 508 | fn get_u32(&self, offs: usize) -> u32 { 509 | u32::from_be_bytes(self.data[offs..offs + 4].try_into().unwrap()) 510 | } 511 | 512 | fn get_n_items(&self) -> ItemPointer { 513 | self.get_u16(0) as ItemPointer 514 | } 515 | 516 | fn get_size(&self) -> ItemPointer { 517 | let n_items = self.get_n_items(); 518 | if n_items == 0 { 519 | 0 520 | } else { 521 | PAGE_SIZE - self.get_offs(n_items - 1) 522 | } 523 | } 524 | 525 | fn set_n_items(&mut self, n_items: ItemPointer) { 526 | self.set_u16(0, n_items as u16) 527 | } 528 | 529 | fn copy(&mut self, offs: usize, data: &[u8]) { 530 | let len = data.len(); 531 | self.data[offs..offs + len].copy_from_slice(&data); 532 | } 533 | 534 | fn compare_key(&self, ip: ItemPointer, key: &Key) -> Ordering { 535 | let offs = self.get_offs(ip); 536 | let key_len = self.data[offs] as usize; 537 | if key_len == 0 { 538 | // special handling of +inf in 
right-most internal nodes 539 | Ordering::Less 540 | } else { 541 | key[..].cmp(&self.data[offs + 1..offs + 1 + key_len]) 542 | } 543 | } 544 | 545 | fn remove_key(&mut self, ip: ItemPointer, leaf: bool) { 546 | let n_items = self.get_n_items(); 547 | let size = self.get_size(); 548 | let (item_offs, item_len) = self.get_item_offs_len(ip); 549 | for i in ip + 1..n_items { 550 | self.set_offs(i - 1, self.get_offs(i) + item_len); 551 | } 552 | let items_origin = PAGE_SIZE - size; 553 | if !leaf && n_items > 1 && ip + 1 == n_items { 554 | // If we are removing last child of internal page then copy it's key to the previous item 555 | let prev_item_offs = item_offs + item_len; 556 | let key_len = self.data[item_offs] as usize; 557 | let prev_key_len = self.data[prev_item_offs] as usize; 558 | let new_offs = prev_item_offs + prev_key_len - key_len; 559 | self.set_offs(ip - 1, new_offs); 560 | self.data 561 | .copy_within(item_offs..item_offs + prev_key_len + 1, new_offs); 562 | } else { 563 | self.data 564 | .copy_within(items_origin..item_offs, items_origin + item_len); 565 | } 566 | self.set_n_items(n_items - 1); 567 | } 568 | 569 | // 570 | // Insert item on the page is there is enough free space, otherwise return false 571 | // 572 | fn insert_item(&mut self, ip: ItemPointer, key: &Key, value: &[u8]) -> bool { 573 | let n_items = self.get_n_items(); 574 | let size = self.get_size(); 575 | let key_len = key.len(); 576 | let item_len = 1 + key_len + value.len(); 577 | if (n_items + 1) * 2 + size + item_len <= PAGE_SIZE - PAGE_HEADER_SIZE { 578 | // fit in page 579 | for i in (ip..n_items).rev() { 580 | self.set_offs(i + 1, self.get_offs(i) - item_len); 581 | } 582 | let item_offs = if ip != 0 { 583 | self.get_offs(ip - 1) - item_len 584 | } else { 585 | PAGE_SIZE - item_len 586 | }; 587 | self.set_offs(ip, item_offs); 588 | let items_origin = PAGE_SIZE - size; 589 | self.data 590 | .copy_within(items_origin..item_offs + item_len, items_origin - item_len); 591 | self.data[item_offs] = key_len as u8; 592 | self.data[item_offs + 1..item_offs + 1 + key_len].copy_from_slice(&key); 593 | self.data[item_offs + 1 + key_len..item_offs + item_len].copy_from_slice(&value); 594 | self.set_n_items(n_items + 1); 595 | true 596 | } else { 597 | false 598 | } 599 | } 600 | 601 | // 602 | // Split page into two approximately equal parts. Smallest keys are moved to the new page, 603 | // largest - left on original page. 604 | // Returns split position 605 | // 606 | fn split(&mut self, new_page: &mut PageData, ip: ItemPointer) -> ItemPointer { 607 | let n_items = self.get_n_items(); 608 | let size = self.get_size(); 609 | let mut r = n_items; 610 | 611 | if ip == r { 612 | // Optimization for insert of sequential keys: move all data to new page, 613 | // leaving original page empty. It will cause complete filling of B-Tree pages. 614 | r -= 1; 615 | } else { 616 | // Divide page in two approximately equal parts. 
617 | let margin = PAGE_SIZE - size / 2; 618 | let mut l: ItemPointer = 0; 619 | while l < r { 620 | let m = (l + r) >> 1; 621 | if self.get_offs(m) > margin { 622 | // items are allocated from right to left 623 | l = m + 1; 624 | } else { 625 | r = m; 626 | } 627 | } 628 | debug_assert!(l == r); 629 | } 630 | // Move first r+1 elements to the new page 631 | let moved_size = PAGE_SIZE - self.get_offs(r); 632 | 633 | // copy item pointers 634 | new_page.data[PAGE_HEADER_SIZE..PAGE_HEADER_SIZE + (r + 1) * 2] 635 | .copy_from_slice(&self.data[PAGE_HEADER_SIZE..PAGE_HEADER_SIZE + (r + 1) * 2]); 636 | // copy items 637 | let dst = PAGE_SIZE - moved_size; 638 | new_page.data[dst..].copy_from_slice(&self.data[dst..]); 639 | 640 | // Adjust item pointers on old page 641 | for i in r + 1..n_items { 642 | self.set_offs(i - r - 1, self.get_offs(i) + moved_size); 643 | } 644 | let src = PAGE_SIZE - size; 645 | self.data.copy_within(src..dst, src + moved_size); 646 | new_page.set_n_items(r + 1); 647 | self.set_n_items(n_items - r - 1); 648 | r 649 | } 650 | } 651 | 652 | impl BufferManager { 653 | // 654 | // Link buffer to the head of LRU list (make it acceptable for eviction) 655 | // 656 | fn unpin(&mut self, id: BufferId) { 657 | debug_assert!(self.pages[id as usize].access_count == 1); 658 | self.pages[id as usize].access_count = 0; 659 | self.pages[id as usize].next = self.head; 660 | self.pages[id as usize].prev = 0; 661 | self.pinned -= 1; 662 | if self.head != 0 { 663 | self.pages[self.head as usize].prev = id; 664 | } else { 665 | self.tail = id; 666 | } 667 | self.head = id; 668 | } 669 | 670 | // 671 | // Unlink buffer from LRU list and so pin it in memory (protect from eviction) 672 | // 673 | fn pin(&mut self, id: BufferId) { 674 | debug_assert!(self.pages[id as usize].access_count == 0); 675 | let next = self.pages[id as usize].next; 676 | let prev = self.pages[id as usize].prev; 677 | if prev == 0 { 678 | self.head = next; 679 | } else { 680 | self.pages[prev as usize].next = next; 681 | } 682 | if next == 0 { 683 | self.tail = prev; 684 | } else { 685 | self.pages[next as usize].prev = prev; 686 | } 687 | self.pinned += 1; 688 | } 689 | 690 | // 691 | // Insert page in hash table 692 | // 693 | fn insert(&mut self, id: BufferId) { 694 | let h = self.pages[id as usize].pid as usize % self.hash_table.len(); 695 | self.pages[id as usize].collision = self.hash_table[h]; 696 | self.hash_table[h] = id; 697 | } 698 | 699 | // 700 | // Remove page from hash table 701 | // 702 | fn remove(&mut self, id: BufferId) { 703 | let h = self.pages[id as usize].pid as usize % self.hash_table.len(); 704 | let mut p = self.hash_table[h]; 705 | if p == id { 706 | self.hash_table[h] = self.pages[id as usize].collision; 707 | } else { 708 | while self.pages[p as usize].collision != id { 709 | p = self.pages[p as usize].collision; 710 | } 711 | self.pages[p as usize].collision = self.pages[id as usize].collision; 712 | } 713 | } 714 | 715 | // 716 | // Throw away buffer from cache (used by transaction rollback) 717 | // 718 | fn throw_buffer(&mut self, id: BufferId) { 719 | self.remove(id); 720 | self.pages[id as usize].next = self.free_pages; 721 | self.free_pages = id; 722 | self.cached -= 1; 723 | } 724 | 725 | // 726 | // If buffer is not yet marked as dirty then mark it as dirty and pin until the end of transaction 727 | // 728 | fn modify_buffer( 729 | &mut self, 730 | id: BufferId, 731 | wal_flush_threshold: BufferId, 732 | ) -> Result> { 733 | debug_assert!(self.pages[id as usize].access_count > 0); 
734 | let mut next_sync: Option<(BufferId, PageId)> = None; 735 | if (self.pages[id as usize].state & PAGE_DIRTY) == 0 { 736 | self.pages[id as usize].access_count += 1; // pin dirty page in memory 737 | self.pages[id as usize].state = PAGE_DIRTY; 738 | self.dirtied += 1; 739 | if self.dirtied > wal_flush_threshold { 740 | let mut sync = self.next_sync; 741 | while sync != 0 { 742 | assert_eq!(self.pages[sync as usize].state, PAGE_DIRTY); 743 | if self.pages[sync as usize].access_count == 1 { 744 | self.pages[sync as usize].state |= PAGE_SYNCED; 745 | self.next_sync = self.pages[sync as usize].prev; 746 | let pid = self.pages[sync as usize].pid; 747 | next_sync = Some((sync, pid)); 748 | break; 749 | } 750 | sync = self.pages[sync as usize].prev; 751 | } 752 | } 753 | } else { 754 | // we have to write page to the log once again 755 | self.pages[id as usize].state &= !PAGE_SYNCED; 756 | 757 | let prev = self.pages[id as usize].prev; 758 | 759 | // Move page to the beginning of L2 list 760 | if prev == 0 { 761 | // already first page: do nothing 762 | return Ok(None); 763 | } 764 | 765 | // If this page was scheduled for flush, then use previous page instead 766 | if self.next_sync == id { 767 | self.next_sync = prev; 768 | } 769 | 770 | // unlink page 771 | let next = self.pages[id as usize].next; 772 | self.pages[prev as usize].next = next; 773 | if next != 0 { 774 | self.pages[next as usize].prev = prev; 775 | } 776 | } 777 | // link to the beginning of dirty list 778 | if self.dirty_pages != 0 { 779 | self.pages[self.dirty_pages as usize].prev = id; 780 | } 781 | if self.next_sync == 0 { 782 | self.next_sync = id; 783 | } 784 | self.pages[id as usize].next = self.dirty_pages; 785 | self.pages[id as usize].prev = 0; 786 | self.dirty_pages = id; 787 | Ok(next_sync) 788 | } 789 | 790 | // 791 | // Decrement buffer's access counter and release buffer if it is last reference 792 | // 793 | fn release_buffer(&mut self, id: BufferId) { 794 | debug_assert!(self.pages[id as usize].access_count > 0); 795 | if self.pages[id as usize].access_count == 1 { 796 | debug_assert!((self.pages[id as usize].state & PAGE_DIRTY) == 0); 797 | self.unpin(id); 798 | } else { 799 | self.pages[id as usize].access_count -= 1; 800 | } 801 | } 802 | 803 | // 804 | // Find buffer with specified page or allocate new buffer 805 | // 806 | fn get_buffer(&mut self, pid: PageId) -> Result { 807 | let hash = pid as usize % self.hash_table.len(); 808 | let mut h = self.hash_table[hash]; 809 | while h != 0 { 810 | if self.pages[h as usize].pid == pid { 811 | let access_count = self.pages[h as usize].access_count; 812 | debug_assert!(access_count < u16::MAX - 1); 813 | if access_count == 0 { 814 | self.pin(h); 815 | } 816 | self.pages[h as usize].access_count = access_count + 1; 817 | return Ok(h); 818 | } 819 | h = self.pages[h as usize].collision; 820 | } 821 | // page not found in cache 822 | h = self.free_pages; 823 | if h != 0 { 824 | // has some free pages 825 | self.free_pages = self.pages[h as usize].next; 826 | self.cached += 1; 827 | self.pinned += 1; 828 | } else { 829 | h = self.used; 830 | if (h as usize) < self.hash_table.len() { 831 | self.used += 1; 832 | self.cached += 1; 833 | self.pinned += 1; 834 | } else { 835 | // Replace least recently used page 836 | let victim = self.tail; 837 | ensure!(victim != 0); 838 | debug_assert!(self.pages[victim as usize].access_count == 0); 839 | debug_assert!((self.pages[victim as usize].state & PAGE_DIRTY) == 0); 840 | self.pin(victim); 841 | self.remove(victim); 842 | h = 
victim; 843 | } 844 | } 845 | self.pages[h as usize].access_count = 1; 846 | self.pages[h as usize].pid = pid; 847 | self.pages[h as usize].state = PAGE_RAW; 848 | self.insert(h); 849 | Ok(h) 850 | } 851 | } 852 | 853 | struct PageGuard<'a> { 854 | buf: BufferId, 855 | pid: PageId, 856 | storage: &'a Storage, 857 | } 858 | 859 | impl<'a> Drop for PageGuard<'a> { 860 | fn drop(&mut self) { 861 | self.storage.release_page(self.buf); 862 | } 863 | } 864 | 865 | // 866 | // Storage internal methods implementations 867 | // 868 | impl Storage { 869 | // 870 | // Unpin page (called by PageGuard) 871 | // 872 | fn release_page(&self, buf: BufferId) { 873 | let mut bm = self.buf_mgr.lock().unwrap(); 874 | bm.release_buffer(buf); 875 | } 876 | 877 | // 878 | // Allocate new page in storage and get buffer for it 879 | // 880 | fn new_page(&self, db: &mut Database) -> Result> { 881 | let free = db.meta.free; 882 | let buf; 883 | let mut bm = self.buf_mgr.lock().unwrap(); 884 | if free != 0 { 885 | buf = bm.get_buffer(free)?; 886 | let mut page = self.pool[buf as usize].write().unwrap(); 887 | if (bm.pages[buf as usize].state & PAGE_RAW) != 0 { 888 | self.file 889 | .read_exact_at(&mut page.data, free as u64 * PAGE_SIZE as u64)?; 890 | } 891 | db.meta.free = page.get_u32(0); 892 | page.data.fill(0u8); 893 | } else { 894 | // extend storage 895 | buf = bm.get_buffer(db.meta.size)?; 896 | db.meta.size += 1; 897 | let mut page = self.pool[buf as usize].write().unwrap(); 898 | page.data.fill(0u8); 899 | } 900 | db.meta.used += 1; 901 | db.meta_updated = true; 902 | self.modify_buffer(db, &mut bm, buf)?; 903 | 904 | Ok(PageGuard { 905 | buf, 906 | pid: bm.pages[buf as usize].pid, 907 | storage: &self, 908 | }) 909 | } 910 | 911 | // 912 | // Read page in buffer and return PageGuard with pinned buffer. 
913 | // Buffer will be automatically released on exiting from scope 914 | // 915 | fn get_page(&self, pid: PageId, mode: AccessMode) -> Result> { 916 | let mut bm = self.buf_mgr.lock().unwrap(); 917 | let buf = bm.get_buffer(pid)?; 918 | if (bm.pages[buf as usize].state & PAGE_BUSY) != 0 { 919 | // Some other thread is loading buffer: just wait until it done 920 | bm.pages[buf as usize].state |= PAGE_WAIT; 921 | loop { 922 | debug_assert!((bm.pages[buf as usize].state & PAGE_WAIT) != 0); 923 | bm = self.busy_events[buf as usize % N_BUSY_EVENTS] 924 | .wait(bm) 925 | .unwrap(); 926 | if (bm.pages[buf as usize].state & PAGE_BUSY) == 0 { 927 | break; 928 | } 929 | } 930 | } else if (bm.pages[buf as usize].state & PAGE_RAW) != 0 { 931 | if mode != AccessMode::WriteOnly { 932 | // Read buffer if not in write-only mode 933 | bm.pages[buf as usize].state = PAGE_BUSY; 934 | drop(bm); // read page without holding lock 935 | { 936 | let mut page = self.pool[buf as usize].write().unwrap(); 937 | self.file 938 | .read_exact_at(&mut page.data, pid as u64 * PAGE_SIZE as u64)?; 939 | } 940 | bm = self.buf_mgr.lock().unwrap(); 941 | if (bm.pages[buf as usize].state & PAGE_WAIT) != 0 { 942 | // Somebody is waiting for us 943 | self.busy_events[buf as usize % N_BUSY_EVENTS].notify_all(); 944 | } 945 | } 946 | bm.pages[buf as usize].state = 0; 947 | } 948 | if mode != AccessMode::ReadOnly { 949 | bm.modify_buffer(buf, BufferId::MAX)?; 950 | } 951 | Ok(PageGuard { 952 | buf, 953 | pid, 954 | storage: &self, 955 | }) 956 | } 957 | 958 | // 959 | // Mark buffer as modified, pin it in memory and if it is needed, 960 | // write least recently modified page to WAL 961 | // 962 | fn modify_buffer( 963 | &self, 964 | db: &mut Database, 965 | bm: &mut BufferManager, 966 | buf: BufferId, 967 | ) -> Result<()> { 968 | if let Some((sync_buf, sync_pid)) = bm.modify_buffer(buf, self.conf.wal_flush_threshold)? 
{ 969 | assert_eq!(bm.pages[sync_buf as usize].state, PAGE_DIRTY | PAGE_SYNCED); 970 | self.write_page_to_wal(db, sync_buf, sync_pid)?; 971 | } 972 | Ok(()) 973 | } 974 | 975 | // 976 | // Mark page as dirty and pin it in-memory until end of transaction 977 | // 978 | fn modify_page(&self, db: &mut Database, buf: BufferId) -> Result<()> { 979 | let mut bm = self.buf_mgr.lock().unwrap(); 980 | self.modify_buffer(db, &mut bm, buf) 981 | } 982 | 983 | pub fn start_transaction(&self) -> Transaction<'_> { 984 | Transaction { 985 | status: TransactionStatus::InProgress, 986 | storage: self, 987 | db: self.db.write().unwrap(), 988 | } 989 | } 990 | 991 | fn write_page_to_wal(&self, db: &mut Database, buf: BufferId, pid: PageId) -> Result<()> { 992 | if let Some(log) = &self.log { 993 | let mut tx_buf = [0u8; PAGE_SIZE + 4]; 994 | let page = self.pool[buf as usize].read().unwrap(); 995 | tx_buf[0..4].copy_from_slice(&pid.to_be_bytes()); 996 | tx_buf[4..].copy_from_slice(&page.data); 997 | db.tx_crc = crc32c_append(db.tx_crc, &tx_buf); 998 | log.write_all_at(&tx_buf, db.wal_pos)?; 999 | db.wal_pos += (4 + PAGE_SIZE) as u64; 1000 | db.tx_size += 4 + PAGE_SIZE; 1001 | } 1002 | Ok(()) 1003 | } 1004 | 1005 | fn commit(&self, db: &mut Database) -> Result<()> { 1006 | let mut bm = self.buf_mgr.lock().unwrap(); 1007 | 1008 | if db.meta_updated { 1009 | let meta = db.meta.pack(); 1010 | let mut page = self.pool[0].write().unwrap(); 1011 | page.data[0..METADATA_SIZE].copy_from_slice(&meta); 1012 | } 1013 | if let Some(log) = &self.log { 1014 | // Write dirty pages to log file 1015 | let mut dirty = bm.dirty_pages; 1016 | while dirty != 0 && (bm.pages[dirty as usize].state & PAGE_SYNCED) == 0 { 1017 | assert_eq!(bm.pages[dirty as usize].state, PAGE_DIRTY); 1018 | self.write_page_to_wal(db, dirty, bm.pages[dirty as usize].pid)?; 1019 | dirty = bm.pages[dirty as usize].next; 1020 | } 1021 | if bm.dirty_pages != 0 { 1022 | let mut buf = [0u8; METADATA_SIZE + 8]; 1023 | { 1024 | let page = self.pool[0].read().unwrap(); 1025 | buf[4..4 + METADATA_SIZE].copy_from_slice(&page.data[0..METADATA_SIZE]); 1026 | } 1027 | let crc = crc32c_append(db.tx_crc, &buf[..4 + METADATA_SIZE]); 1028 | buf[4 + METADATA_SIZE..].copy_from_slice(&crc.to_be_bytes()); 1029 | log.write_all_at(&buf, db.wal_pos)?; 1030 | db.wal_pos += (8 + METADATA_SIZE) as u64; 1031 | log.sync_all()?; 1032 | db.lsn += 1; 1033 | db.tx_crc = 0; 1034 | db.tx_size = 0; 1035 | 1036 | // Write pages to the data file 1037 | self.flush_buffers(&mut bm, db.meta_updated)?; 1038 | 1039 | if db.wal_pos >= self.conf.checkpoint_interval { 1040 | // Sync data file and restart from the beginning of WAL. 1041 | // So not truncate WAL to avoid file extension overhead. 1042 | self.file.sync_all()?; 1043 | db.wal_pos = 0; 1044 | } 1045 | } 1046 | } else { 1047 | // No WAL mode: just write dirty pages to the disk 1048 | if self.flush_buffers(&mut bm, db.meta_updated)? { 1049 | db.lsn += 1; 1050 | } 1051 | } 1052 | db.meta_updated = false; 1053 | Ok(()) 1054 | } 1055 | 1056 | // 1057 | // Flush dirty pages to the disk. Return true if database is changed. 
1058 | // 1059 | fn flush_buffers(&self, bm: &mut BufferManager, save_meta: bool) -> Result { 1060 | let mut dirty = bm.dirty_pages; 1061 | if save_meta { 1062 | assert!(dirty != 0); // if we changed meta, then we should change or create at least one page 1063 | let page = self.pool[0].read().unwrap(); 1064 | self.file.write_all_at(&page.data, 0)?; 1065 | } 1066 | while dirty != 0 { 1067 | let pid = bm.pages[dirty as usize].pid; 1068 | let file_offs = pid as u64 * PAGE_SIZE as u64; 1069 | let page = self.pool[dirty as usize].read().unwrap(); 1070 | let next = bm.pages[dirty as usize].next; 1071 | self.file.write_all_at(&page.data, file_offs)?; 1072 | debug_assert!((bm.pages[dirty as usize].state & PAGE_DIRTY) != 0); 1073 | bm.pages[dirty as usize].state = 0; 1074 | bm.unpin(dirty); 1075 | dirty = next; 1076 | } 1077 | if bm.dirty_pages != 0 { 1078 | bm.dirty_pages = 0; 1079 | bm.dirtied = 0; 1080 | bm.next_sync = 0; 1081 | Ok(true) 1082 | } else { 1083 | Ok(false) 1084 | } 1085 | } 1086 | 1087 | // 1088 | // Rollback current transaction 1089 | // 1090 | fn rollback(&self, db: &mut Database) -> Result<()> { 1091 | let mut bm = self.buf_mgr.lock().unwrap(); 1092 | let mut dirty = bm.dirty_pages; 1093 | // Just throw away all dirty pages from buffer cache to force reloading of original pages 1094 | while dirty != 0 { 1095 | debug_assert!((bm.pages[dirty as usize].state & PAGE_DIRTY) != 0); 1096 | debug_assert!(bm.pages[dirty as usize].access_count == 1); 1097 | let next = bm.pages[dirty as usize].next; 1098 | bm.throw_buffer(dirty); 1099 | dirty = next; 1100 | } 1101 | bm.dirty_pages = 0; 1102 | bm.dirtied = 0; 1103 | bm.next_sync = 0; 1104 | db.wal_pos -= db.tx_size as u64; 1105 | db.tx_crc = 0; 1106 | db.tx_size = 0; 1107 | 1108 | if db.meta_updated { 1109 | // reread metadata from disk 1110 | let mut page = self.pool[0].write().unwrap(); 1111 | self.file.read_exact_at(&mut page.data, 0)?; 1112 | db.meta = Metadata::unpack(&page.data); 1113 | db.meta_updated = false; 1114 | } 1115 | db.n_aborted_txns += 1; 1116 | Ok(()) 1117 | } 1118 | 1119 | /// 1120 | /// Open database storage. If storage file doesn't exist, then it is created. 1121 | /// If path to transaction log is not specified, then WAL (write-ahead-log) is not used. 1122 | /// It will significantly increase performance but can cause database corruption in case of power failure or system crash. 
1123 | /// 1124 | pub fn open(db_path: &Path, log_path: Option<&Path>, conf: StorageConfig) -> Result { 1125 | let mut buf = [0u8; PAGE_SIZE]; 1126 | let (file, meta) = if let Ok(file) = OpenOptions::new().write(true).read(true).open(db_path) 1127 | { 1128 | // open existed file 1129 | file.try_lock_exclusive()?; 1130 | file.read_exact_at(&mut buf, 0)?; 1131 | let meta = Metadata::unpack(&buf); 1132 | ensure!(meta.magic == MAGIC && meta.version == VERSION && meta.size >= 1); 1133 | (file, meta) 1134 | } else { 1135 | let file = OpenOptions::new() 1136 | .write(true) 1137 | .read(true) 1138 | .create(true) 1139 | .open(db_path)?; 1140 | file.try_lock_exclusive()?; 1141 | // create new file 1142 | let meta = Metadata { 1143 | magic: MAGIC, 1144 | version: VERSION, 1145 | free: 0, 1146 | size: 1, 1147 | used: 1, 1148 | root: 0, 1149 | height: 0, 1150 | }; 1151 | let metadata = meta.pack(); 1152 | buf[0..METADATA_SIZE].copy_from_slice(&metadata); 1153 | file.write_all_at(&mut buf, 0)?; 1154 | (file, meta) 1155 | }; 1156 | let log = if let Some(path) = log_path { 1157 | let log = OpenOptions::new() 1158 | .write(true) 1159 | .read(true) 1160 | .create(true) 1161 | .open(path)?; 1162 | log.try_lock_exclusive()?; 1163 | Some(log) 1164 | } else { 1165 | None 1166 | }; 1167 | let storage = Storage { 1168 | busy_events: [(); N_BUSY_EVENTS].map(|_| Condvar::new()), 1169 | buf_mgr: Mutex::new(BufferManager { 1170 | head: 0, 1171 | tail: 0, 1172 | free_pages: 0, 1173 | dirty_pages: 0, 1174 | next_sync: 0, 1175 | used: 1, // pinned root page 1176 | cached: 1, 1177 | pinned: 1, 1178 | dirtied: 0, 1179 | hash_table: vec![0; conf.cache_size], 1180 | pages: vec![PageHeader::new(); conf.cache_size], 1181 | }), 1182 | pool: iter::repeat_with(|| RwLock::new(PageData::new())) 1183 | .take(conf.cache_size) 1184 | .collect(), 1185 | file, 1186 | log, 1187 | conf, 1188 | db: RwLock::new(Database { 1189 | lsn: 0, 1190 | n_aborted_txns: 0, 1191 | meta, 1192 | meta_updated: false, 1193 | recovery: RecoveryStatus { 1194 | ..Default::default() 1195 | }, 1196 | state: DatabaseState::InRecovery, 1197 | wal_pos: 0, 1198 | tx_crc: 0, 1199 | tx_size: 0, 1200 | }), 1201 | }; 1202 | storage.recovery()?; 1203 | Ok(storage) 1204 | } 1205 | 1206 | // 1207 | // Recover database from WAL (if any) 1208 | // 1209 | fn recovery(&self) -> Result<()> { 1210 | let mut db = self.db.write().unwrap(); 1211 | if let Some(log) = &self.log { 1212 | let mut buf = [0u8; 4]; 1213 | let mut crc = 0u32; 1214 | let mut wal_pos = 0u64; 1215 | db.recovery.wal_size = log.metadata()?.len(); 1216 | loop { 1217 | let len = log.read_at(&mut buf, wal_pos)?; 1218 | if len != 4 { 1219 | // end of log 1220 | break; 1221 | } 1222 | wal_pos += 4; 1223 | let pid = PageId::from_be_bytes(buf); 1224 | crc = crc32c_append(crc, &buf); 1225 | if pid != 0 { 1226 | let pin = self.get_page(pid, AccessMode::WriteOnly)?; 1227 | let mut page = self.pool[pin.buf as usize].write().unwrap(); 1228 | let len = log.read_at(&mut page.data, wal_pos)?; 1229 | if len != PAGE_SIZE { 1230 | break; 1231 | } 1232 | wal_pos += len as u64; 1233 | crc = crc32c_append(crc, &page.data); 1234 | } else { 1235 | let mut meta_buf = [0u8; METADATA_SIZE]; 1236 | let len = log.read_at(&mut meta_buf, wal_pos)?; 1237 | if len != METADATA_SIZE { 1238 | break; 1239 | } 1240 | wal_pos += len as u64; 1241 | crc = crc32c_append(crc, &meta_buf); 1242 | let len = log.read_at(&mut buf, wal_pos)?; 1243 | if len != 4 { 1244 | break; 1245 | } 1246 | wal_pos += 4; 1247 | if u32::from_be_bytes(buf) != crc { 1248 | 
// CRC mismatch 1249 | break; 1250 | } 1251 | { 1252 | let mut page = self.pool[0].write().unwrap(); 1253 | page.data[0..METADATA_SIZE].copy_from_slice(&meta_buf); 1254 | db.meta_updated = true; 1255 | } 1256 | let mut bm = self.buf_mgr.lock().unwrap(); 1257 | self.flush_buffers(&mut bm, true)?; 1258 | db.meta_updated = false; 1259 | db.recovery.recovered_transactions += 1; 1260 | db.recovery.recovery_end = wal_pos; 1261 | crc = 0u32; 1262 | } 1263 | } 1264 | self.rollback(&mut db)?; 1265 | 1266 | // reset WAL 1267 | self.file.sync_all()?; 1268 | db.wal_pos = 0; 1269 | log.set_len(0)?; // truncate log 1270 | } 1271 | // reread metadata 1272 | let mut page = self.pool[0].write().unwrap(); 1273 | self.file.read_exact_at(&mut page.data, 0)?; 1274 | db.meta = Metadata::unpack(&page.data); 1275 | 1276 | db.state = DatabaseState::Opened; 1277 | 1278 | Ok(()) 1279 | } 1280 | 1281 | // 1282 | // Bulk update 1283 | // 1284 | fn do_updates( 1285 | &self, 1286 | db: &mut Database, 1287 | to_upsert: &mut dyn Iterator>, 1288 | to_remove: &mut dyn Iterator>, 1289 | ) -> Result<()> { 1290 | for pair in to_upsert { 1291 | let kv = pair?; 1292 | self.do_upsert(db, &kv.0, &kv.1)?; 1293 | } 1294 | for key in to_remove { 1295 | self.do_remove(db, &key?)?; 1296 | } 1297 | Ok(()) 1298 | } 1299 | 1300 | // 1301 | // Allocate new B-Tree leaf page with single (key,value) element 1302 | // 1303 | fn btree_allocate_leaf_page( 1304 | &self, 1305 | db: &mut Database, 1306 | key: &Key, 1307 | value: &Value, 1308 | ) -> Result { 1309 | let pin = self.new_page(db)?; 1310 | let mut page = self.pool[pin.buf as usize].write().unwrap(); 1311 | page.set_n_items(0); 1312 | page.insert_item(0, key, value); 1313 | Ok(pin.pid) 1314 | } 1315 | 1316 | // 1317 | // Allocate new B-Tree internal page referencing two children 1318 | // 1319 | fn btree_allocate_internal_page( 1320 | &self, 1321 | db: &mut Database, 1322 | key: &Key, 1323 | left_child: PageId, 1324 | right_child: PageId, 1325 | ) -> Result { 1326 | let pin = self.new_page(db)?; 1327 | let mut page = self.pool[pin.buf as usize].write().unwrap(); 1328 | page.set_n_items(0); 1329 | debug_assert!(left_child != 0); 1330 | debug_assert!(right_child != 0); 1331 | page.insert_item(0, key, &left_child.to_be_bytes().to_vec()); 1332 | page.insert_item(1, &vec![], &right_child.to_be_bytes().to_vec()); 1333 | Ok(pin.pid) 1334 | } 1335 | 1336 | // 1337 | // Insert item at the specified position in B-Tree page. 1338 | // If B-Tree pages is full then split it, evenly distribute items between pages: smaller items moved to new page, larger items left on original page. 1339 | // Value of largest key on new page and its identifiers are returned in case of overflow. 1340 | // 1341 | fn btree_insert_in_page( 1342 | &self, 1343 | db: &mut Database, 1344 | page: &mut PageData, 1345 | ip: ItemPointer, 1346 | key: &Key, 1347 | value: &Value, 1348 | ) -> Result> { 1349 | if !page.insert_item(ip, key, value) { 1350 | // page is full then divide page 1351 | let pin = self.new_page(db)?; 1352 | let mut new_page = self.pool[pin.buf as usize].write().unwrap(); 1353 | let split = page.split(&mut new_page, ip); 1354 | let ok = if ip > split { 1355 | page.insert_item(ip - split - 1, key, value) 1356 | } else { 1357 | new_page.insert_item(ip, key, value) 1358 | }; 1359 | ensure!(ok); 1360 | Ok(Some((new_page.get_last_key(), pin.pid))) 1361 | } else { 1362 | Ok(None) 1363 | } 1364 | } 1365 | 1366 | // 1367 | // Remove key from B-Tree. Recursively traverse B-Tree and return true in case of underflow. 
1368 | // Right now we do not redistribute nodes between pages or merge pages, underflow is reported only if page becomes empty. 1369 | // If key is not found, then nothing is performed and no error is reported. 1370 | // 1371 | fn btree_remove(&self, db: &mut Database, pid: PageId, key: &Key, height: u32) -> Result { 1372 | let pin = self.get_page(pid, AccessMode::ReadOnly)?; 1373 | let mut page = self.pool[pin.buf as usize].write().unwrap(); 1374 | let mut l: ItemPointer = 0; 1375 | let n = page.get_n_items(); 1376 | let mut r = n; 1377 | while l < r { 1378 | let m = (l + r) >> 1; 1379 | if page.compare_key(m, key) == Ordering::Greater { 1380 | l = m + 1; 1381 | } else { 1382 | r = m; 1383 | } 1384 | } 1385 | debug_assert!(l == r); 1386 | if height == 1 { 1387 | // leaf page 1388 | if r < n && page.compare_key(r, key) == Ordering::Equal { 1389 | self.modify_page(db, pin.buf)?; 1390 | page.remove_key(r, true); 1391 | } 1392 | } else { 1393 | // recurse to next level 1394 | debug_assert!(r < n); 1395 | let underflow = self.btree_remove(db, page.get_child(r), key, height - 1)?; 1396 | if underflow { 1397 | self.modify_page(db, pin.buf)?; 1398 | page.remove_key(r, false); 1399 | } 1400 | } 1401 | if page.get_n_items() == 0 { 1402 | // free page 1403 | page.set_u32(0, db.meta.free); 1404 | db.meta.free = pid; 1405 | db.meta.used -= 1; 1406 | db.meta_updated = true; 1407 | Ok(true) 1408 | } else { 1409 | Ok(false) 1410 | } 1411 | } 1412 | 1413 | // 1414 | // Insert item in B-Tree. Recursively traverse B-Tree and return position of new page in case of overflow. 1415 | // 1416 | fn btree_insert( 1417 | &self, 1418 | db: &mut Database, 1419 | pid: PageId, 1420 | key: &Key, 1421 | value: &Value, 1422 | height: u32, 1423 | ) -> Result> { 1424 | let pin = self.get_page(pid, AccessMode::ReadOnly)?; 1425 | let mut page = self.pool[pin.buf as usize].write().unwrap(); 1426 | let mut l: ItemPointer = 0; 1427 | let n = page.get_n_items(); 1428 | let mut r = n; 1429 | while l < r { 1430 | let m = (l + r) >> 1; 1431 | if page.compare_key(m, key) == Ordering::Greater { 1432 | l = m + 1; 1433 | } else { 1434 | r = m; 1435 | } 1436 | } 1437 | debug_assert!(l == r); 1438 | if height == 1 { 1439 | // leaf page 1440 | self.modify_page(db, pin.buf)?; 1441 | if r < n && page.compare_key(r, key) == Ordering::Equal { 1442 | // replace old value with new one: just remove old one and reinsert new key-value pair 1443 | page.remove_key(r, true); 1444 | } 1445 | self.btree_insert_in_page(db, &mut page, r, key, value) 1446 | } else { 1447 | // recurse to next level 1448 | debug_assert!(r < n); 1449 | let overflow = self.btree_insert(db, page.get_child(r), key, value, height - 1)?; 1450 | if let Some((key, child)) = overflow { 1451 | // insert new page before original 1452 | self.modify_page(db, pin.buf)?; 1453 | debug_assert!(child != 0); 1454 | self.btree_insert_in_page(db, &mut page, r, &key, &child.to_be_bytes().to_vec()) 1455 | } else { 1456 | Ok(None) 1457 | } 1458 | } 1459 | } 1460 | 1461 | // 1462 | // Insert or update key in the storage 1463 | // 1464 | fn do_upsert(&self, db: &mut Database, key: &Key, value: &Value) -> Result<()> { 1465 | ensure!(key.len() != 0 && key.len() <= MAX_KEY_LEN && value.len() <= MAX_VALUE_LEN); 1466 | if db.meta.root == 0 { 1467 | db.meta.root = self.btree_allocate_leaf_page(db, key, value)?; 1468 | db.meta.height = 1; 1469 | db.meta_updated = true; 1470 | } else if let Some((key, page)) = 1471 | self.btree_insert(db, db.meta.root, key, value, db.meta.height)? 
1472 | { 1473 | // overflow 1474 | db.meta.root = self.btree_allocate_internal_page(db, &key, page, db.meta.root)?; 1475 | db.meta.height += 1; 1476 | db.meta_updated = true; 1477 | } 1478 | Ok(()) 1479 | } 1480 | 1481 | // 1482 | // Remove key from the storage. Does nothing if the key does not exist. 1483 | // 1484 | fn do_remove(&self, db: &mut Database, key: &Key) -> Result<()> { 1485 | if db.meta.root != 0 { 1486 | let underflow = self.btree_remove(db, db.meta.root, key, db.meta.height)?; 1487 | if underflow { 1488 | db.meta.height = 0; 1489 | db.meta.root = 0; 1490 | db.meta_updated = true; 1491 | } 1492 | } 1493 | Ok(()) 1494 | } 1495 | 1496 | // 1497 | // Perform lookup operation and fill `path` structure with located item and path to it in the tree. 1498 | // If item is not found, then make path empty and current element `None`. 1499 | // 1500 | fn do_lookup( 1501 | &self, 1502 | db: &Database, 1503 | op: LookupOp, 1504 | path: &mut TreePath, 1505 | ) -> Result> { 1506 | ensure!(db.state == DatabaseState::Opened); 1507 | match op { 1508 | LookupOp::First => { 1509 | // Locate left-most element in the tree 1510 | let mut pid = db.meta.root; 1511 | if pid != 0 { 1512 | let mut level = db.meta.height; 1513 | loop { 1514 | let pin = self.get_page(pid, AccessMode::ReadOnly)?; 1515 | let page = self.pool[pin.buf as usize].read().unwrap(); 1516 | path.stack.push(PagePos { pid, pos: 0 }); 1517 | level -= 1; 1518 | if level == 0 { 1519 | path.curr = Some(page.get_item(0)); 1520 | path.lsn = db.lsn; 1521 | break; 1522 | } else { 1523 | pid = page.get_child(0) 1524 | } 1525 | } 1526 | } 1527 | } 1528 | LookupOp::Last => { 1529 | // Locate right-most element in the tree 1530 | let mut pid = db.meta.root; 1531 | if pid != 0 { 1532 | let mut level = db.meta.height; 1533 | loop { 1534 | let pin = self.get_page(pid, AccessMode::ReadOnly)?; 1535 | let page = self.pool[pin.buf as usize].read().unwrap(); 1536 | let pos = page.get_n_items() - 1; 1537 | level -= 1; 1538 | path.stack.push(PagePos { pid, pos }); 1539 | if level == 0 { 1540 | path.curr = Some(page.get_item(pos)); 1541 | path.lsn = db.lsn; 1542 | break; 1543 | } else { 1544 | pid = page.get_child(pos) 1545 | } 1546 | } 1547 | } 1548 | } 1549 | LookupOp::Next => { 1550 | if path.lsn == db.lsn || self.reconstruct_path(path, db)? { 1551 | self.move_forward(path, db.meta.height)?; 1552 | } 1553 | } 1554 | LookupOp::Prev => { 1555 | if path.lsn == db.lsn || self.reconstruct_path(path, db)? { 1556 | self.move_backward(path, db.meta.height)?; 1557 | } 1558 | } 1559 | LookupOp::GreaterOrEqual(key) => { 1560 | if db.meta.root != 0 && self.find(db.meta.root, path, &key, db.meta.height)? { 1561 | path.lsn = db.lsn; 1562 | } 1563 | } 1564 | } 1565 | Ok(path.curr.clone()) 1566 | } 1567 | 1568 | // 1569 | // Perform lookup in the database. Position the path at the located element, or reset the path if no element is found or the end of the set is reached. 1570 | // 1571 | fn lookup(&self, db: &Database, op: LookupOp, path: &mut TreePath) { 1572 | let result = self.do_lookup(db, op, path); 1573 | if result.is_err() { 1574 | path.curr = None; 1575 | } 1576 | path.result = result.transpose(); 1577 | } 1578 | 1579 | // 1580 | // Locate greater or equal key. 1581 | // Returns true and initializes path to this element if such key is found, 1582 | // resets path and returns false otherwise.
1583 | // 1584 | fn find(&self, pid: PageId, path: &mut TreePath, key: &Key, height: u32) -> Result { 1585 | let pin = self.get_page(pid, AccessMode::ReadOnly)?; 1586 | let page = self.pool[pin.buf as usize].read().unwrap(); 1587 | let n = page.get_n_items(); 1588 | let mut l: ItemPointer = 0; 1589 | let mut r = n; 1590 | 1591 | while l < r { 1592 | let m = (l + r) >> 1; 1593 | if page.compare_key(m, key) == Ordering::Greater { 1594 | l = m + 1; 1595 | } else { 1596 | r = m; 1597 | } 1598 | } 1599 | debug_assert!(l == r); 1600 | path.stack.push(PagePos { pid, pos: r }); 1601 | if height == 1 { 1602 | // leaf page 1603 | if r < n { 1604 | path.curr = Some(page.get_item(r)); 1605 | Ok(true) 1606 | } else { 1607 | path.curr = None; 1608 | path.stack.pop(); 1609 | Ok(false) 1610 | } 1611 | } else { 1612 | debug_assert!(r < n); 1613 | loop { 1614 | debug_assert!(page.get_child(r) != 0); 1615 | if self.find(page.get_child(r), path, key, height - 1)? { 1616 | return Ok(true); 1617 | } 1618 | path.stack.pop(); 1619 | r += 1; 1620 | if r < n { 1621 | path.stack.push(PagePos { pid, pos: r }); 1622 | } else { 1623 | break; 1624 | } 1625 | } 1626 | Ok(false) 1627 | } 1628 | } 1629 | 1630 | // 1631 | // If storage is updated between iterations then try to reconstruct path by locating current element. 1632 | // Returns true is such element is found and path is successfully reconstructed, false otherwise. 1633 | // 1634 | fn reconstruct_path(&self, path: &mut TreePath, db: &Database) -> Result { 1635 | path.stack.clear(); 1636 | if let Some((key, _value)) = &path.curr.clone() { 1637 | if self.find(db.meta.root, path, &key, db.meta.height)? { 1638 | if let Some((ge_key, _value)) = &path.curr { 1639 | if ge_key == key { 1640 | path.lsn = db.lsn; 1641 | return Ok(true); 1642 | } 1643 | } 1644 | } 1645 | } 1646 | path.curr = None; 1647 | Ok(false) 1648 | } 1649 | 1650 | // 1651 | // Move cursor forward 1652 | // 1653 | fn move_forward(&self, path: &mut TreePath, height: u32) -> Result<()> { 1654 | let mut inc: usize = 1; 1655 | path.curr = None; 1656 | while !path.stack.is_empty() { 1657 | let top = path.stack.pop().unwrap(); 1658 | let pin = self.get_page(top.pid, AccessMode::ReadOnly)?; 1659 | let page = self.pool[pin.buf as usize].read().unwrap(); 1660 | let n_items = page.get_n_items(); 1661 | let pos = top.pos + inc; 1662 | if pos < n_items { 1663 | path.stack.push(PagePos { 1664 | pid: top.pid, 1665 | pos: pos, 1666 | }); 1667 | if path.stack.len() == height as usize { 1668 | let item = page.get_item(pos); 1669 | path.curr = Some(item); 1670 | break; 1671 | } 1672 | // We have to use this trick with `inc` variable on the way down because 1673 | // Rust will detect overflow if we path -1 as pos 1674 | debug_assert!(page.get_child(pos) != 0); 1675 | path.stack.push(PagePos { 1676 | pid: page.get_child(pos), 1677 | pos: 0, 1678 | }); 1679 | inc = 0; 1680 | } else { 1681 | // traverse up 1682 | inc = 1; 1683 | } 1684 | } 1685 | Ok(()) 1686 | } 1687 | 1688 | // 1689 | // Move cursor backward 1690 | // 1691 | fn move_backward(&self, path: &mut TreePath, height: u32) -> Result<()> { 1692 | path.curr = None; 1693 | while !path.stack.is_empty() { 1694 | let top = path.stack.pop().unwrap(); 1695 | let pin = self.get_page(top.pid, AccessMode::ReadOnly)?; 1696 | let page = self.pool[pin.buf as usize].read().unwrap(); 1697 | let pos = if top.pos == usize::MAX { 1698 | page.get_n_items() 1699 | } else { 1700 | top.pos 1701 | }; 1702 | if pos != 0 { 1703 | path.stack.push(PagePos { 1704 | pid: top.pid, 1705 | pos: 
pos - 1, 1706 | }); 1707 | if path.stack.len() == height as usize { 1708 | let item = page.get_item(pos - 1); 1709 | path.curr = Some(item); 1710 | break; 1711 | } 1712 | path.stack.push(PagePos { 1713 | pid: page.get_child(pos - 1), 1714 | pos: usize::MAX, // start from last element of the page 1715 | }); 1716 | } 1717 | } 1718 | Ok(()) 1719 | } 1720 | 1721 | fn traverse(&self, pid: PageId, prev_key: &mut Key, height: u32) -> Result { 1722 | let pin = self.get_page(pid, AccessMode::ReadOnly)?; 1723 | let page = self.pool[pin.buf as usize].read().unwrap(); 1724 | let n_items = page.get_n_items(); 1725 | let mut count = 0u64; 1726 | if height == 1 { 1727 | for i in 0..n_items { 1728 | ensure!(page.compare_key(i, prev_key) == Ordering::Less); 1729 | *prev_key = page.get_key(i); 1730 | } 1731 | count += n_items as u64; 1732 | } else { 1733 | for i in 0..n_items { 1734 | count += self.traverse(page.get_child(i), prev_key, height - 1)?; 1735 | let ord = page.compare_key(i, prev_key); 1736 | ensure!(ord == Ordering::Less || ord == Ordering::Equal); 1737 | } 1738 | } 1739 | Ok(count) 1740 | } 1741 | } 1742 | 1743 | // 1744 | // Implementation of public methods. 1745 | // I had problems extracting them into a separate trait. 1746 | // 1747 | impl Storage { 1748 | /// 1749 | /// Perform atomic updates: insert or update `to_upsert` tuples 1750 | /// and remove `to_remove` tuples (if they exist). 1751 | /// If all operations are successful, then the transaction is committed, otherwise it is rolled back. 1752 | /// If the commit or rollback itself returns an error, then the database is switched to the corrupted state and no further access to the database is possible. 1753 | /// You will have to close and reopen it. 1754 | /// 1755 | pub fn update( 1756 | &self, 1757 | to_upsert: &mut dyn Iterator>, 1758 | to_remove: &mut dyn Iterator>, 1759 | ) -> Result<()> { 1760 | let mut db = self.db.write().unwrap(); // prevent concurrent access to the database during update operations (MURSIW) 1761 | ensure!(db.state == DatabaseState::Opened); 1762 | let mut result = self.do_updates(&mut db, to_upsert, to_remove); 1763 | if result.is_ok() { 1764 | result = self.commit(&mut db); 1765 | if !result.is_ok() { 1766 | db.state = DatabaseState::Corrupted; 1767 | } 1768 | } else { 1769 | if !self.rollback(&mut db).is_ok() { 1770 | db.state = DatabaseState::Corrupted; 1771 | } 1772 | } 1773 | result 1774 | } 1775 | 1776 | /// 1777 | /// Traverse B-Tree, check B-Tree invariants and return total number of keys in B-Tree 1778 | /// 1779 | pub fn verify(&self) -> Result { 1780 | let db = self.db.read().unwrap(); 1781 | ensure!(db.state == DatabaseState::Opened); 1782 | if db.meta.root != 0 { 1783 | let mut prev_key = Vec::new(); 1784 | self.traverse(db.meta.root, &mut prev_key, db.meta.height) 1785 | } else { 1786 | Ok(0) 1787 | } 1788 | } 1789 | 1790 | /// 1791 | /// Put (key,value) pair in the storage; if such key already exists, the associated value is updated 1792 | /// 1793 | pub fn put(&self, key: Key, value: Value) -> Result<()> { 1794 | self.update(&mut iter::once(Ok((key, value))), &mut iter::empty()) 1795 | } 1796 | 1797 | /// 1798 | /// Store value for u32 key 1799 | /// 1800 | pub fn put_u32(&self, key: u32, value: Value) -> Result<()> { 1801 | self.put(key.to_be_bytes().to_vec(), value) 1802 | } 1803 | 1804 | /// 1805 | /// Store value for u64 key 1806 | /// 1807 | pub fn put_u64(&self, key: u64, value: Value) -> Result<()> { 1808 | self.put(key.to_be_bytes().to_vec(), value) 1809 | } 1810 | 1811 | /// 1812 | /// Put (key,value) pairs in the
storage, existing keys are updated 1813 | /// 1814 | pub fn put_all(&self, pairs: &mut dyn Iterator>) -> Result<()> { 1815 | self.update(pairs, &mut iter::empty()) 1816 | } 1817 | 1818 | /// 1819 | /// Remove key from the storage, do nothing if not found 1820 | /// 1821 | pub fn remove(&self, key: Key) -> Result<()> { 1822 | self.update(&mut iter::empty(), &mut iter::once(Ok(key))) 1823 | } 1824 | 1825 | /// 1826 | /// Remove u32 key from the storage, do nothing if not found 1827 | /// 1828 | pub fn remove_u32(&self, key: u32) -> Result<()> { 1829 | self.remove(key.to_be_bytes().to_vec()) 1830 | } 1831 | 1832 | /// 1833 | /// Remove u64 key from the storage, do nothing if not found 1834 | /// 1835 | pub fn remove_u64(&self, key: u64) -> Result<()> { 1836 | self.remove(key.to_be_bytes().to_vec()) 1837 | } 1838 | 1839 | /// 1840 | /// Remove keys from the storage, do nothing if not found 1841 | /// 1842 | pub fn remove_all(&self, keys: &mut dyn Iterator>) -> Result<()> { 1843 | self.update(&mut iter::empty(), keys) 1844 | } 1845 | 1846 | /// 1847 | /// Iterator through pairs in key ascending order. 1848 | /// Byte-wise comparison is used, so it is up to the serializer to enforce proper ordering, 1849 | /// for example for unsigned integer types big-endian encoding should be used. 1850 | /// 1851 | pub fn iter(&self) -> StorageIterator<'_> { 1852 | self.range(..) 1853 | } 1854 | 1855 | /// 1856 | /// Lookup key in the storage. 1857 | /// 1858 | pub fn get(&self, key: &Key) -> Result> { 1859 | let mut iter = self.range((Included(key), Included(key))); 1860 | Ok(iter.next().transpose()?.map(|kv| kv.1)) 1861 | } 1862 | 1863 | /// 1864 | /// Lookup u32 key in the storage. 1865 | /// 1866 | pub fn get_u32(&self, key: u32) -> Result> { 1867 | self.get(&key.to_be_bytes().to_vec()) 1868 | } 1869 | 1870 | /// 1871 | /// Lookup u64 key in the storage. 1872 | /// 1873 | pub fn get_u64(&self, key: u64) -> Result> { 1874 | self.get(&key.to_be_bytes().to_vec()) 1875 | } 1876 | 1877 | /// 1878 | /// Returns bidirectional iterator 1879 | /// 1880 | pub fn range>(&self, range: R) -> StorageIterator<'_> { 1881 | StorageIterator { 1882 | storage: &self, 1883 | trans: None, 1884 | from: range.start_bound().cloned(), 1885 | till: range.end_bound().cloned(), 1886 | left: TreePath::new(), 1887 | right: TreePath::new(), 1888 | } 1889 | } 1890 | 1891 | /// 1892 | /// Close storage. Close data and WAL files and truncate WAL file. 1893 | /// 1894 | pub fn close(&self) -> Result<()> { 1895 | if let Ok(mut db) = self.db.write() { 1896 | // avoid poisoned lock 1897 | if db.state == DatabaseState::Opened { 1898 | let mut delayed_commit = false; 1899 | if let Ok(bm) = self.buf_mgr.lock() { 1900 | // avoid poisoned mutex 1901 | if bm.dirty_pages != 0 { 1902 | delayed_commit = true; 1903 | } 1904 | } 1905 | if delayed_commit { 1906 | self.commit(&mut db)?; 1907 | } 1908 | // Sync data file and truncate log in case of normal shutdown 1909 | self.file.sync_all()?; 1910 | if let Some(log) = &self.log { 1911 | log.set_len(0)?; // truncate WAL 1912 | } 1913 | db.state = DatabaseState::Closed; 1914 | } 1915 | } 1916 | Ok(()) 1917 | } 1918 | 1919 | /// 1920 | /// Shutdown storage. Unlike close, it doesn't commit delayed transactions, flush the data file, or truncate the WAL.
1921 | /// 1922 | pub fn shutdown(&self) -> Result<()> { 1923 | let mut db = self.db.write().unwrap(); 1924 | ensure!(db.state == DatabaseState::Opened); 1925 | db.state = DatabaseState::Closed; 1926 | Ok(()) 1927 | } 1928 | 1929 | /// 1930 | /// Get recovery status 1931 | /// 1932 | pub fn get_recovery_status(&self) -> RecoveryStatus { 1933 | let db = self.db.read().unwrap(); 1934 | db.recovery 1935 | } 1936 | 1937 | /// 1938 | /// Get database info 1939 | /// 1940 | pub fn get_database_info(&self) -> DatabaseInfo { 1941 | let db = self.db.read().unwrap(); 1942 | db.get_info() 1943 | } 1944 | 1945 | /// 1946 | /// Get cache info 1947 | /// 1948 | pub fn get_cache_info(&self) -> CacheInfo { 1949 | let bm = self.buf_mgr.lock().unwrap(); 1950 | CacheInfo { 1951 | used: bm.cached as usize, 1952 | pinned: bm.pinned as usize, 1953 | dirtied: bm.dirtied as usize, 1954 | } 1955 | } 1956 | } 1957 | 1958 | impl Drop for Storage { 1959 | fn drop(&mut self) { 1960 | self.close().unwrap(); 1961 | } 1962 | } 1963 | 1964 | impl<'a> Transaction<'_> { 1965 | /// 1966 | /// Commit transaction 1967 | /// 1968 | pub fn commit(&mut self) -> Result<()> { 1969 | ensure!(self.status == TransactionStatus::InProgress); 1970 | self.storage.commit(&mut self.db)?; 1971 | self.status = TransactionStatus::Committed; 1972 | Ok(()) 1973 | } 1974 | 1975 | /// 1976 | /// Delay commit of transaction 1977 | /// 1978 | pub fn delay(&mut self) -> Result<()> { 1979 | ensure!(self.status == TransactionStatus::InProgress); 1980 | self.db.lsn += 1; 1981 | // mark transaction as committed to prevent implicit rollback by destructor 1982 | self.status = TransactionStatus::Committed; 1983 | Ok(()) 1984 | } 1985 | 1986 | /// 1987 | /// Rollback transaction undoing all changes 1988 | /// 1989 | pub fn rollback(&mut self) -> Result<()> { 1990 | ensure!(self.status == TransactionStatus::InProgress); 1991 | self.storage.rollback(&mut self.db)?; 1992 | self.status = TransactionStatus::Aborted; 1993 | Ok(()) 1994 | } 1995 | 1996 | /// 1997 | /// Insert new key in the storage or update existing key as part of this transaction. 1998 | /// 1999 | pub fn put(&mut self, key: &Key, value: &Value) -> Result<()> { 2000 | ensure!(self.status == TransactionStatus::InProgress); 2001 | self.storage.do_upsert(&mut self.db, key, value)?; 2002 | Ok(()) 2003 | } 2004 | 2005 | /// 2006 | /// Store value for u32 key 2007 | /// 2008 | pub fn put_u32(&mut self, key: u32, value: &Value) -> Result<()> { 2009 | self.put(&key.to_be_bytes().to_vec(), value) 2010 | } 2011 | 2012 | /// 2013 | /// Store value for u64 key 2014 | /// 2015 | pub fn put_u64(&mut self, key: u64, value: &Value) -> Result<()> { 2016 | self.put(&key.to_be_bytes().to_vec(), value) 2017 | } 2018 | 2019 | /// 2020 | /// Remove key from storage as part of this transaction. 2021 | /// Does nothing if the key does not exist.
2022 | /// 2023 | pub fn remove(&mut self, key: &Key) -> Result<()> { 2024 | ensure!(self.status == TransactionStatus::InProgress); 2025 | self.storage.do_remove(&mut self.db, key)?; 2026 | Ok(()) 2027 | } 2028 | 2029 | /// 2030 | /// Remove u32 key from the storage, do nothing if not found 2031 | /// 2032 | pub fn remove_u32(&mut self, key: u32) -> Result<()> { 2033 | self.remove(&key.to_be_bytes().to_vec()) 2034 | } 2035 | 2036 | /// 2037 | /// Remove u64 key from the storage, do nothing if not found 2038 | /// 2039 | pub fn remove_u64(&mut self, key: u64) -> Result<()> { 2040 | self.remove(&key.to_be_bytes().to_vec()) 2041 | } 2042 | 2043 | /// 2044 | /// Iterator through pairs in key ascending order. 2045 | /// Byte-wise comparison is used, to it is up to serializer to enforce proper ordering, 2046 | /// for example for unsigned integer type big-endian encoding should be used. 2047 | /// 2048 | pub fn iter(&self) -> StorageIterator<'_> { 2049 | self.range(..) 2050 | } 2051 | 2052 | /// 2053 | /// Lookup key in the storage. 2054 | /// 2055 | pub fn get(&self, key: &Key) -> Result> { 2056 | let mut iter = self.range((Included(key), Included(key))); 2057 | Ok(iter.next().transpose()?.map(|kv| kv.1)) 2058 | } 2059 | 2060 | /// 2061 | /// Lookup u32 key in the storage. 2062 | /// 2063 | pub fn get_u32(&self, key: u32) -> Result> { 2064 | self.get(&key.to_be_bytes().to_vec()) 2065 | } 2066 | 2067 | /// 2068 | /// Lookup u64 key in the storage. 2069 | /// 2070 | pub fn get_u64(&self, key: u64) -> Result> { 2071 | self.get(&key.to_be_bytes().to_vec()) 2072 | } 2073 | 2074 | /// 2075 | /// Returns bidirectional iterator 2076 | /// 2077 | pub fn range>(&self, range: R) -> StorageIterator<'_> { 2078 | StorageIterator { 2079 | storage: self.storage, 2080 | trans: Some(&self), 2081 | from: range.start_bound().cloned(), 2082 | till: range.end_bound().cloned(), 2083 | left: TreePath::new(), 2084 | right: TreePath::new(), 2085 | } 2086 | } 2087 | /// 2088 | /// Traverse B-Tree, check B-Tree invariants and return total number of keys in B-Tree 2089 | /// 2090 | pub fn verify(&self) -> Result { 2091 | ensure!(self.status == TransactionStatus::InProgress); 2092 | if self.db.meta.root != 0 { 2093 | let mut prev_key = Vec::new(); 2094 | self.storage 2095 | .traverse(self.db.meta.root, &mut prev_key, self.db.meta.height) 2096 | } else { 2097 | Ok(0) 2098 | } 2099 | } 2100 | 2101 | /// 2102 | /// Get database info 2103 | /// 2104 | pub fn get_database_info(&self) -> DatabaseInfo { 2105 | self.db.get_info() 2106 | } 2107 | 2108 | /// 2109 | /// Get cache info 2110 | /// 2111 | pub fn get_cache_info(&self) -> CacheInfo { 2112 | self.storage.get_cache_info() 2113 | } 2114 | } 2115 | 2116 | impl<'a> Drop for Transaction<'a> { 2117 | fn drop(&mut self) { 2118 | if self.status == TransactionStatus::InProgress { 2119 | self.storage.rollback(&mut self.db).unwrap(); 2120 | } 2121 | } 2122 | } 2123 | -------------------------------------------------------------------------------- /tests/storage_spec.rs: -------------------------------------------------------------------------------- 1 | use anyhow::{bail, Result}; 2 | use rand::rngs::StdRng; 3 | use rand::{Rng, SeedableRng}; 4 | use std::convert::TryInto; 5 | use std::iter; 6 | use std::path::Path; 7 | use std::sync::Arc; 8 | use std::thread; 9 | use std::time::Instant; 10 | 11 | use yakv::storage::*; 12 | 13 | const RAND_SEED: u64 = 2021; 14 | const N_RECORDS_LARGE: usize = 1000000; 15 | const N_RECORDS_SMALL: usize = 10000; 16 | 17 | #[test] 18 | fn test_basic_ops() 
{ 19 | let store = open_store("test1.dbs", Some("test1.log")); 20 | { 21 | let mut trans = store.start_transaction(); 22 | trans.put(&v(b"1"), &v(b"one")).unwrap(); 23 | trans.put(&v(b"2"), &v(b"two")).unwrap(); 24 | trans.put(&v(b"3"), &v(b"three")).unwrap(); 25 | trans.put(&v(b"4"), &v(b"four")).unwrap(); 26 | trans.put(&v(b"5"), &v(b"five")).unwrap(); 27 | assert_eq!(trans.get(&v(b"1")).unwrap().unwrap(), v(b"one")); 28 | trans.commit().unwrap(); 29 | } 30 | assert_eq!(store.get(&v(b"1")).unwrap().unwrap(), v(b"one")); 31 | 32 | let mut b = b'1'; 33 | for kv in store.iter().flatten() { 34 | assert_eq!(kv.0, vec![b]); 35 | b += 1; 36 | } 37 | assert_eq!(b, b'6'); 38 | 39 | assert_eq!( 40 | store 41 | .range(..v(b"3")) 42 | .flatten() 43 | .collect::>(), 44 | [(v(b"1"), v(b"one")), (v(b"2"), v(b"two"))] 45 | ); 46 | assert_eq!( 47 | store 48 | .range(v(b"3")..v(b"4")) 49 | .flatten() 50 | .collect::>(), 51 | [(v(b"3"), v(b"three"))] 52 | ); 53 | assert_eq!( 54 | store 55 | .range(v(b"1")..=v(b"2")) 56 | .flatten() 57 | .collect::>(), 58 | [(v(b"1"), v(b"one")), (v(b"2"), v(b"two"))] 59 | ); 60 | assert_eq!( 61 | store 62 | .range(v(b"5")..) 63 | .flatten() 64 | .collect::>(), 65 | [(v(b"5"), v(b"five"))] 66 | ); 67 | 68 | { 69 | let mut it = store.iter(); 70 | assert_eq!(it.next().unwrap().unwrap(), (v(b"1"), v(b"one"))); 71 | assert_eq!(it.next().unwrap().unwrap(), (v(b"2"), v(b"two"))); 72 | assert_eq!(it.next_back().unwrap().unwrap(), (v(b"5"), v(b"five"))); 73 | assert_eq!(it.next_back().unwrap().unwrap(), (v(b"4"), v(b"four"))); 74 | } 75 | { 76 | let mut it = store.range(..v(b"4")); 77 | assert_eq!(it.next_back().unwrap().unwrap(), (v(b"3"), v(b"three"))); 78 | assert_eq!(it.next_back().unwrap().unwrap(), (v(b"2"), v(b"two"))); 79 | assert_eq!(it.next().unwrap().unwrap(), (v(b"1"), v(b"one"))); 80 | assert_eq!(it.next().unwrap().unwrap(), (v(b"2"), v(b"two"))); 81 | } 82 | { 83 | let mut it = store.range(v(b"1")..=v(b"2")); 84 | assert_eq!(it.next().unwrap().unwrap(), (v(b"1"), v(b"one"))); 85 | assert_eq!(it.next().unwrap().unwrap(), (v(b"2"), v(b"two"))); 86 | assert!(it.next().is_none()); 87 | assert_eq!(it.next_back().unwrap().unwrap(), (v(b"2"), v(b"two"))); 88 | assert_eq!(it.next_back().unwrap().unwrap(), (v(b"1"), v(b"one"))); 89 | assert!(it.next_back().is_none()); 90 | } 91 | store.put(v(b"2"), v(b"two-two")).unwrap(); 92 | assert_eq!(store.get(&v(b"1")).unwrap().unwrap(), v(b"one")); 93 | assert_eq!(store.get(&v(b"2")).unwrap().unwrap(), v(b"two-two")); 94 | assert_eq!(store.get(&v(b"3")).unwrap().unwrap(), v(b"three")); 95 | 96 | store.remove(v(b"3")).unwrap(); 97 | assert_eq!( 98 | store 99 | .range(v(b"2")..v(b"5")) 100 | .flatten() 101 | .collect::>(), 102 | [(v(b"2"), v(b"two-two")), (v(b"4"), v(b"four"))] 103 | ); 104 | } 105 | 106 | fn seq_benchmark( 107 | db_path: &str, 108 | log_path: Option<&str>, 109 | n_records: usize, 110 | transaction_size: usize, 111 | ) -> Result<()> { 112 | let payload1: Vec = vec![1u8; 100]; 113 | let payload2: Vec = vec![2u8; 100]; 114 | { 115 | let store = open_store(db_path, log_path); 116 | let mut key: u64 = 0; 117 | 118 | let mut now = Instant::now(); 119 | for _ in 0..n_records / transaction_size { 120 | store.put_all( 121 | &mut iter::repeat_with(|| { 122 | key += 1; 123 | Ok((pack(key), payload1.clone())) 124 | }) 125 | .take(transaction_size), 126 | )?; 127 | } 128 | println!( 129 | "Elapsed time for {} inserts: {:?}", 130 | n_records, 131 | now.elapsed() 132 | ); 133 | 134 | now = Instant::now(); 135 | for i in 1..=n_records { 
136 | let key = (i as u64).to_be_bytes().to_vec(); 137 | assert_eq!(store.get(&key).unwrap().unwrap(), payload1); 138 | } 139 | println!( 140 | "Elapsed time for {} hot lookups: {:?}", 141 | n_records, 142 | now.elapsed() 143 | ); 144 | 145 | now = Instant::now(); 146 | key = 0; 147 | for _ in 0..n_records / transaction_size { 148 | let mut trans = store.start_transaction(); 149 | for _ in 0..transaction_size { 150 | key += 1; 151 | trans.put(&pack(key), &payload2)?; 152 | } 153 | trans.commit()?; 154 | } 155 | println!( 156 | "Elapsed time for {} updates: {:?}", 157 | n_records, 158 | now.elapsed() 159 | ); 160 | 161 | for i in 1..=n_records { 162 | let key = (i as u64).to_be_bytes().to_vec(); 163 | assert_eq!(store.get(&key).unwrap().unwrap(), payload2); 164 | } 165 | } 166 | { 167 | // reopen database 168 | let store = reopen_store(db_path, log_path); 169 | 170 | let mut now = Instant::now(); 171 | for i in 1..=n_records { 172 | let key = (i as u64).to_be_bytes().to_vec(); 173 | assert_eq!(store.get(&key).unwrap().unwrap(), payload2); 174 | } 175 | println!( 176 | "Elapsed time for {} cold lookups: {:?}", 177 | n_records, 178 | now.elapsed() 179 | ); 180 | 181 | now = Instant::now(); 182 | let mut key = 0; 183 | for _ in 0..n_records / transaction_size { 184 | store.remove_all( 185 | &mut iter::repeat_with(|| { 186 | key += 1; 187 | Ok(pack(key)) 188 | }) 189 | .take(transaction_size), 190 | )?; 191 | } 192 | println!( 193 | "Elapsed time for {} removes: {:?}", 194 | n_records, 195 | now.elapsed() 196 | ); 197 | } 198 | Ok(()) 199 | } 200 | 201 | fn rnd_benchmark( 202 | db_path: &str, 203 | log_path: Option<&str>, 204 | n_records: usize, 205 | transaction_size: usize, 206 | ) -> Result<()> { 207 | let payload1: Vec = vec![1u8; 100]; 208 | let payload2: Vec = vec![2u8; 100]; 209 | { 210 | let store = open_store(db_path, log_path); 211 | 212 | let mut rand = StdRng::seed_from_u64(RAND_SEED); 213 | let mut now = Instant::now(); 214 | for _ in 0..n_records / transaction_size { 215 | store.put_all( 216 | &mut iter::repeat_with(|| Ok((rand.gen::<[u8; 8]>().to_vec(), payload1.clone()))) 217 | .take(transaction_size), 218 | )?; 219 | } 220 | println!( 221 | "Elapsed time for {} inserts: {:?}", 222 | n_records, 223 | now.elapsed() 224 | ); 225 | 226 | now = Instant::now(); 227 | rand = StdRng::seed_from_u64(RAND_SEED); 228 | for _ in 0..n_records { 229 | let key = rand.gen::<[u8; 8]>().to_vec(); 230 | assert_eq!(store.get(&key).unwrap().unwrap(), payload1); 231 | } 232 | println!( 233 | "Elapsed time for {} hot lookups: {:?}", 234 | n_records, 235 | now.elapsed() 236 | ); 237 | 238 | now = Instant::now(); 239 | rand = StdRng::seed_from_u64(RAND_SEED); 240 | for _ in 0..n_records / transaction_size { 241 | let mut trans = store.start_transaction(); 242 | for _ in 0..transaction_size { 243 | trans.put(&rand.gen::<[u8; 8]>().to_vec(), &payload2)?; 244 | } 245 | trans.commit()?; 246 | } 247 | println!( 248 | "Elapsed time for {} updates: {:?}", 249 | n_records, 250 | now.elapsed() 251 | ); 252 | 253 | rand = StdRng::seed_from_u64(RAND_SEED); 254 | for _ in 0..n_records { 255 | let key = rand.gen::<[u8; 8]>().to_vec(); 256 | assert_eq!(store.get(&key).unwrap().unwrap(), payload2); 257 | } 258 | } 259 | { 260 | // reopen database 261 | let store = reopen_store(db_path, log_path); 262 | 263 | let mut now = Instant::now(); 264 | let mut rand = StdRng::seed_from_u64(RAND_SEED); 265 | for _ in 1..=n_records { 266 | let key = rand.gen::<[u8; 8]>().to_vec(); 267 | 
assert_eq!(store.get(&key).unwrap().unwrap(), payload2); 268 | } 269 | println!( 270 | "Elapsed time for {} cold lookups: {:?}", 271 | n_records, 272 | now.elapsed() 273 | ); 274 | 275 | now = Instant::now(); 276 | rand = StdRng::seed_from_u64(RAND_SEED); 277 | for _ in 0..n_records / transaction_size { 278 | store.remove_all( 279 | &mut iter::repeat_with(|| Ok(rand.gen::<[u8; 8]>().to_vec())) 280 | .take(transaction_size), 281 | )?; 282 | } 283 | println!( 284 | "Elapsed time for {} removes: {:?}", 285 | n_records, 286 | now.elapsed() 287 | ); 288 | } 289 | Ok(()) 290 | } 291 | 292 | #[test] 293 | fn seq_benchmark_wal_large_trans() { 294 | assert!(seq_benchmark("test2.dbs", Some("test2.log"), N_RECORDS_LARGE, 1000,).is_ok()); 295 | } 296 | 297 | #[test] 298 | fn seq_benchmark_wal_small_trans() { 299 | assert!(seq_benchmark("test3.dbs", Some("test3.log"), N_RECORDS_SMALL, 1,).is_ok()); 300 | } 301 | 302 | #[test] 303 | fn seq_benchmark_nowal_large_trans() { 304 | assert!(seq_benchmark("test4.dbs", None, N_RECORDS_LARGE, 1000,).is_ok()); 305 | } 306 | 307 | #[test] 308 | fn seq_benchmark_nowal_small_trans() { 309 | assert!(seq_benchmark("test5.dbs", None, N_RECORDS_LARGE, 1,).is_ok()); 310 | } 311 | 312 | #[test] 313 | fn rnd_benchmark_wal_large_trans() { 314 | assert!(rnd_benchmark("test6.dbs", Some("test6.log"), N_RECORDS_LARGE, 1000,).is_ok()); 315 | } 316 | 317 | #[test] 318 | fn rnd_benchmark_wal_small_trans() { 319 | assert!(rnd_benchmark("test7.dbs", Some("test7.log"), N_RECORDS_SMALL, 1,).is_ok()); 320 | } 321 | 322 | #[test] 323 | fn rnd_benchmark_nowal_large_trans() { 324 | assert!(rnd_benchmark("test8.dbs", None, N_RECORDS_LARGE, 1000,).is_ok()); 325 | } 326 | 327 | #[test] 328 | fn rnd_benchmark_nowal_small_trans() { 329 | assert!(rnd_benchmark("test9.dbs", None, N_RECORDS_LARGE, 1,).is_ok()); 330 | } 331 | 332 | #[test] 333 | fn test_acid() { 334 | let store = open_store("test10.dbs", Some("test10.log")); 335 | 336 | assert!(store 337 | .put_all(&mut (0..100).map(|key| { 338 | if key == 50 { 339 | bail!("Simulate failure") 340 | } else { 341 | Ok((pack(key), v(b"hello world!"))) 342 | } 343 | })) 344 | .is_err()); 345 | 346 | assert_eq!(store.iter().count(), 0); 347 | 348 | assert!(store 349 | .put_all(&mut (0..100).map(|key| { Ok((pack(key), v(b"hello world!"))) })) 350 | .is_ok()); 351 | 352 | assert!(store 353 | .put_all(&mut (0..100).map(|key| { 354 | if key == 50 { 355 | bail!("Simulate failure") 356 | } else { 357 | Ok((pack(key), v(b"good bye!"))) 358 | } 359 | })) 360 | .is_err()); 361 | 362 | assert_eq!( 363 | store 364 | .iter() 365 | .flatten() 366 | .map(|kv| assert_eq!(kv.1, v(b"hello world!"))) 367 | .count(), 368 | 100 369 | ); 370 | 371 | assert!(store 372 | .remove_all(&mut (0..100).map(|key| { 373 | if key == 50 { 374 | bail!("Simulate failure") 375 | } else { 376 | Ok(pack(key)) 377 | } 378 | })) 379 | .is_err()); 380 | 381 | assert_eq!(store.iter().count(), 100); 382 | } 383 | 384 | #[test] 385 | fn test_recovery() { 386 | let data_path = Path::new("test11.dbs"); 387 | let log_path = Path::new("test11.log"); 388 | const N_KEYS: u64 = 100000; 389 | { 390 | let _ = std::fs::remove_file(&data_path); 391 | let _ = std::fs::remove_file(&log_path); 392 | let mut cfg = StorageConfig::default(); 393 | cfg.wal_flush_threshold = 1; 394 | let store = Storage::open(data_path, Some(log_path), cfg).unwrap(); 395 | { 396 | let mut trans = store.start_transaction(); 397 | for key in 0..N_KEYS { 398 | trans.put(&pack(key), &v(b"first")).unwrap(); 399 | } 400 | 
trans.commit().unwrap(); 401 | } 402 | { 403 | let mut trans = store.start_transaction(); 404 | for key in 0..N_KEYS { 405 | trans.put(&pack(key), &v(b"two")).unwrap(); 406 | } 407 | trans.commit().unwrap(); 408 | } 409 | { 410 | let mut trans = store.start_transaction(); 411 | for key in 0..N_KEYS { 412 | trans.put(&pack(key), &v(b"three")).unwrap(); 413 | } 414 | // transaction shoud be implicitly aborted 415 | } 416 | store.shutdown().unwrap(); // do not truncate WAL 417 | } 418 | { 419 | let store = Storage::open(data_path, Some(log_path), StorageConfig::default()).unwrap(); 420 | let recovery = store.get_recovery_status(); 421 | assert_eq!(recovery.recovered_transactions, 2); 422 | assert!(recovery.wal_size > recovery.recovery_end); 423 | for key in 0..N_KEYS { 424 | assert_eq!(store.get(&pack(key)).unwrap().unwrap(), v(b"two")); 425 | } 426 | } 427 | } 428 | 429 | fn do_inserts(s: Arc, tid: u32, n_records: u32) -> Result<()> { 430 | let tid_bytes = tid.to_be_bytes(); 431 | for id in 0..n_records { 432 | let mut key: Vec = Vec::new(); 433 | key.extend_from_slice(&id.to_be_bytes()); 434 | key.extend_from_slice(&tid_bytes); 435 | s.put(key, tid_bytes.to_vec())?; 436 | } 437 | Ok(()) 438 | } 439 | 440 | fn do_selects(s: Arc, n_records: usize) { 441 | while s.iter().count() != n_records {} 442 | } 443 | 444 | #[test] 445 | fn test_parallel_access() { 446 | let store = Arc::new(open_store("test1.dbs", None)); 447 | let n_writers = 10u32; 448 | let n_records = 10000u32; 449 | let mut threads = Vec::new(); 450 | for i in 0..n_writers { 451 | let s = store.clone(); 452 | threads.push(thread::spawn(move || { 453 | do_inserts(s, i, n_records).unwrap(); 454 | })); 455 | } 456 | let s = store.clone(); 457 | threads.push(thread::spawn(move || { 458 | do_selects(s, n_records as usize * n_writers as usize); 459 | })); 460 | for t in threads { 461 | t.join().expect("Thread crashed"); 462 | } 463 | let mut id = 0u32; 464 | let mut tid = 0u32; 465 | for entry in store.iter() { 466 | let pair = entry.unwrap(); 467 | let key = pair.0; 468 | let value = pair.1; 469 | let curr_id = u32::from_be_bytes(key[0..4].try_into().unwrap()); 470 | let curr_tid = u32::from_be_bytes(key[4..8].try_into().unwrap()); 471 | let curr_value = u32::from_be_bytes(value.try_into().unwrap()); 472 | assert_eq!(curr_id, id); 473 | assert_eq!(curr_tid, tid); 474 | assert_eq!(curr_value, tid); 475 | tid += 1; 476 | if tid == n_writers { 477 | tid = 0; 478 | id += 1; 479 | } 480 | } 481 | assert_eq!(id, n_records); 482 | } 483 | 484 | fn v(b: &[u8]) -> Key { 485 | b.to_vec() 486 | } 487 | 488 | fn pack(key: u64) -> Vec { 489 | key.to_be_bytes().to_vec() 490 | } 491 | 492 | fn open_store(data_file: &str, log_file: Option<&str>) -> Storage { 493 | let data_path = Path::new(data_file); 494 | let log_path = log_file.map(|wal| Path::new(wal)); 495 | let _ = std::fs::remove_file(&data_path); 496 | if let Some(log) = log_path { 497 | let _ = std::fs::remove_file(&log); 498 | } 499 | Storage::open(data_path, log_path, StorageConfig::default()).unwrap() 500 | } 501 | 502 | fn reopen_store(data_file: &str, log_file: Option<&str>) -> Storage { 503 | let data_path = Path::new(data_file); 504 | let log_path = log_file.map(|wal| Path::new(wal)); 505 | Storage::open(data_path, log_path, StorageConfig::default()).unwrap() 506 | } 507 | --------------------------------------------------------------------------------
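
A minimal usage sketch of the public API exercised above. This is not a file from the repository: it is an illustrative example assuming the `Storage`/`Transaction` interface shown in src/storage.rs and tests/storage_spec.rs; the file names "example.dbs" and "example.log" and the `main` wrapper are hypothetical.

use std::path::Path;
use yakv::storage::*;

fn main() -> anyhow::Result<()> {
    // Open (or create) a database file with a write-ahead log for durability.
    let store = Storage::open(
        Path::new("example.dbs"),         // hypothetical data file name
        Some(Path::new("example.log")),   // hypothetical WAL file name
        StorageConfig::default(),
    )?;

    // Autocommit operations: each call runs as its own transaction.
    store.put(b"1".to_vec(), b"one".to_vec())?;
    assert_eq!(store.get(&b"1".to_vec())?, Some(b"one".to_vec()));

    // Explicit transaction: changes take effect only after commit();
    // dropping an uncommitted transaction rolls it back.
    let mut trans = store.start_transaction();
    trans.put(&b"2".to_vec(), &b"two".to_vec())?;
    trans.commit()?;

    // Bidirectional range iteration in byte-wise ascending key order.
    for kv in store.range(b"1".to_vec()..).flatten() {
        println!("{:?} -> {:?}", kv.0, kv.1);
    }

    // Sync the data file and truncate the WAL on normal shutdown.
    store.close()?;
    Ok(())
}

Keys and values are plain byte vectors, so (as the tests do with `to_be_bytes`) big-endian encoding should be used for integer keys to keep the byte-wise ordering meaningful.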