├── Cargo.toml ├── LICENSE └── src ├── merge.rs ├── transaction.rs ├── rebalance.rs ├── put.rs ├── txn.rs └── del.rs /Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "sanakirja" 3 | description = "A key-value dictionary, using copy-on-write and B trees." 4 | version = "0.4.4" 5 | authors = ["Pierre-Étienne Meunier"] 6 | license = "MPL-2.0" 7 | documentation = "http://pijul.org/sanakirja/doc/sanakirja" 8 | repository = "http://pijul.org/sanakirja" 9 | include = ["Cargo.toml","src/transaction.rs","src/txn.rs","src/lib.rs","src/put.rs","src/del.rs","src/merge.rs","src/rebalance.rs"] 10 | 11 | [dependencies] 12 | log="0.3" 13 | env_logger="0.3" 14 | fs2="0.2" 15 | rand="0.3" 16 | memmap = "0.3" 17 | rustc-serialize = "0.3" 18 | 19 | [dev-dependencies] 20 | tempdir="0.3" 21 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Mozilla Public License Version 2.0 2 | ================================== 3 | 4 | 1. Definitions 5 | -------------- 6 | 7 | 1.1. "Contributor" 8 | means each individual or legal entity that creates, contributes to 9 | the creation of, or owns Covered Software. 10 | 11 | 1.2. "Contributor Version" 12 | means the combination of the Contributions of others (if any) used 13 | by a Contributor and that particular Contributor's Contribution. 14 | 15 | 1.3. "Contribution" 16 | means Covered Software of a particular Contributor. 17 | 18 | 1.4. "Covered Software" 19 | means Source Code Form to which the initial Contributor has attached 20 | the notice in Exhibit A, the Executable Form of such Source Code 21 | Form, and Modifications of such Source Code Form, in each case 22 | including portions thereof. 23 | 24 | 1.5. "Incompatible With Secondary Licenses" 25 | means 26 | 27 | (a) that the initial Contributor has attached the notice described 28 | in Exhibit B to the Covered Software; or 29 | 30 | (b) that the Covered Software was made available under the terms of 31 | version 1.1 or earlier of the License, but not also under the 32 | terms of a Secondary License. 33 | 34 | 1.6. "Executable Form" 35 | means any form of the work other than Source Code Form. 36 | 37 | 1.7. "Larger Work" 38 | means a work that combines Covered Software with other material, in 39 | a separate file or files, that is not Covered Software. 40 | 41 | 1.8. "License" 42 | means this document. 43 | 44 | 1.9. "Licensable" 45 | means having the right to grant, to the maximum extent possible, 46 | whether at the time of the initial grant or subsequently, any and 47 | all of the rights conveyed by this License. 48 | 49 | 1.10. "Modifications" 50 | means any of the following: 51 | 52 | (a) any file in Source Code Form that results from an addition to, 53 | deletion from, or modification of the contents of Covered 54 | Software; or 55 | 56 | (b) any new file in Source Code Form that contains any Covered 57 | Software. 58 | 59 | 1.11. "Patent Claims" of a Contributor 60 | means any patent claim(s), including without limitation, method, 61 | process, and apparatus claims, in any patent Licensable by such 62 | Contributor that would be infringed, but for the grant of the 63 | License, by the making, using, selling, offering for sale, having 64 | made, import, or transfer of either its Contributions or its 65 | Contributor Version. 66 | 67 | 1.12. 
"Secondary License" 68 | means either the GNU General Public License, Version 2.0, the GNU 69 | Lesser General Public License, Version 2.1, the GNU Affero General 70 | Public License, Version 3.0, or any later versions of those 71 | licenses. 72 | 73 | 1.13. "Source Code Form" 74 | means the form of the work preferred for making modifications. 75 | 76 | 1.14. "You" (or "Your") 77 | means an individual or a legal entity exercising rights under this 78 | License. For legal entities, "You" includes any entity that 79 | controls, is controlled by, or is under common control with You. For 80 | purposes of this definition, "control" means (a) the power, direct 81 | or indirect, to cause the direction or management of such entity, 82 | whether by contract or otherwise, or (b) ownership of more than 83 | fifty percent (50%) of the outstanding shares or beneficial 84 | ownership of such entity. 85 | 86 | 2. License Grants and Conditions 87 | -------------------------------- 88 | 89 | 2.1. Grants 90 | 91 | Each Contributor hereby grants You a world-wide, royalty-free, 92 | non-exclusive license: 93 | 94 | (a) under intellectual property rights (other than patent or trademark) 95 | Licensable by such Contributor to use, reproduce, make available, 96 | modify, display, perform, distribute, and otherwise exploit its 97 | Contributions, either on an unmodified basis, with Modifications, or 98 | as part of a Larger Work; and 99 | 100 | (b) under Patent Claims of such Contributor to make, use, sell, offer 101 | for sale, have made, import, and otherwise transfer either its 102 | Contributions or its Contributor Version. 103 | 104 | 2.2. Effective Date 105 | 106 | The licenses granted in Section 2.1 with respect to any Contribution 107 | become effective for each Contribution on the date the Contributor first 108 | distributes such Contribution. 109 | 110 | 2.3. Limitations on Grant Scope 111 | 112 | The licenses granted in this Section 2 are the only rights granted under 113 | this License. No additional rights or licenses will be implied from the 114 | distribution or licensing of Covered Software under this License. 115 | Notwithstanding Section 2.1(b) above, no patent license is granted by a 116 | Contributor: 117 | 118 | (a) for any code that a Contributor has removed from Covered Software; 119 | or 120 | 121 | (b) for infringements caused by: (i) Your and any other third party's 122 | modifications of Covered Software, or (ii) the combination of its 123 | Contributions with other software (except as part of its Contributor 124 | Version); or 125 | 126 | (c) under Patent Claims infringed by Covered Software in the absence of 127 | its Contributions. 128 | 129 | This License does not grant any rights in the trademarks, service marks, 130 | or logos of any Contributor (except as may be necessary to comply with 131 | the notice requirements in Section 3.4). 132 | 133 | 2.4. Subsequent Licenses 134 | 135 | No Contributor makes additional grants as a result of Your choice to 136 | distribute the Covered Software under a subsequent version of this 137 | License (see Section 10.2) or under the terms of a Secondary License (if 138 | permitted under the terms of Section 3.3). 139 | 140 | 2.5. Representation 141 | 142 | Each Contributor represents that the Contributor believes its 143 | Contributions are its original creation(s) or it has sufficient rights 144 | to grant the rights to its Contributions conveyed by this License. 145 | 146 | 2.6. 
Fair Use 147 | 148 | This License is not intended to limit any rights You have under 149 | applicable copyright doctrines of fair use, fair dealing, or other 150 | equivalents. 151 | 152 | 2.7. Conditions 153 | 154 | Sections 3.1, 3.2, 3.3, and 3.4 are conditions of the licenses granted 155 | in Section 2.1. 156 | 157 | 3. Responsibilities 158 | ------------------- 159 | 160 | 3.1. Distribution of Source Form 161 | 162 | All distribution of Covered Software in Source Code Form, including any 163 | Modifications that You create or to which You contribute, must be under 164 | the terms of this License. You must inform recipients that the Source 165 | Code Form of the Covered Software is governed by the terms of this 166 | License, and how they can obtain a copy of this License. You may not 167 | attempt to alter or restrict the recipients' rights in the Source Code 168 | Form. 169 | 170 | 3.2. Distribution of Executable Form 171 | 172 | If You distribute Covered Software in Executable Form then: 173 | 174 | (a) such Covered Software must also be made available in Source Code 175 | Form, as described in Section 3.1, and You must inform recipients of 176 | the Executable Form how they can obtain a copy of such Source Code 177 | Form by reasonable means in a timely manner, at a charge no more 178 | than the cost of distribution to the recipient; and 179 | 180 | (b) You may distribute such Executable Form under the terms of this 181 | License, or sublicense it under different terms, provided that the 182 | license for the Executable Form does not attempt to limit or alter 183 | the recipients' rights in the Source Code Form under this License. 184 | 185 | 3.3. Distribution of a Larger Work 186 | 187 | You may create and distribute a Larger Work under terms of Your choice, 188 | provided that You also comply with the requirements of this License for 189 | the Covered Software. If the Larger Work is a combination of Covered 190 | Software with a work governed by one or more Secondary Licenses, and the 191 | Covered Software is not Incompatible With Secondary Licenses, this 192 | License permits You to additionally distribute such Covered Software 193 | under the terms of such Secondary License(s), so that the recipient of 194 | the Larger Work may, at their option, further distribute the Covered 195 | Software under the terms of either this License or such Secondary 196 | License(s). 197 | 198 | 3.4. Notices 199 | 200 | You may not remove or alter the substance of any license notices 201 | (including copyright notices, patent notices, disclaimers of warranty, 202 | or limitations of liability) contained within the Source Code Form of 203 | the Covered Software, except that You may alter any license notices to 204 | the extent required to remedy known factual inaccuracies. 205 | 206 | 3.5. Application of Additional Terms 207 | 208 | You may choose to offer, and to charge a fee for, warranty, support, 209 | indemnity or liability obligations to one or more recipients of Covered 210 | Software. However, You may do so only on Your own behalf, and not on 211 | behalf of any Contributor. You must make it absolutely clear that any 212 | such warranty, support, indemnity, or liability obligation is offered by 213 | You alone, and You hereby agree to indemnify every Contributor for any 214 | liability incurred by such Contributor as a result of warranty, support, 215 | indemnity or liability terms You offer. 
You may include additional 216 | disclaimers of warranty and limitations of liability specific to any 217 | jurisdiction. 218 | 219 | 4. Inability to Comply Due to Statute or Regulation 220 | --------------------------------------------------- 221 | 222 | If it is impossible for You to comply with any of the terms of this 223 | License with respect to some or all of the Covered Software due to 224 | statute, judicial order, or regulation then You must: (a) comply with 225 | the terms of this License to the maximum extent possible; and (b) 226 | describe the limitations and the code they affect. Such description must 227 | be placed in a text file included with all distributions of the Covered 228 | Software under this License. Except to the extent prohibited by statute 229 | or regulation, such description must be sufficiently detailed for a 230 | recipient of ordinary skill to be able to understand it. 231 | 232 | 5. Termination 233 | -------------- 234 | 235 | 5.1. The rights granted under this License will terminate automatically 236 | if You fail to comply with any of its terms. However, if You become 237 | compliant, then the rights granted under this License from a particular 238 | Contributor are reinstated (a) provisionally, unless and until such 239 | Contributor explicitly and finally terminates Your grants, and (b) on an 240 | ongoing basis, if such Contributor fails to notify You of the 241 | non-compliance by some reasonable means prior to 60 days after You have 242 | come back into compliance. Moreover, Your grants from a particular 243 | Contributor are reinstated on an ongoing basis if such Contributor 244 | notifies You of the non-compliance by some reasonable means, this is the 245 | first time You have received notice of non-compliance with this License 246 | from such Contributor, and You become compliant prior to 30 days after 247 | Your receipt of the notice. 248 | 249 | 5.2. If You initiate litigation against any entity by asserting a patent 250 | infringement claim (excluding declaratory judgment actions, 251 | counter-claims, and cross-claims) alleging that a Contributor Version 252 | directly or indirectly infringes any patent, then the rights granted to 253 | You by any and all Contributors for the Covered Software under Section 254 | 2.1 of this License shall terminate. 255 | 256 | 5.3. In the event of termination under Sections 5.1 or 5.2 above, all 257 | end user license agreements (excluding distributors and resellers) which 258 | have been validly granted by You or Your distributors under this License 259 | prior to termination shall survive termination. 260 | 261 | ************************************************************************ 262 | * * 263 | * 6. Disclaimer of Warranty * 264 | * ------------------------- * 265 | * * 266 | * Covered Software is provided under this License on an "as is" * 267 | * basis, without warranty of any kind, either expressed, implied, or * 268 | * statutory, including, without limitation, warranties that the * 269 | * Covered Software is free of defects, merchantable, fit for a * 270 | * particular purpose or non-infringing. The entire risk as to the * 271 | * quality and performance of the Covered Software is with You. * 272 | * Should any Covered Software prove defective in any respect, You * 273 | * (not any Contributor) assume the cost of any necessary servicing, * 274 | * repair, or correction. This disclaimer of warranty constitutes an * 275 | * essential part of this License. 
No use of any Covered Software is * 276 | * authorized under this License except under this disclaimer. * 277 | * * 278 | ************************************************************************ 279 | 280 | ************************************************************************ 281 | * * 282 | * 7. Limitation of Liability * 283 | * -------------------------- * 284 | * * 285 | * Under no circumstances and under no legal theory, whether tort * 286 | * (including negligence), contract, or otherwise, shall any * 287 | * Contributor, or anyone who distributes Covered Software as * 288 | * permitted above, be liable to You for any direct, indirect, * 289 | * special, incidental, or consequential damages of any character * 290 | * including, without limitation, damages for lost profits, loss of * 291 | * goodwill, work stoppage, computer failure or malfunction, or any * 292 | * and all other commercial damages or losses, even if such party * 293 | * shall have been informed of the possibility of such damages. This * 294 | * limitation of liability shall not apply to liability for death or * 295 | * personal injury resulting from such party's negligence to the * 296 | * extent applicable law prohibits such limitation. Some * 297 | * jurisdictions do not allow the exclusion or limitation of * 298 | * incidental or consequential damages, so this exclusion and * 299 | * limitation may not apply to You. * 300 | * * 301 | ************************************************************************ 302 | 303 | 8. Litigation 304 | ------------- 305 | 306 | Any litigation relating to this License may be brought only in the 307 | courts of a jurisdiction where the defendant maintains its principal 308 | place of business and such litigation shall be governed by laws of that 309 | jurisdiction, without reference to its conflict-of-law provisions. 310 | Nothing in this Section shall prevent a party's ability to bring 311 | cross-claims or counter-claims. 312 | 313 | 9. Miscellaneous 314 | ---------------- 315 | 316 | This License represents the complete agreement concerning the subject 317 | matter hereof. If any provision of this License is held to be 318 | unenforceable, such provision shall be reformed only to the extent 319 | necessary to make it enforceable. Any law or regulation which provides 320 | that the language of a contract shall be construed against the drafter 321 | shall not be used to construe this License against a Contributor. 322 | 323 | 10. Versions of the License 324 | --------------------------- 325 | 326 | 10.1. New Versions 327 | 328 | Mozilla Foundation is the license steward. Except as provided in Section 329 | 10.3, no one other than the license steward has the right to modify or 330 | publish new versions of this License. Each version will be given a 331 | distinguishing version number. 332 | 333 | 10.2. Effect of New Versions 334 | 335 | You may distribute the Covered Software under the terms of the version 336 | of the License under which You originally received the Covered Software, 337 | or under the terms of any subsequent version published by the license 338 | steward. 339 | 340 | 10.3. Modified Versions 341 | 342 | If you create software not governed by this License, and you want to 343 | create a new license for such software, you may create and use a 344 | modified version of this License if you rename the license and remove 345 | any references to the name of the license steward (except to note that 346 | such modified license differs from this License). 347 | 348 | 10.4. 
Distributing Source Code Form that is Incompatible With Secondary 349 | Licenses 350 | 351 | If You choose to distribute Source Code Form that is Incompatible With 352 | Secondary Licenses under the terms of this version of the License, the 353 | notice described in Exhibit B of this License must be attached. 354 | 355 | Exhibit A - Source Code Form License Notice 356 | ------------------------------------------- 357 | 358 | This Source Code Form is subject to the terms of the Mozilla Public 359 | License, v. 2.0. If a copy of the MPL was not distributed with this 360 | file, You can obtain one at http://mozilla.org/MPL/2.0/. 361 | 362 | If it is not possible or desirable to put the notice in a particular 363 | file, then You may include the notice in a location (such as a LICENSE 364 | file in a relevant directory) where a recipient would be likely to look 365 | for such a notice. 366 | 367 | You may add additional accurate notices of copyright ownership. 368 | 369 | Exhibit B - "Incompatible With Secondary Licenses" Notice 370 | --------------------------------------------------------- 371 | 372 | This Source Code Form is "Incompatible With Secondary Licenses", as 373 | defined by the Mozilla Public License, v. 2.0. 374 | -------------------------------------------------------------------------------- /src/merge.rs: -------------------------------------------------------------------------------- 1 | use super::txn::*; 2 | use super::transaction::{PAGE_SIZE,Error}; 3 | use std; 4 | use rand::{Rng}; 5 | 6 | extern crate log; 7 | use super::put::*; 8 | 9 | use super::del::Smallest; 10 | 11 | /// Add all bindings from `source` to `target`, assuming `target` has 12 | /// enough free space and doesn't need compaction. 13 | /// 14 | /// Forget offset `forgetting` during the copy, replacing its left 15 | /// child with `replace_page`. 16 | fn merge_page<R:Rng,T>( 17 | rng:&mut R, 18 | txn:&mut MutTxn<T>, 19 | source:&Cow, mut target:&mut MutPage, 20 | levels:&mut [u16], 21 | forgetting:u16, replace_page:u64, increment_children:bool) -> Result<(),Error> { 22 | unsafe { 23 | // A pointer to the last inserted value, so we can replace the 24 | // deleted's left child with `replace_page` 25 | let mut current_ptr = target.offset(levels[0] as isize); 26 | // Let's go. 27 | for (current, key,value,r) in PageIterator::new(source, 0) { 28 | debug!("merge_page: {:?} {:?} {:?} {:?}", current, std::str::from_utf8(key), r, increment_children); 29 | if current != forgetting { 30 | let size = record_size(key.len(), value.len() as usize); 31 | let off = target.can_alloc(size); 32 | debug_assert!(off > 0); 33 | debug_assert!(off + size <= PAGE_SIZE as u16); 34 | current_ptr = target.offset(off as isize); 35 | debug!("merge_page: off={:?}", off); 36 | let page_will_be_forgotten = // If the next one is going to be forgotten, we'll replace its page. 37 | u16::from_le(*(source.offset(current as isize) as *const u16)) == forgetting; 38 | 39 | if increment_children { 40 | if r > 0 && !page_will_be_forgotten { 41 | try!(incr_rc(rng, txn, r)) 42 | } 43 | if let UnsafeValue::O { offset, ..
} = value { 44 | try!(incr_rc(rng, txn, offset)) 45 | } 46 | } 47 | local_insert_at(rng, target, key, value, r, off, size, levels); 48 | } else { 49 | debug!("forget, replace with {:?}", replace_page); 50 | // debug!("forget, not freeing {:?} {:?}", do_free_value, value); 51 | /*if do_free_value && !increment_children { 52 | if let UnsafeValue::O { offset, len } = value { 53 | try!(free_value(rng, txn, offset, len)) 54 | } 55 | }*/ 56 | *((current_ptr as *mut u64).offset(2)) = replace_page.to_le() 57 | } 58 | } 59 | } 60 | Ok(()) 61 | } 62 | 63 | /// Merge a left child into a right child, adding the separator 64 | /// element (given as (key,value)), forgetting one value, and 65 | /// replacing the left child of that value with `replace_page`. 66 | fn merge_right<R:Rng,T>( 67 | rng:&mut R, 68 | txn:&mut MutTxn<T>, 69 | left:&Cow, right:&mut MutPage, forgetting:u16, replace_page:u64, 70 | key:&[u8], value:UnsafeValue, increment_children:bool) -> Result<(), Error> { 71 | unsafe { 72 | debug!("merge right {:?} {:?} {:?}", left.page_offset(), right.page_offset(), std::str::from_utf8(key)); 73 | // Merge the left page into the right page. 74 | // TODO: maybe we need to compact `right`. 75 | let mut levels = [0;N_LEVELS]; 76 | let right_left_child = u64::from_le(*((right.offset(0) as *const u64).offset(2))); 77 | let left_left_child = *((left.offset(0) as *const u64).offset(2)); 78 | *((right.offset(0) as *mut u64).offset(2)) = left_left_child.to_le(); 79 | 80 | let page_will_be_forgotten = u16::from_le(*(left.offset(FIRST_HEAD as isize) as *const u16)) == forgetting; 81 | debug!("page_will_be_forgotten = {:?}", page_will_be_forgotten); 82 | if increment_children && left_left_child > 0 && !page_will_be_forgotten { 83 | try!(incr_rc(rng, txn, left_left_child)) 84 | } 85 | try!(merge_page(rng, txn, left, right, &mut levels, forgetting, replace_page, increment_children)); 86 | 87 | let size = record_size(key.len(), value.len() as usize); 88 | let off = right.can_alloc(size); 89 | debug_assert!(off + size <= PAGE_SIZE as u16); 90 | // Already incremented by the caller (when they copied "right"). 91 | /*if increment_children && right_left_child > 0 { 92 | try!(incr_rc(rng, txn, right_left_child)) 93 | }*/ 94 | local_insert_at(rng, right, key, value, right_left_child, off, size, &mut levels); 95 | } 96 | Ok(()) 97 | } 98 | 99 | /// Merge a right child into a left child, adding the separator 100 | /// element (given as (key,value)), forgetting one value, and 101 | /// replacing the left child of that value with `replace_page`. 102 | fn merge_left<R:Rng,T>( 103 | rng:&mut R, 104 | txn:&mut MutTxn<T>, 105 | right:&Cow, left:&mut MutPage, forgetting:u16, replace_page:u64, 106 | key:&[u8], value:UnsafeValue, 107 | increment_children:bool) -> Result<(), Error> { 108 | unsafe { 109 | debug!("merge left {:?} {:?} {:?}", right.page_offset(), left.page_offset(), std::str::from_utf8(key)); 110 | let mut levels = [0;N_LEVELS]; 111 | // First mission: set the levels to the last entry. 112 | let mut l = N_LEVELS-1; 113 | loop { 114 | loop { 115 | let next = u16::from_le(*((left.offset(levels[l] as isize) as *const u16).offset(l as isize))); 116 | if next != NIL { 117 | levels[l] = next 118 | } else { 119 | break 120 | } 121 | } 122 | if l == 0 { 123 | break 124 | } else { 125 | l-=1; 126 | levels[l] = levels[l+1] 127 | } 128 | } 129 | // Then, insert the separator, with child page the leftmost child of `right`.
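// For intuition: the level-descent loop above is the standard backward
// skip-list walk, expressed on raw u16 on-page offsets. The same idea,
// sketched on a hypothetical `next(offset, level)` accessor (an
// assumption for readability, not an API of this file):
//
//     let mut levels = [0; N_LEVELS];        // every level starts at the page head
//     let mut l = N_LEVELS - 1;
//     loop {
//         while next(levels[l], l) != NIL {  // run to the last entry of level l
//             levels[l] = next(levels[l], l);
//         }
//         if l == 0 { break }
//         l -= 1;
//         levels[l] = levels[l + 1];         // lower level resumes where the upper stopped
//     }
//     // levels[0] now points to the last binding on the page, after
//     // which the separator is inserted below.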
130 | debug!("levels={:?}", levels); 131 | { 132 | let child = u64::from_le(*((right.offset(FIRST_HEAD as isize) as *const u64).offset(2))); 133 | let size = record_size(key.len(), value.len() as usize); 134 | let off = left.can_alloc(size); 135 | // TODO: compact if necessary. 136 | debug_assert!(off + size <= PAGE_SIZE as u16); 137 | let page_will_be_forgotten = u16::from_le(*(right.offset(FIRST_HEAD as isize) as *const u16)) == forgetting; 138 | debug!("page_will_be_forgotten = {:?}", page_will_be_forgotten); 139 | if increment_children && child > 0 && !page_will_be_forgotten { 140 | try!(incr_rc(rng, txn, child)) 141 | } 142 | local_insert_at(rng, left, key, value, child, off, size, &mut levels); 143 | } 144 | // Finally, add all elements from `right` to `left`. 145 | try!(merge_page(rng, txn, right, left, &mut levels, forgetting, replace_page, increment_children)); 146 | } 147 | Ok(()) 148 | } 149 | 150 | 151 | /// Assuming `child_page` is the right child of the binding given by 152 | /// `levels`, merge it into its right sibling. 153 | pub fn merge_children_right<R:Rng,T>( 154 | rng:&mut R, txn:&mut MutTxn<T>, page:Cow, 155 | levels:[u16;N_LEVELS], 156 | child_page:&Cow, child_will_be_dup:bool, 157 | delete:&[u16], merged:u64, 158 | page_will_be_dup:bool) -> Result<Res,Error> { 159 | 160 | debug!("merge_children_right {:?}", page_will_be_dup); 161 | 162 | let next_offset = unsafe { u16::from_le(*(page.offset(levels[0] as isize) as *const u16)) }; 163 | let next_ptr = page.offset(next_offset as isize); 164 | let right_sibling = txn.load_cow_page(unsafe { u64::from_le(*(next_ptr as *const u64).offset(2)) }); 165 | debug_assert!(child_page.page_offset() != right_sibling.page_offset()); 166 | let right_sibling_size = right_sibling.occupied(); 167 | 168 | // Separator 169 | let (next_key, next_value) = unsafe { read_key_value(next_ptr) }; 170 | let next_record_size = record_size(next_key.len(), next_value.len() as usize); 171 | 172 | // Size of the element deleted in `child_page`. 173 | let forgetting = u16::from_le(unsafe { *(child_page.offset(delete[0] as isize) as *const u16) }); 174 | let deleted_size = { 175 | let ptr = child_page.offset(forgetting as isize); 176 | let (key,value) = unsafe { read_key_value(ptr) }; 177 | debug!("delete key: {:?}", std::str::from_utf8(key)); 178 | record_size(key.len(), value.len() as usize) 179 | }; 180 | debug!("child_page_occupied {:?} {:?}", child_page.occupied(), deleted_size); 181 | 182 | if right_sibling_size + child_page.occupied() - deleted_size - 24 + next_record_size <= PAGE_SIZE as u16 { 183 | // Merge child_page into its right sibling. 184 | if page_will_be_dup { 185 | if let UnsafeValue::O { offset,.. } = next_value { 186 | try!(incr_rc(rng, txn, offset)) 187 | } 188 | } 189 | 190 | // Check the need for compaction of the right sibling. 191 | let needs_compaction = { 192 | let extra_size = child_page.occupied() - deleted_size - 24 + next_record_size; 193 | let off = right_sibling.can_alloc(extra_size); 194 | off+extra_size > PAGE_SIZE as u16 195 | }; 196 | let merged_right_sibling = { 197 | let levels = [0;N_LEVELS]; 198 | let mut new_levels = [0;N_LEVELS]; 199 | let right_sibling_rc = get_rc(txn, right_sibling.page_offset()); 200 | if right_sibling_rc > 1 { 201 | // We're not going to reference it anymore, since we need to copy it.
202 | try!(decr_rc(rng, txn, right_sibling.page_offset())) 203 | } 204 | let mut right_sibling = 205 | if page_will_be_dup || right_sibling_rc > 1 { 206 | // If another page is pointing to the right sibling, or will be (needs_dup), copy. 207 | try!(copy_page(rng, txn, &right_sibling.as_page(), &levels, &mut new_levels, false, false, 0, true)) 208 | } else { 209 | // Else, just CoW. 210 | try!(cow_pinpointing(rng, txn, 211 | if needs_compaction { right_sibling.as_nonmut() } else { right_sibling }, 212 | &levels, 213 | &mut new_levels, false, false, 0)) 214 | }; 215 | try!(merge_right(rng, txn, &child_page, &mut right_sibling, forgetting, merged, next_key, 216 | next_value, page_will_be_dup || child_will_be_dup)); 217 | right_sibling 218 | }; 219 | 220 | debug!("page_will_be_dup: {:?} {:?}", child_page.page_offset(), page_will_be_dup); 221 | if !page_will_be_dup { 222 | // If the page is not duplicated, we lose one reference to 223 | // the child. The right sibling is unchanged, though (or 224 | // was already duplicated). 225 | try!(free(rng, txn, child_page.page_offset())) 226 | } 227 | // Now, delete (next_key, next_value) from the current page. 228 | if page.occupied() - next_record_size < (PAGE_SIZE as u16)/2 { 229 | 230 | // let page_rc = get_rc(txn, page.page_offset()); 231 | Ok(Res::Underfull { page:page, delete:levels, merged:merged_right_sibling.page_offset(), 232 | must_be_dup: page_will_be_dup }) 233 | 234 | } else { 235 | let mut new_levels = [0;N_LEVELS]; 236 | let page = 237 | if page_will_be_dup { 238 | // If there are, or will be, several pointers to the current page, copy it. 239 | try!(copy_page(rng, txn, &page.as_page(), &levels, &mut new_levels, true, false, 240 | merged_right_sibling.page_offset(), true)) 241 | } else { 242 | try!(cow_pinpointing(rng, txn, page, &levels, 243 | &mut new_levels, true, false, 244 | merged_right_sibling.page_offset())) 245 | }; 246 | Ok(Res::Ok { page:page }) 247 | } 248 | } else { 249 | debug!("giving up merge"); 250 | Ok(Res::Nothing { page:page }) 251 | } 252 | } 253 | 254 | 255 | /// Assuming `child_page` is the right child of the *next* binding, merge it into its left sibling. 256 | pub fn merge_children_left<R:Rng,T>( 257 | rng:&mut R, txn:&mut MutTxn<T>, page:Cow, levels:[u16;N_LEVELS], 258 | child_page:&Cow, child_will_be_dup:bool, 259 | delete:&[u16], merged:u64, 260 | page_will_be_dup:bool) -> Result<Res,Error> { 261 | 262 | debug!("merge_children_left {:?}", page_will_be_dup); 263 | // Load the left sibling and compute its size. 264 | let left_sibling = { 265 | let current_ptr = page.offset(levels[0] as isize); 266 | txn.load_cow_page(unsafe { u64::from_le(*(current_ptr as *const u64).offset(2)) }) 267 | }; 268 | debug_assert!(child_page.page_offset() != left_sibling.page_offset()); 269 | debug!("left_sibling = {:?}", left_sibling); 270 | let left_sibling_size = left_sibling.occupied(); 271 | 272 | // Find the separator and compute its size 273 | let next_offset = u16::from_le(unsafe { *(page.offset(levels[0] as isize) as *const u16) }); 274 | let next_ptr = page.offset(next_offset as isize); 275 | let (next_key, next_value) = unsafe { read_key_value(next_ptr) }; 276 | let next_record_size = record_size(next_key.len(), next_value.len() as usize); 277 | 278 | // Compute the size of the element deleted in `child_page`.
279 | let forgetting = u16::from_le(unsafe { *(child_page.offset(delete[0] as isize) as *const u16) }); 280 | let deleted_size = { 281 | let ptr = child_page.offset(forgetting as isize); 282 | let (key,value) = unsafe { read_key_value(ptr) }; 283 | debug!("delete key: {:?}", std::str::from_utf8(key)); 284 | record_size(key.len(), value.len() as usize) 285 | }; 286 | debug!("child_page_occupied {:?} {:?}", child_page.occupied(), deleted_size); 287 | // If there's enough space in the left sibling, merge. Else, return Res::Nothing { .. }. 288 | if left_sibling_size + child_page.occupied() - deleted_size - 24 + next_record_size <= PAGE_SIZE as u16 { 289 | if page_will_be_dup { 290 | if let UnsafeValue::O { offset,.. } = next_value { 291 | try!(incr_rc(rng, txn, offset)) 292 | } 293 | } 294 | 295 | // Check the need for compaction of the right sibling. 296 | let needs_compaction = { 297 | let extra_size = child_page.occupied() - deleted_size - 24 + next_record_size; 298 | let off = left_sibling.can_alloc(extra_size); 299 | off+extra_size > PAGE_SIZE as u16 300 | }; 301 | 302 | let left_sibling_rc = get_rc(txn, left_sibling.page_offset()); 303 | if left_sibling_rc > 1 { 304 | // We're not going to reference it anymore, since we need to copy it. 305 | try!(decr_rc(rng, txn, left_sibling.page_offset())) 306 | } 307 | let merged_left_sibling = { 308 | let levels = [0;N_LEVELS]; 309 | let mut new_levels = [0;N_LEVELS]; 310 | let mut left_sibling = 311 | if page_will_be_dup || left_sibling_rc > 1 { 312 | try!(copy_page(rng, txn, &left_sibling.as_page(), &levels, &mut new_levels, false, false, 0, true)) 313 | } else { 314 | try!(cow_pinpointing(rng, txn, 315 | if needs_compaction { left_sibling.as_nonmut() } else { left_sibling }, 316 | &levels, 317 | &mut new_levels, false, false, 0)) 318 | }; 319 | try!(merge_left(rng, txn, &child_page, &mut left_sibling, forgetting, merged, next_key, next_value, 320 | page_will_be_dup || child_will_be_dup)); 321 | left_sibling 322 | }; 323 | debug!("page_will_be_dup: {:?} {:?}", child_page.page_offset(), page_will_be_dup); 324 | if !page_will_be_dup { 325 | // If the page is not duplicated, we lose one reference to 326 | // the child. The right sibling is unchanged, though (or 327 | // was already duplicated). 328 | try!(free(rng, txn, child_page.page_offset())) 329 | } 330 | 331 | // Now, delete (next_key, next_value) from the current page. 332 | if page.occupied() - next_record_size < (PAGE_SIZE as u16)/2 { 333 | //let page_rc = get_rc(txn, page.page_offset()); 334 | Ok(Res::Underfull { page:page, delete:levels, merged:merged_left_sibling.page_offset(), 335 | must_be_dup: page_will_be_dup }) 336 | 337 | } else { 338 | let mut new_levels = [0;N_LEVELS]; 339 | let page = 340 | if page_will_be_dup { 341 | // If there are, or will be, several pointers to the current page, copy. 
342 | try!(copy_page(rng, txn, &page.as_page(), &levels, &mut new_levels, true, false, 343 | merged_left_sibling.page_offset(), true)) 344 | } else { 345 | try!(cow_pinpointing(rng, txn, page, &levels, 346 | &mut new_levels, true, false, 347 | merged_left_sibling.page_offset())) 348 | }; 349 | Ok(Res::Ok { page:page }) 350 | } 351 | } else { 352 | debug!("giving up merge"); 353 | Ok(Res::Nothing { page:page }) 354 | } 355 | } 356 | 357 | 358 | 359 | 360 | 361 | 362 | 363 | 364 | // Assuming we've just deleted an internal node (and thus `levels` is 365 | // set to the element just before the deleted node), merge 366 | // `child_page` into its left sibling if possible, and return `Res::Nothing{..}` else. 367 | pub fn merge_children_replace<R:Rng,T>( 368 | rng:&mut R, txn:&mut MutTxn<T>, page:Cow, levels:[u16;N_LEVELS], 369 | child_page:&Cow, child_will_be_dup:bool, 370 | replacement:&Smallest, 371 | delete:&[u16], merged:u64, 372 | page_will_be_dup:bool) -> Result<Res,Error> { 373 | 374 | debug!("merge_children_replace"); 375 | // Compute the sizes of (1) the left sibling, (2) the deleted 376 | // element on `child_page`, (3) the size of the separator and (4) 377 | // the size of `child_page`. 378 | 379 | let left_ptr = page.offset(levels[0] as isize); 380 | let left_sibling = txn.load_cow_page(u64::from_le(unsafe { *(left_ptr as *const u64).offset(2) })); 381 | // (1) 382 | let left_sibling_size = left_sibling.occupied(); 383 | 384 | // (2) 385 | let forgetting = u16::from_le(unsafe { *(child_page.offset(delete[0] as isize) as *const u16) }); 386 | let deleted_size = { 387 | let ptr = child_page.offset(forgetting as isize); 388 | let (key,value) = unsafe { read_key_value(ptr) }; 389 | debug!("delete key: {:?}", std::str::from_utf8(key)); 390 | record_size(key.len(), value.len() as usize) 391 | }; 392 | // (3) 393 | let (next_key, next_value) = { 394 | let key = unsafe { std::slice::from_raw_parts(replacement.key_ptr, replacement.key_len) }; 395 | (key, replacement.value) 396 | }; 397 | debug!("replacement = {:?}",std::str::from_utf8(next_key)); 398 | let next_record_size = record_size(next_key.len(), next_value.len() as usize); 399 | // (4) 400 | let child_page_size = child_page.occupied(); 401 | 402 | debug!("child_page_occupied {:?} {:?}", child_page.occupied(), deleted_size); 403 | // If we can merge, do it. Else, return Res::Nothing { .. }. 404 | if left_sibling_size + child_page_size - 24 + next_record_size - deleted_size <= PAGE_SIZE as u16 { 405 | 406 | // Already increased when we deleted it from the smallest descendant page. 407 | /*if page_will_be_dup || child_will_be_dup { 408 | if let UnsafeValue::O { offset, .. } = next_value { 409 | try!(incr_rc(rng, txn, offset)) 410 | } 411 | }*/ 412 | // Check the need for compaction of the right sibling. 413 | let needs_compaction = { 414 | let extra_size = child_page.occupied() - deleted_size - 24 + next_record_size; 415 | let off = left_sibling.can_alloc(extra_size); 416 | off+extra_size > PAGE_SIZE as u16 417 | }; 418 | let left_sibling_rc = get_rc(txn, left_sibling.page_offset()); 419 | if left_sibling_rc > 1 { 420 | // We're not going to reference it anymore, since we need to copy it.
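// (Why the decrement just below: a shared left sibling may not be
// mutated in place, so a physical copy of it is built; this page then
// stops referencing the old physical sibling, and decr_rc records that.)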
421 | try!(decr_rc(rng, txn, left_sibling.page_offset())) 422 | } 423 | let merged_left_sibling = { 424 | let levels = [0;N_LEVELS]; 425 | let mut new_levels = [0;N_LEVELS]; 426 | let mut left_sibling = 427 | if page_will_be_dup || left_sibling_rc > 1 { 428 | try!(copy_page(rng, txn, 429 | &left_sibling.as_page(), 430 | &levels, &mut new_levels, false, false, 0, true)) 431 | } else { 432 | try!(cow_pinpointing(rng, txn, 433 | if needs_compaction { left_sibling.as_nonmut() } else { left_sibling }, 434 | &levels, &mut new_levels, false, false, 0)) 435 | }; 436 | try!(merge_left(rng, txn, &child_page, &mut left_sibling, forgetting, merged, next_key, next_value, 437 | page_will_be_dup || child_will_be_dup)); 438 | left_sibling 439 | }; 440 | // Now, delete (next_key, next_value) from the current page. 441 | let result = if page.occupied() - next_record_size < (PAGE_SIZE as u16)/2 { 442 | // If this makes the current page underfull. 443 | // let page_rc = get_rc(txn, page.page_offset()); 444 | debug!("underfull"); 445 | Ok(Res::Underfull { page:page, delete:levels, merged:merged_left_sibling.page_offset(), 446 | must_be_dup: page_will_be_dup }) 447 | } else { 448 | // Else, just delete. 449 | debug!("not underfull"); 450 | let mut new_levels = [0;N_LEVELS]; 451 | let page = 452 | if page_will_be_dup { 453 | try!(copy_page(rng, txn, 454 | &page.as_page(), 455 | &levels, &mut new_levels, true, false, 456 | merged_left_sibling.page_offset(), true)) 457 | } else { 458 | try!(cow_pinpointing(rng, txn, page, &levels, 459 | &mut new_levels, true, false, 460 | merged_left_sibling.page_offset())) 461 | }; 462 | Ok(Res::Ok { page:page }) 463 | }; 464 | if !page_will_be_dup { 465 | try!(free(rng, txn, child_page.page_offset())); 466 | } 467 | result 468 | } else { 469 | debug!("giving up merge"); 470 | Ok(Res::Nothing { page:page }) 471 | } 472 | } 473 | 474 | 475 | -------------------------------------------------------------------------------- /src/transaction.rs: -------------------------------------------------------------------------------- 1 | // This Source Code Form is subject to the terms of the Mozilla Public 2 | // License, v. 2.0. If a copy of the MPL was not distributed with this 3 | // file, You can obtain one at http://mozilla.org/MPL/2.0/. 4 | 5 | 6 | // TODO: 7 | // - get rid of initial length, grow file as needed. In other words, write lock + unmap + set_len + mmap. 8 | 9 | // X 32 bits mmap64 -> delegated to memmap crate. 10 | // X Windows -> delegated to memmap crate. 11 | // X SPARC (8kB pages) -> Allocate two consecutive pages instead of one. The BTree won't see the difference anyway. 12 | // X 32 bits compatibility. mmap has 64 bits offsets. 13 | // X process and thread mutex for mutable transactions. 14 | // X multiple consecutive pages (done with glue_pages) 15 | // X PAGE_SIZE is now a constant, check modulos/divisions to make that constant too. 16 | // X merge last page : done for just the last page, but could probably be improved. 17 | // X count allocated pages (debug/test). 18 | // X test page size in build.rs 19 | // X documentation 20 | 21 | // Types guarantee: the only pages we write are the ones we allocate. 22 | 23 | // LMDB takes care of zombie readers, at the cost of checking a file of size linear in the number of PIDs at the beginning of every transaction. Also, doesn't work on USB sticks. More details: mdb.c, line 2606: PID locks. 
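// A minimal usage sketch of this module's API (a sketch under assumptions:
// the directory path and map size are illustrative, the directory is
// assumed to exist, and `transaction` is assumed to be reachable from the
// crate root):
//
//     use sanakirja::transaction::{Env, Commit};
//
//     let env = Env::new("/tmp/env_dir", 1 << 20).unwrap(); // 1 MiB map
//     let mut txn = env.mut_txn_begin().unwrap();
//     let page = txn.alloc_page().unwrap();   // a page this txn may write
//     txn.set_root(0, page.offset);           // stage a root update
//     txn.commit().unwrap();                  // becomes durable atomically
//
//     let ro = env.txn_begin().unwrap();      // read-only transaction
//     assert_eq!(ro.root(0), page.offset);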
24 | 25 | use std; 26 | use std::sync::{RwLock, RwLockReadGuard, Mutex, MutexGuard}; 27 | use std::ptr::copy_nonoverlapping; 28 | use std::collections::{HashSet,HashMap}; 29 | use fs2::FileExt; 30 | use std::fs::{File,OpenOptions}; 31 | use std::path::Path; 32 | use memmap; 33 | 34 | pub const CURRENT_VERSION: u64 = 0; 35 | 36 | const OFF_MAP_LENGTH:isize = 1; 37 | const OFF_CURRENT_FREE:isize = 2; 38 | // We need a fixed page size for compatibility reasons. Most systems will have half of this, but some (SPARC) don't... 39 | pub const PAGE_SIZE: usize = 4096; 40 | pub const PAGE_SIZE_16: u16 = 4096; 41 | pub const PAGE_SIZE_64: u64 = 4096; 42 | 43 | pub const ZERO_HEADER: isize = 24; // size of the header on page 0, in bytes. 44 | #[derive(Debug)] 45 | pub enum Error { 46 | IO(std::io::Error), 47 | NotEnoughSpace, 48 | Poison 49 | } 50 | 51 | impl std::fmt::Display for Error { 52 | fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { 53 | match *self { 54 | Error::IO(ref err) => write!(f, "IO error: {}", err), 55 | Error::NotEnoughSpace => write!(f, "Not enough space. Try opening the environment with a larger size."), 56 | Error::Poison => write!(f, "Poison error"), 57 | } 58 | } 59 | } 60 | 61 | impl std::error::Error for Error { 62 | fn description(&self) -> &str { 63 | match *self { 64 | Error::IO(ref err) => err.description(), 65 | Error::NotEnoughSpace => "Not enough space. Try opening the environment with a larger size.", 66 | Error::Poison => "Poison error" 67 | } 68 | } 69 | fn cause(&self) -> Option<&std::error::Error> { 70 | match *self { 71 | Error::IO(ref err) => Some(err), 72 | Error::NotEnoughSpace => None, 73 | Error::Poison => None 74 | 75 | } 76 | } 77 | } 78 | impl From<std::io::Error> for Error { 79 | fn from(e: std::io::Error) -> Error { 80 | Error::IO(e) 81 | } 82 | } 83 | 84 | impl<T> From<std::sync::PoisonError<T>> for Error { 85 | fn from(_: std::sync::PoisonError<T>) -> Error { 86 | Error::Poison 87 | } 88 | } 89 | 90 | // Lock order: first take thread locks, then process locks. 91 | 92 | // Why are there two synchronization mechanisms? 93 | // Because we would need to upgrade the read lock into a write lock, and rust has no way to do this. 94 | // So, we take a mutex to make sure no other mutable transaction can start, 95 | // and then at the time of writing, we also take the RwLock. 96 | 97 | /// Environment, required to start any transactions. Thread-safe, but opening the same database several times in the same process is not cross-platform. 98 | pub struct Env { 99 | pub length: u64, 100 | lock_file: File, 101 | mutable_file: File, 102 | mmap: memmap::Mmap, 103 | map: *mut u8, 104 | lock: RwLock<()>, // Ensure all reads are done when sync starts. 105 | mutable: Mutex<()>, // Ensure only one mutable transaction can be started. 106 | } 107 | unsafe impl Send for Env {} 108 | unsafe impl Sync for Env {} 109 | 110 | pub struct Txn<'env> { 111 | pub env: &'env Env, 112 | guard: RwLockReadGuard<'env, ()>, 113 | } 114 | 115 | pub struct MutTxn<'env,T> { 116 | pub env: &'env Env, 117 | mutable: Option<MutexGuard<'env, ()>>, 118 | parent:T, 119 | last_page: u64, 120 | current_list_page: Page, // current page storing the list of free pages. 121 | current_list_length: u64, // length of the current page of free pages. 122 | current_list_position: u64, // position in the current page of free pages. 123 | occupied_clean_pages: HashSet<u64>, /* Offsets of pages that were allocated by this transaction, and have not been freed since.
*/ 124 | free_clean_pages: Vec<u64>, /* Offsets of pages that were allocated by this transaction, and then freed. */ 125 | free_pages: Vec<u64>, /* Offsets of old pages freed by this transaction. These were *not* allocated by this transaction. */ 126 | pub roots:HashMap<isize,u64>, 127 | } 128 | 129 | impl<'env> Drop for Txn<'env> { 130 | fn drop(&mut self) { 131 | self.env.lock_file.unlock().unwrap(); 132 | *self.guard; 133 | } 134 | } 135 | impl<'env,T> Drop for MutTxn<'env,T> { 136 | fn drop(&mut self) { 137 | debug!("dropping transaction"); 138 | self.env.mutable_file.unlock().unwrap(); 139 | if let Some(ref mut guard) = self.mutable { 140 | debug!("dropping guard"); 141 | **guard 142 | } 143 | } 144 | } 145 | 146 | 147 | #[derive(Debug)] 148 | pub struct Statistics { 149 | pub free_pages: HashSet<u64>, 150 | pub bookkeeping_pages: Vec<u64>, 151 | pub total_pages: u64, 152 | pub reference_counts: HashMap<u64,u64> 153 | } 154 | 155 | 156 | impl Env { 157 | /// Initialize the environment. `length` is the size of the memory map, in bytes; it must be at least PAGE_SIZE. 158 | pub fn new<P:AsRef<Path>>(path: P, length: u64) -> Result<Env,Error> { 159 | //let length = (1 as u64).shl(log_length); 160 | let db_path = path.as_ref().join("db"); 161 | let db_exists = std::fs::metadata(&db_path).is_ok(); 162 | let file = try!( 163 | OpenOptions::new() 164 | .read(true) 165 | .write(true) 166 | .truncate(false) 167 | .create(true) 168 | .open(db_path) 169 | ); 170 | try!(file.set_len(length)); 171 | let mut mmap = try!(memmap::Mmap::open(&file, memmap::Protection::ReadWrite)); 172 | let lock_file = try!(File::create(path.as_ref() 173 | .join("db") 174 | .with_extension("lock"))); 175 | let mutable_file = try!(File::create(path.as_ref() 176 | .join("db") 177 | .with_extension("mut"))); 178 | let map = mmap.mut_ptr(); 179 | if !db_exists { 180 | unsafe { 181 | std::ptr::write_bytes(map, 0, PAGE_SIZE); 182 | *(map as *mut u64) = CURRENT_VERSION.to_le(); 183 | } 184 | } else { 185 | assert!(unsafe { u64::from_le(*(map as *const u64)) == CURRENT_VERSION }) 186 | } 187 | let env = Env { 188 | length: length, 189 | mmap: mmap, 190 | map: map, 191 | lock_file: lock_file, 192 | mutable_file: mutable_file, 193 | lock: RwLock::new(()), 194 | mutable: Mutex::new(()), 195 | }; 196 | Ok(env) 197 | } 198 | /// Start a read-only transaction. 199 | pub fn txn_begin<'env>(&'env self) -> Result<Txn<'env>,Error> { 200 | let read = try!(self.lock.read()); 201 | try!(self.lock_file.lock_shared()); 202 | Ok(Txn { 203 | env: self, 204 | guard: read, 205 | }) 206 | } 207 | 208 | /// Start a mutable transaction. Mutable transactions that go out of scope are automatically aborted.
209 | pub fn mut_txn_begin<'env>(&'env self) -> Result<MutTxn<'env,()>, Error> { 210 | unsafe { 211 | let last_page = u64::from_le(*((self.map as *const u64).offset(OFF_MAP_LENGTH))); 212 | let current_list_page = u64::from_le(*((self.map as *const u64).offset(OFF_CURRENT_FREE))); 213 | 214 | debug!("map header = {:?}, {:?}", last_page ,current_list_page); 215 | let guard = try!(self.mutable.lock()); 216 | debug!("taking file lock"); 217 | try!(self.mutable_file.lock_exclusive()); 218 | debug!("lock ok"); 219 | let current_list_page = Page { 220 | data: self.map.offset(current_list_page as isize), 221 | offset: current_list_page, 222 | }; 223 | let current_list_length = if current_list_page.offset == 0 { 224 | 0 225 | } else { 226 | u64::from_le(*((current_list_page.data as *const u64).offset(1))) 227 | }; 228 | Ok(MutTxn { 229 | env: self, 230 | mutable: Some(guard), 231 | parent:(), 232 | last_page: if last_page == 0 { 233 | PAGE_SIZE as u64 234 | } else { 235 | last_page 236 | }, 237 | current_list_page: current_list_page, 238 | current_list_length: current_list_length, 239 | current_list_position: current_list_length, /* position of the word immediately after the top. */ 240 | occupied_clean_pages: HashSet::new(), 241 | free_clean_pages: Vec::new(), 242 | free_pages: Vec::new(), 243 | roots: HashMap::new(), 244 | }) 245 | } 246 | } 247 | 248 | /// Compute statistics about pages. This is a potentially costly operation, as we need to go through all bookkeeping pages. 249 | pub fn statistics(&self) -> Statistics { 250 | unsafe { 251 | let total_pages = u64::from_le(*((self.map as *const u64).offset(OFF_MAP_LENGTH))) as usize; 252 | let mut free_pages = HashSet::new(); 253 | let mut bookkeeping_pages = Vec::new(); 254 | let mut cur = u64::from_le(*((self.map as *const u64).offset(OFF_CURRENT_FREE))); 255 | while cur != 0 { 256 | bookkeeping_pages.push(cur); 257 | let p = self.map.offset(cur as isize) as *const u64; 258 | let prev = u64::from_le(*p); 259 | let len = u64::from_le(*(p.offset(1))); // size (number of u64). 260 | debug!("bookkeeping page: {:?}, {} {}", cur, prev, len); 261 | { 262 | let mut p: *const u64 = (p as *const u64).offset(2); 263 | let mut i = 0; 264 | while i < len { 265 | let free_page = u64::from_le(*p); 266 | if !free_pages.insert(free_page) { 267 | panic!("free page counted twice: {:?}",free_page) 268 | } 269 | p = p.offset(1); 270 | i += 1 271 | } 272 | } 273 | cur = prev 274 | } 275 | let refcounts = HashMap::new(); 276 | Statistics { 277 | total_pages: (total_pages / PAGE_SIZE) as u64, 278 | free_pages: free_pages, 279 | bookkeeping_pages: bookkeeping_pages, 280 | reference_counts: refcounts 281 | } 282 | } 283 | } 284 | } 285 | 286 | /// This is a semi-owned page: just as we can mutate several indices of an array in the same scope, we must be able to get several pages from a single environment in the same scope. However, pages don't outlive their environment. Pages longer than one PAGE_SIZE might trigger calls to munmap when they go out of scope.
287 | #[derive(Debug)] 288 | pub struct Page { 289 | pub data: *const u8, 290 | pub offset: u64, 291 | } 292 | #[derive(Debug)] 293 | pub struct MutPage { 294 | pub data: *mut u8, 295 | pub offset: u64, 296 | } 297 | 298 | impl MutPage { 299 | pub fn as_page(&self) -> Page { 300 | Page { data:self.data, offset: self.offset } 301 | } 302 | } 303 | 304 | pub unsafe fn free<T>(txn: &mut MutTxn<T>, offset: u64) { 305 | debug!("transaction::free page: {:?}", offset); 306 | if txn.occupied_clean_pages.remove(&offset) { 307 | txn.free_clean_pages.push(offset); 308 | } else { 309 | // Else, register it for freeing (we cannot reuse it in this transaction). 310 | txn.free_pages.push(offset) 311 | } 312 | } 313 | 314 | impl<'env> Txn<'env> { 315 | /// Find the appropriate map segment 316 | pub fn load_page(&self, off: u64) -> Page { 317 | debug!("load_page: off={:?}, length = {:?}", off, self.env.length); 318 | assert!(off < self.env.length); 319 | unsafe { 320 | Page { 321 | data: self.env.map.offset(off as isize), 322 | offset: off, 323 | } 324 | } 325 | } 326 | pub fn root(&self,num:isize) -> u64 { 327 | assert!(ZERO_HEADER + ((num+1)<<3) < (PAGE_SIZE as isize)); 328 | unsafe { 329 | u64::from_le(*((self.env.map.offset(ZERO_HEADER) as *const u64).offset(num))) 330 | } 331 | } 332 | } 333 | 334 | #[derive(Debug)] 335 | pub enum Cow { 336 | Page(Page), 337 | MutPage(MutPage), 338 | } 339 | 340 | impl<'env,T> MutTxn<'env,T> { 341 | pub fn mut_txn_begin<'txn>(&'txn mut self) -> Result<MutTxn<'env,&'txn mut MutTxn<'env,T>>, Error> { 342 | unsafe { 343 | let mut txn = MutTxn { 344 | env: self.env, 345 | mutable: None, 346 | parent: std::mem::uninitialized(), 347 | last_page: self.last_page, 348 | current_list_page: Page { data:self.current_list_page.data, 349 | offset: self.current_list_page.offset }, 350 | current_list_length: self.current_list_length, 351 | current_list_position: self.current_list_position, 352 | occupied_clean_pages: HashSet::new(), 353 | free_clean_pages: Vec::new(), 354 | free_pages: Vec::new(), 355 | roots:self.roots.clone(), 356 | //reference_counts:self.reference_counts 357 | }; 358 | txn.parent = self; 359 | Ok(txn) 360 | } 361 | } 362 | pub fn load_page(&self, off: u64) -> Page { 363 | if off >= self.env.length { 364 | panic!("{:?} >= {:?}", off,self.env.length) 365 | } 366 | unsafe { 367 | Page { 368 | data: self.env.map.offset(off as isize), 369 | offset: off, 370 | } 371 | } 372 | } 373 | pub fn root(&self, num:isize) -> u64 { 374 | if let Some(root) = self.roots.get(&num) { 375 | *root 376 | } else { 377 | assert!(ZERO_HEADER + ((num+1)<<3) < (PAGE_SIZE as isize)); 378 | unsafe { 379 | u64::from_le(*((self.env.map.offset(ZERO_HEADER) as *const u64).offset(num as isize))) 380 | } 381 | } 382 | } 383 | pub fn set_root(&mut self, num:isize, value:u64) { 384 | self.roots.insert(num,value); 385 | } 386 | pub fn load_cow_page(&mut self, off: u64) -> Cow { 387 | debug!("transaction::load_mut_page: {:?} {:?}", 388 | off, 389 | self.occupied_clean_pages); 390 | assert!(off < self.env.length); 391 | if off != 0 && self.occupied_clean_pages.contains(&off) { 392 | unsafe { 393 | Cow::MutPage(MutPage { 394 | data: self.env.map.offset(off as isize), 395 | offset: off, 396 | }) 397 | } 398 | } else { 399 | unsafe { 400 | let d = self.env.map.offset(off as isize); 401 | Cow::Page(Page { 402 | data: d, 403 | offset: off, 404 | }) 405 | } 406 | } 407 | } 408 | 409 | /// Pop a free page from the list of free pages.
410 | fn free_pages_pop(&mut self) -> Option<u64> { 411 | debug!("free_pages_pop, current_list_position:{}", 412 | self.current_list_position); 413 | if self.current_list_page.offset == 0 { 414 | None 415 | } else { 416 | if self.current_list_position == 0 { 417 | let previous_page = unsafe { u64::from_le(*(self.current_list_page.data as *const u64)) }; 418 | debug!("free_pages_pop, previous page:{}", previous_page); 419 | if previous_page == 0 { 420 | None 421 | } else { 422 | // free page (i.e. push to the list of old 423 | // free pages), move to previous bookkeeping 424 | // pages, and call recursively. 425 | self.free_pages.push(self.current_list_page.offset); 426 | unsafe { 427 | self.current_list_page = Page { 428 | data: self.env.map.offset(previous_page as isize), 429 | offset: previous_page, 430 | }; 431 | self.current_list_length = u64::from_le(*((self.current_list_page.data as *const u64).offset(1))) 432 | } 433 | self.current_list_position = self.current_list_length; 434 | self.free_pages_pop() 435 | } 436 | } else { 437 | let pos = self.current_list_position; 438 | // find the page at the top. 439 | self.current_list_position -= 1; 440 | debug!("free_pages_pop, new position:{}", self.current_list_position); 441 | unsafe { 442 | Some(u64::from_le(*((self.current_list_page.data as *mut u64).offset(1 + pos as isize)))) 443 | } 444 | } 445 | } 446 | } 447 | /// Allocate a single page. 448 | pub fn alloc_page(&mut self) -> Result<MutPage,Error> { 449 | debug!("alloc page"); 450 | // If we have allocated and freed a page in this transaction, use it first. 451 | if let Some(page) = self.free_clean_pages.pop() { 452 | debug!("clean page reuse:{}", page); 453 | self.occupied_clean_pages.insert(page); 454 | Ok(MutPage { 455 | data: unsafe { self.env.map.offset(page as isize) }, 456 | offset: page, 457 | }) 458 | } else { 459 | // Else, if there are free pages, take one. 460 | if let Some(page) = self.free_pages_pop() { 461 | debug!("using an old free page: {}", page); 462 | self.occupied_clean_pages.insert(page); 463 | Ok(MutPage { 464 | data: unsafe { self.env.map.offset(page as isize) }, 465 | offset: page, 466 | }) 467 | } else { 468 | // Else, allocate in the free space. 469 | let last = self.last_page; 470 | debug!("eating the free space: {}", last); 471 | if self.last_page + PAGE_SIZE_64 < self.env.length { 472 | self.last_page += PAGE_SIZE_64; 473 | self.occupied_clean_pages.insert(last); 474 | Ok(MutPage { 475 | data: unsafe { self.env.map.offset(last as isize) }, 476 | offset: last, 477 | }) 478 | } else { 479 | Err(Error::NotEnoughSpace) 480 | } 481 | } 482 | } 483 | } 484 | } 485 | 486 | pub trait Commit { 487 | fn commit(&mut self)->Result<(),Error>; 488 | } 489 | 490 | impl<'a,'env,T> Commit for MutTxn<'env,&'a mut MutTxn<'env,T>> { 491 | fn commit(&mut self)->Result<(),Error> { 492 | 493 | self.parent.last_page = self.last_page; 494 | self.parent.current_list_page = Page { offset:self.current_list_page.offset, 495 | data:self.current_list_page.data }; 496 | self.parent.current_list_length = self.current_list_length; 497 | self.parent.current_list_position = self.current_list_position; 498 | self.parent.occupied_clean_pages.extend(self.occupied_clean_pages.iter()); 499 | self.parent.free_clean_pages.extend(self.free_clean_pages.iter()); 500 | self.parent.free_pages.extend(self.free_pages.iter()); 501 | for (u,v) in self.roots.iter() { 502 | self.parent.roots.insert(*u,*v); 503 | } 504 | Ok(()) 505 | } 506 | } 507 | 508 | impl<'env> Commit for MutTxn<'env,()> { 509 | /// Commit a transaction.
This is guaranteed to be atomic: either the commit succeeds, and all the changes made during the transaction are written to disk. Or the commit doesn't succeed, and we're back to the state just before starting the transaction. 510 | fn commit(&mut self) -> Result<(), Error> { 511 | // Tasks: 512 | // - allocate new pages (copy-on-write) to write the new list of free pages, including edited "stack pages". 513 | // 514 | // - write top of the stack 515 | // - write user data 516 | // 517 | // everything can be sync'ed at any time, except that the first page needs to be sync'ed last. 518 | unsafe { 519 | // Copy the current bookkeeping page to a newly allocated page. 520 | let mut current_page = try!(self.alloc_page()); 521 | if self.current_list_page.offset != 0 { 522 | // If there was at least one bookkeeping page before. 523 | debug!("commit: realloc BK, copy {:?}", self.current_list_position); 524 | copy_nonoverlapping(self.current_list_page.data as *const u64, 525 | current_page.data as *mut u64, 526 | 2 + self.current_list_position as usize); 527 | *((current_page.data as *mut u64).offset(1)) = self.current_list_position.to_le(); 528 | 529 | // and free the previous current bookkeeping page. 530 | debug!("freeing BK page {:?}", self.current_list_page.offset); 531 | self.free_pages.push(self.current_list_page.offset); 532 | 533 | } else { 534 | // Else, init the page. 535 | *(current_page.data as *mut u64) = 0; // previous page: none 536 | *((current_page.data as *mut u64).offset(1)) = 0; // len: 0 537 | } 538 | 539 | while !(self.free_pages.is_empty() && self.free_clean_pages.is_empty()) { 540 | debug!("commit: pushing"); 541 | // If page is full, or this is the first page, allocate new page. 542 | let len = u64::from_le(*((current_page.data as *const u64).offset(1))); 543 | debug!("len={:?}", len); 544 | if 16 + len * 8 + 8 >= PAGE_SIZE as u64 { 545 | debug!("commit: current is full, len={}", len); 546 | // 8 more bytes wouldn't fit in this page, time to allocate a new one 547 | 548 | let p = self.free_pages 549 | .pop() 550 | .unwrap_or_else(|| self.free_clean_pages.pop().unwrap()); 551 | 552 | let new_page = 553 | MutPage { 554 | data: self.env.map.offset(p as isize), 555 | offset: p, 556 | }; 557 | 558 | debug!("commit {} allocated {:?}", line!(), new_page.offset); 559 | // Write a reference to the current page (which cannot be null). 560 | *(new_page.data as *mut u64) = current_page.offset.to_le(); 561 | // Write the length of the new page (0). 562 | *((new_page.data as *mut u64).offset(1)) = 0; 563 | 564 | current_page = new_page; 565 | } else { 566 | // push 567 | let p = self.free_pages 568 | .pop() 569 | .unwrap_or_else(|| self.free_clean_pages.pop().unwrap()); 570 | debug!("commit: push {}", p); 571 | 572 | *((current_page.data as *mut u64).offset(1)) = (len + 1).to_le(); // increase length. 573 | *((current_page.data as *mut u64).offset(2 + len as isize)) = p.to_le(); // write pointer. 574 | } 575 | } 576 | // Take lock 577 | { 578 | debug!("commit: taking local lock"); 579 | *self.env.lock.write().unwrap(); 580 | debug!("commit: taking file lock"); 581 | self.env.lock_file.lock_exclusive().unwrap(); 582 | debug!("commit: lock ok"); 583 | for (u, v) in self.roots.iter() { 584 | *((self.env.map.offset(ZERO_HEADER) as *mut u64).offset(*u as isize)) = (*v).to_le(); 585 | } 586 | // synchronize all maps. Since PAGE_SIZE is not always 587 | // an actual page size, we flush the first two pages 588 | // last, instead of just the last one. 
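// (Why this order matters, in short: if the process crashes after the
// data flush just below but before the header flush at the end of this
// block, page 0 still points to the old tree and the old free list,
// both of which are fully on disk, so the transaction simply never
// happened.)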
589 | try!(self.env.mmap.flush_range(2*PAGE_SIZE, (self.env.length - 2*PAGE_SIZE_64) as usize)); 590 | 591 | *((self.env.map as *mut u64).offset(OFF_MAP_LENGTH)) = self.last_page.to_le(); 592 | *((self.env.map as *mut u64).offset(OFF_CURRENT_FREE)) = current_page.offset.to_le(); 593 | try!(self.env.mmap.flush_range(0, 2*PAGE_SIZE)); 594 | self.env.lock_file.unlock().unwrap(); 595 | Ok(()) 596 | } 597 | } 598 | } 599 | // Abort the transaction. This is actually a no-op, just as a machine crash aborts a transaction. Letting the transaction go out of scope would have the same effect. 600 | // pub fn abort(self){ 601 | // } 602 | } 603 | -------------------------------------------------------------------------------- /src/rebalance.rs: -------------------------------------------------------------------------------- 1 | use super::txn::*; 2 | use super::transaction::{PAGE_SIZE,Error}; 3 | use std; 4 | use rand::{Rng}; 5 | 6 | extern crate log; 7 | use super::put::*; 8 | 9 | use super::del::Smallest; 10 | 11 | /// child_page is the next element's right child. 12 | pub fn handle_failed_right_rebalancing(rng:&mut R, txn:&mut MutTxn, page:Cow, levels:[u16;N_LEVELS], 13 | replacement:Option<&Smallest>, 14 | child_page:Cow, 15 | child_must_be_dup:bool, 16 | delete:[u16;N_LEVELS], replace_page:u64, 17 | do_free_value:bool, page_will_be_dup:bool) -> Result { 18 | debug!("handle failed right rebalancing {:?} {:?}", page.page_offset(), child_page.page_offset()); 19 | // Actually delete and replace in the child. 20 | let child_page_offset = child_page.page_offset(); 21 | let new_child_page = { 22 | let mut new_delete = [0;N_LEVELS]; 23 | if page_will_be_dup || child_must_be_dup { 24 | try!(copy_page(rng, txn, &child_page.as_page(), &delete, &mut new_delete, true, do_free_value, 0, true)) 25 | } else { 26 | try!(cow_pinpointing(rng, txn, child_page, 27 | &delete, 28 | &mut new_delete, 29 | true, do_free_value, 30 | replace_page)) 31 | } 32 | }; 33 | debug!("new_child_page: {:?}", new_child_page.page_offset()); 34 | if child_must_be_dup && !page_will_be_dup { 35 | try!(decr_rc(rng, txn, child_page_offset)) 36 | } 37 | if let Some(repl) = replacement { 38 | let mut new_levels = [0;N_LEVELS]; 39 | // Delete the next element on this page. 40 | let mut page = 41 | if page_will_be_dup { 42 | try!(copy_page(rng, txn, &page.as_page(), &levels, &mut new_levels, true, true, 0, true)) 43 | } else { 44 | try!(cow_pinpointing(rng, txn, page, 45 | &levels, 46 | &mut new_levels, 47 | true, true, 48 | 0)) 49 | }; 50 | // Reinsert the replacement. 51 | let key = unsafe { std::slice::from_raw_parts(repl.key_ptr, repl.key_len) }; 52 | let size = record_size(key.len(), repl.value.len() as usize); 53 | let off = page.can_alloc(size); 54 | local_insert_at(rng, &mut page, key, repl.value, new_child_page.page_offset(), off, size, &mut new_levels); 55 | Ok(Res::Ok { page:page }) 56 | } else { 57 | let mut new_levels = [0;N_LEVELS]; 58 | let page = if page_will_be_dup { 59 | try!(copy_page(rng, txn, &page.as_page(), &levels, &mut new_levels, false, false, 0, true)) 60 | } else { 61 | try!(cow_pinpointing(rng, txn, page, 62 | &levels, 63 | &mut new_levels, 64 | false, false, 65 | 0)) 66 | }; 67 | let next = u16::from_le(unsafe { *(page.offset(new_levels[0] as isize) as *const u16) }); 68 | unsafe { *((page.offset(next as isize) as *mut u64).offset(2)) = new_child_page.page_offset().to_le() } 69 | Ok(Res::Ok { page:page }) 70 | } 71 | } 72 | 73 | /// child_page is the current element's right child. 
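// Added note on the `[u16; N_LEVELS]` cursor arrays (`levels`, `delete`)
// used throughout this module: each entry is the offset, at one level of
// the in-page skip list, of the last binding strictly smaller than the
// target (see `set_levels` in put.rs). "Deleting at `delete`" or
// "inserting at `levels`" therefore always acts on the successor of
// entry 0, never on the binding the cursor itself points at.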
74 | pub fn handle_failed_left_rebalancing(rng:&mut R, txn:&mut MutTxn, page:Cow, levels:[u16;N_LEVELS], 75 | child_page:Cow, 76 | child_must_be_dup:bool, 77 | delete:[u16;N_LEVELS], replace_page:u64, do_free_value:bool, 78 | page_will_be_dup:bool) -> Result { 79 | debug!("handle failed left rebalancing {:?} {:?} {:?}", page.page_offset(), child_page.page_offset(), page_will_be_dup); 80 | // Actually delete and replace in the child. 81 | let child_page_offset = child_page.page_offset(); 82 | let new_child_page = { 83 | let mut new_delete = [0;N_LEVELS]; 84 | if page_will_be_dup || child_must_be_dup { 85 | try!(copy_page(rng, txn, &child_page.as_page(), &delete, &mut new_delete, 86 | true, do_free_value, replace_page, true)) 87 | } else { 88 | try!(cow_pinpointing(rng, txn, child_page, 89 | &delete, 90 | &mut new_delete, 91 | true, do_free_value, 92 | replace_page)) 93 | } 94 | }; 95 | debug!("new_child_page: {:?}", new_child_page.page_offset()); 96 | let mut new_levels = [0;N_LEVELS]; 97 | let page = 98 | if page_will_be_dup { 99 | try!(copy_page(rng, txn, &page.as_page(), &levels, &mut new_levels, false, false, 100 | new_child_page.page_offset(), true)) 101 | } else { 102 | try!(cow_pinpointing(rng, txn, page, 103 | &levels[..], 104 | &mut new_levels[..], 105 | false, false, 106 | new_child_page.page_offset())) 107 | }; 108 | if child_must_be_dup && !page_will_be_dup { 109 | // If the child was copied, but its parent was not duplicated, one reference is lost. 110 | // In all other cases, all references are preserved. 111 | try!(decr_rc(rng, txn, child_page_offset)) 112 | } 113 | // We don't need to touch any reference counting here (they are 114 | // already handled in the calls to `copy_page` above). 115 | Ok(Res::Ok { page:page }) 116 | } 117 | 118 | 119 | /// Take elements from the current element's right child, and move 120 | /// them to the next element's right child, updating, and possibly 121 | /// replacing the separator with the provided replacement. 122 | /// 123 | /// Assumes the child page is the next element's right child. 124 | pub fn rebalance_right(rng:&mut R, txn:&mut MutTxn, page:Cow, mut levels:[u16;N_LEVELS], 125 | replacement:Option<&Smallest>, 126 | child_page:&Cow, child_must_dup:bool, 127 | forgetting:u16, replace_page:u64, 128 | page_will_be_dup:bool) -> Result { 129 | debug!("rebalance_right {:?}, levels {:?}", page.page_offset(), &levels[..]); 130 | 131 | // First operation: take all elements from one of the sides of the 132 | // merge, insert them into the other side. This might cause a split. 133 | 134 | // We want to delete the next element, i.e. the one after levels[0]. 135 | let next = u16::from_le(unsafe { *(page.offset(levels[0] as isize) as *const u16) }); 136 | debug_assert!(next!=NIL); 137 | 138 | // From now on, we'll call the "current" and "next" elements the 139 | // elements at levels[0] and the successor of levels[0], 140 | // regardless of whether they've been changed by the previous 141 | // calls. 142 | 143 | // Find the right child of the next element. 144 | let left_child = { 145 | let left_child = page.right_child(levels[0]); 146 | // u64::from_le(unsafe { *((page.offset(levels[0] as isize) as *const u64).offset(2)) }); 147 | txn.load_cow_page(left_child) 148 | }; 149 | // Find the right child of the current element. 150 | 151 | // Compute the page sizes to decide what to do (merge vs. rebalance). 
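    // Added worked example for the size test computed just below (byte
    // counts are hypothetical): with right_size = 3000, deleted_size = 200
    // and left_size = 2500, we get left_size <= right_size - deleted_size
    // (2500 <= 2800), so taking bindings from the left sibling cannot
    // improve the balance, and the function returns Res::Nothing, leaving
    // the caller to fall back on merging.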
152 | let right_size = child_page.occupied(); 153 | let left_size = left_child.occupied(); 154 | let middle_size = { 155 | if let Some(repl) = replacement { 156 | record_size(repl.key_len, repl.value.len() as usize) 157 | } else { 158 | let (key,value) = unsafe { read_key_value(page.offset(next as isize)) }; 159 | record_size(key.len(), value.len() as usize) 160 | } 161 | }; 162 | let deleted_size = { 163 | let ptr = child_page.offset(forgetting as isize); 164 | let (key,value) = unsafe { read_key_value(ptr) }; 165 | debug!("delete key: {:?}", std::str::from_utf8(key)); 166 | record_size(key.len(), value.len() as usize) 167 | }; 168 | if left_size <= right_size - deleted_size { 169 | return Ok(Res::Nothing { page:page }) 170 | } 171 | 172 | ////////////////////////////////////////////// 173 | 174 | let size = right_size + left_size + middle_size - deleted_size; 175 | debug!("sizes: {:?} {:?} {:?} sum = {:?}", right_size, left_size, middle_size, size); 176 | 177 | let mut new_left = try!(txn.alloc_page()); 178 | new_left.init(); 179 | let mut new_right = try!(txn.alloc_page()); 180 | new_right.init(); 181 | let mut middle = None; 182 | debug!("allocated {:?} and {:?}", new_left.page_offset(), new_right.page_offset()); 183 | 184 | let left_rc = get_rc(txn, left_child.page_offset()); 185 | 186 | let left_left_child = left_child.right_child(FIRST_HEAD); 187 | new_left.set_right_child(FIRST_HEAD, left_left_child); 188 | // u64::from_le(*((left_child.offset(FIRST_HEAD as isize) as *const u64).offset(2))); 189 | // *((new_left.offset(FIRST_HEAD as isize) as *mut u64).offset(2)) = left_left_child.to_le(); 190 | if (page_will_be_dup || left_rc > 1) && left_left_child > 0 { 191 | // If both `left` and `new_left` stay alive after this 192 | // call, there is one more reference to left_left 193 | try!(incr_rc(rng, txn, left_left_child)) 194 | } else { 195 | debug!("line {:?}: not incr {:?}", line!(), left_left_child) 196 | } 197 | 198 | 199 | let mut left_bytes = 24; 200 | let mut left_levels = [0;N_LEVELS]; 201 | let mut right_levels = [0;N_LEVELS]; 202 | 203 | for (_, key, value, r) in PageIterator::new(&left_child,0) { 204 | 205 | let next_size = record_size(key.len(),value.len() as usize); 206 | if page_will_be_dup || left_rc > 1 { 207 | if r > 0 { 208 | try!(incr_rc(rng, txn, r)) 209 | } else { 210 | debug!("line {:?}: not incr {:?}", line!(), r) 211 | } 212 | if let UnsafeValue::O { offset,.. } = value { 213 | try!(incr_rc(rng, txn, offset)) 214 | } 215 | } else { 216 | debug!("line {:?}: not incr {:?}", line!(), r) 217 | } 218 | if middle.is_none() { 219 | debug!("left_bytes = {:?} {:?} {:?}", left_bytes, size, next_size); 220 | // Should we insert next_size into the left page, or as the middle element? 221 | if left_bytes+next_size // Size of the left page if we insert it into the left page. 222 | <= 223 | (size - next_size) / 2 // Size if we use this element as the middle one. 224 | { 225 | // insert in left page. 226 | let off = new_left.can_alloc(next_size); 227 | debug_assert!(off > 0); 228 | debug_assert!(off + next_size <= PAGE_SIZE as u16); 229 | debug!("key -> left: {:?} {:?}", std::str::from_utf8(key), r); 230 | local_insert_at(rng, &mut new_left, key, value, r, off, next_size, &mut left_levels); 231 | left_bytes += next_size; 232 | } else { 233 | middle = Some((key.as_ptr(),key.len(),value,r)) 234 | } 235 | } else { 236 | // insert in right page. 
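                // Added note on the criterion above: a binding becomes the
                // middle element (the future separator) as soon as adding
                // it to the left page would push left_bytes past (roughly)
                // half of the total payload `size`; every later binding,
                // handled by this branch, goes to the right page, so the
                // two pages end up holding close to size / 2 bytes each.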
237 | let off = new_right.can_alloc(next_size); 238 | debug_assert!(off > 0); 239 | debug_assert!(off + next_size <= PAGE_SIZE as u16); 240 | debug!("key -> right: {:?} {:?}", std::str::from_utf8(key), r); 241 | local_insert_at(rng, &mut new_right, key, value, r, off, next_size, &mut right_levels); 242 | } 243 | } 244 | 245 | debug_assert!(middle.is_some()); 246 | { 247 | let right_left_child = child_page.right_child(FIRST_HEAD); 248 | // u64::from_le(unsafe { *((child_page.offset(0) as *const u64).offset(2)) }); 249 | debug!("right_left_child = {:?}", right_left_child); 250 | let (key,value) = unsafe { read_key_value(page.offset(next as isize)) }; 251 | let (key,value) = 252 | if let Some(repl) = replacement { 253 | debug!("replacement"); 254 | /* 255 | if !(child_must_dup || page_will_be_dup) && do_free_value { 256 | if let UnsafeValue::O { offset, len } = value { 257 | try!(free_value(rng, txn, offset, len)) 258 | } 259 | } 260 | */ 261 | unsafe { (std::slice::from_raw_parts(repl.key_ptr, repl.key_len), repl.value) } 262 | } else { 263 | debug!("original"); 264 | // If this page (the one containing the value) is 265 | // duplicated, there will be one more reference to 266 | // this value. 267 | if page_will_be_dup { 268 | if let UnsafeValue::O { ref offset, .. } = value { 269 | try!(incr_rc(rng, txn, *offset)) 270 | } 271 | } 272 | (key, value) 273 | }; 274 | let next_size = record_size(key.len(),value.len() as usize); 275 | let off = new_right.can_alloc(next_size); 276 | debug_assert!(off > 0); 277 | debug_assert!(off + next_size <= PAGE_SIZE as u16); 278 | debug!("key -> right (middle): {:?} {:?} {:?} {:?}", std::str::from_utf8(key), right_left_child, page_will_be_dup, child_must_dup); 279 | if page_will_be_dup || child_must_dup { 280 | let page_will_be_forgotten = unsafe { 281 | u16::from_le(*(child_page.offset(FIRST_HEAD as isize) as *const u16)) == forgetting 282 | }; 283 | if right_left_child > 0 && !page_will_be_forgotten { 284 | // If the child is still alive after this call, increment 285 | // the grandchild's RC 286 | try!(incr_rc(rng, txn, right_left_child)) 287 | } else { 288 | debug!("line {:?}: not incr {:?}", line!(), right_left_child) 289 | } 290 | } else { 291 | debug!("line {:?}: not incr {:?}", line!(), right_left_child) 292 | } 293 | local_insert_at(rng, &mut new_right, key, value, right_left_child, off, next_size, &mut right_levels) 294 | } 295 | 296 | let mut last_updated_ptr = new_right.offset(right_levels[0] as isize); 297 | debug!("forgetting:{:?}", forgetting); 298 | for (cur, key, value, r) in PageIterator::new(child_page,0) { 299 | debug!("cur:{:?}, r:{:?}", cur, r); 300 | if cur != forgetting { 301 | let next_size = record_size(key.len(),value.len() as usize); 302 | // insert in right page. 303 | let off = new_right.can_alloc(next_size); 304 | debug_assert!(off > 0); 305 | debug_assert!(off + next_size <= PAGE_SIZE as u16); 306 | last_updated_ptr = new_right.offset(off as isize); 307 | debug!("key -> right: {:?} {:?}", std::str::from_utf8(key), r); 308 | 309 | if page_will_be_dup || child_must_dup { 310 | let page_will_be_forgotten = unsafe { 311 | u16::from_le(*(child_page.offset(cur as isize) as *const u16)) == forgetting 312 | }; 313 | if r > 0 && !page_will_be_forgotten { 314 | try!(incr_rc(rng, txn, r)) 315 | } else { 316 | debug!("line {:?}: not incr {:?}", line!(), r) 317 | } 318 | if let UnsafeValue::O { offset, .. 
} = value { 319 | try!(incr_rc(rng, txn, offset)) 320 | } 321 | } else { 322 | debug!("line {:?}: not incr {:?}", line!(), r) 323 | } 324 | local_insert_at(rng, &mut new_right, key, value, r, off, next_size, &mut right_levels) 325 | } else { 326 | /* 327 | if !(child_must_dup || page_will_be_dup) && do_free_value { 328 | if let UnsafeValue::O { offset, len } = value { 329 | try!(free_value(rng, txn, offset, len)) 330 | } 331 | } 332 | */ 333 | debug!("replacing ptr, replace_page={:?}", replace_page); 334 | unsafe { *((last_updated_ptr as *mut u64).offset(2)) = replace_page.to_le(); } 335 | } 336 | } 337 | 338 | 339 | let result = { 340 | // Delete the current entry, insert the new one instead. 341 | if let Some((key_ptr,key_len,value,r)) = middle { 342 | 343 | unsafe { *((new_right.offset(FIRST_HEAD as isize) as *mut u64).offset(2)) = r.to_le(); } 344 | let key = unsafe { std::slice::from_raw_parts(key_ptr, key_len) }; 345 | debug!("middle = {:?}", std::str::from_utf8(key)); 346 | // The following call might split. 347 | unsafe { 348 | check_alloc_local_insert(rng, txn, page, 349 | key, value, new_left.page_offset(), new_right.page_offset(), &mut levels, 350 | page_will_be_dup) 351 | } 352 | } else { 353 | unreachable!() 354 | } 355 | }; 356 | debug!("result = {:?}", result); 357 | // 358 | debug!("freeing left: {:?} {:?}", left_child.page_offset(), page_will_be_dup); 359 | 360 | if !page_will_be_dup { 361 | // Decrease the reference counter of the left child. 362 | try!(free(rng, txn, left_child.page_offset())); 363 | // Decrease the reference counter of the child. 364 | debug!("freeing child: {:?}", child_page.page_offset()); 365 | try!(free(rng, txn, child_page.page_offset())); 366 | } 367 | result 368 | } 369 | 370 | 371 | 372 | 373 | 374 | 375 | 376 | /// Take elements from the right child of the element immediately 377 | /// after the current one (where "current" is the one indicated by 378 | /// `levels`), and move them to the right child of the current 379 | /// element, updating the separator. 380 | /// 381 | /// Assumes `child_page` is the current element's right child. 382 | pub fn rebalance_left(rng:&mut R, txn:&mut MutTxn, page:Cow, mut levels:[u16;N_LEVELS], 383 | child_page:&Cow, child_must_dup:bool, 384 | forgetting:u16, replace_page:u64, 385 | page_will_be_dup:bool) -> Result { 386 | debug!("rebalance_left"); 387 | 388 | // First operation: take all elements from one of the sides of the 389 | // merge, insert them into the other side. This might cause a split. 390 | 391 | // We want to delete the next element, i.e. the one after levels[0]. 392 | let next = u16::from_le(unsafe { *(page.offset(levels[0] as isize) as *const u16) }); 393 | debug_assert!(next!=NIL); 394 | 395 | // From now on, we'll call the "current" and "next" elements the 396 | // elements at levels[0] and the successor of levels[0], 397 | // regardless of whether they've been changed by the previous 398 | // calls. 399 | 400 | // Find the right child of the next element. 401 | let right_child = { 402 | let right_child = page.right_child(next); // u64::from_le(unsafe { *((page.offset(next as isize) as *const u64).offset(2)) }); 403 | txn.load_cow_page(right_child) 404 | }; 405 | 406 | // Compute the page sizes to decide what to do (merge vs. rebalance). 
407 | let left_size = child_page.occupied(); 408 | let right_size = right_child.occupied(); 409 | let middle_size = { 410 | let (key,value) = unsafe { read_key_value(page.offset(next as isize)) }; 411 | record_size(key.len(), value.len() as usize) 412 | }; 413 | let deleted_size = { 414 | let ptr = child_page.offset(forgetting as isize); 415 | let (key,value) = unsafe { read_key_value(ptr) }; 416 | debug!("delete key: {:?}", std::str::from_utf8(key)); 417 | record_size(key.len(), value.len() as usize) 418 | }; 419 | if right_size <= left_size - deleted_size { 420 | return Ok(Res::Nothing { page:page }) 421 | } 422 | let size = right_size + left_size + middle_size - deleted_size; 423 | debug!("sizes: {:?} {:?} {:?} sum = {:?}", right_size, left_size, middle_size, size); 424 | 425 | let mut new_left = try!(txn.alloc_page()); 426 | new_left.init(); 427 | let mut new_right = try!(txn.alloc_page()); 428 | new_right.init(); 429 | let mut middle = None; 430 | debug!("allocated {:?} and {:?}", new_left.page_offset(), new_right.page_offset()); 431 | 432 | let left_left_child = child_page.right_child(FIRST_HEAD); 433 | // u64::from_le(*((child_page.offset(FIRST_HEAD as isize) as *const u64).offset(2))); 434 | new_left.set_right_child(FIRST_HEAD, left_left_child); 435 | // *((new_left.offset(FIRST_HEAD as isize) as *mut u64).offset(2)) = left_left_child.to_le(); 436 | 437 | unsafe { 438 | let page_will_be_forgotten = u16::from_le(*(child_page.offset(FIRST_HEAD as isize) as *const u16)) == forgetting; 439 | 440 | if (page_will_be_dup || child_must_dup) && left_left_child > 0 && !page_will_be_forgotten { 441 | debug!("incr left_left {:?}", left_left_child); 442 | try!(incr_rc(rng, txn, left_left_child)) 443 | } 444 | } 445 | 446 | let mut left_bytes = 24; 447 | let mut left_levels = [0;N_LEVELS]; 448 | let mut right_levels = [0;N_LEVELS]; 449 | 450 | let mut last_updated_ptr = new_left.offset(0); 451 | for (cur, key, value, r) in PageIterator::new(child_page,0) { 452 | if cur != forgetting { 453 | let next_size = record_size(key.len(),value.len() as usize); 454 | // insert in right page. 455 | let off = new_left.can_alloc(next_size); 456 | debug_assert!(off > 0); 457 | debug_assert!(off + next_size <= PAGE_SIZE as u16); 458 | last_updated_ptr = new_left.offset(off as isize); 459 | 460 | debug!("key -> left: {:?} {:?}", std::str::from_utf8(key), r); 461 | if page_will_be_dup || child_must_dup { 462 | let page_will_be_forgotten = unsafe { 463 | u16::from_le(*(child_page.offset(cur as isize) as *const u16)) == forgetting 464 | }; 465 | if r > 0 && !page_will_be_forgotten { 466 | try!(incr_rc(rng, txn, r)) 467 | } else { 468 | debug!("line {:?}: not incr {:?}", line!(), r) 469 | } 470 | if let UnsafeValue::O { offset, .. 
} = value { 471 | try!(incr_rc(rng, txn, offset)) 472 | } 473 | } else { 474 | debug!("line {:?}: not incr {:?}", line!(), r) 475 | } 476 | local_insert_at(rng, &mut new_left, key, value, r, off, next_size, &mut left_levels); 477 | left_bytes += next_size; 478 | } else { 479 | // freeing value: already done in the recursive calls before 480 | /* 481 | if !(child_must_dup || page_will_be_dup) && do_free_value { 482 | if let UnsafeValue::O { offset, len } = value { 483 | try!(free_value(rng, txn, offset, len)) 484 | } 485 | } 486 | */ 487 | unsafe { *((last_updated_ptr as *mut u64).offset(2)) = replace_page.to_le() } 488 | } 489 | } 490 | let right_rc = get_rc(txn, right_child.page_offset()); 491 | { 492 | let right_left_child = right_child.right_child(FIRST_HEAD); 493 | // u64::from_le(unsafe { *((right_child.offset(0) as *const u64).offset(2)) }); 494 | let (key,value) = unsafe { read_key_value(page.offset(next as isize)) }; 495 | let next_size = record_size(key.len(),value.len() as usize); 496 | let off = new_left.can_alloc(next_size); 497 | debug_assert!(off > 0); 498 | debug_assert!(off + next_size <= PAGE_SIZE as u16); 499 | debug!("key -> left: {:?} {:?}", std::str::from_utf8(key), right_left_child); 500 | if page_will_be_dup || right_rc > 1 { 501 | if right_left_child > 0 { 502 | debug!("incr right_left {:?}", right_left_child); 503 | try!(incr_rc(rng, txn, right_left_child)) 504 | } else { 505 | debug!("line {:?}: not incr {:?}", line!(), right_left_child) 506 | } 507 | } else { 508 | debug!("line {:?}: not incr {:?}", line!(), right_left_child) 509 | } 510 | if page_will_be_dup { 511 | if let UnsafeValue::O { offset, .. } = value { 512 | try!(incr_rc(rng, txn, offset)) 513 | } 514 | } 515 | local_insert_at(rng, &mut new_left, key, value, right_left_child, off, next_size, &mut left_levels); 516 | left_bytes += next_size; 517 | } 518 | for (_, key, value, r) in PageIterator::new(&right_child,0) { 519 | 520 | let next_size = record_size(key.len(),value.len() as usize); 521 | if page_will_be_dup || right_rc > 1 { 522 | if r > 0 { 523 | try!(incr_rc(rng, txn, r)) 524 | } else { 525 | debug!("line {:?}: not incr {:?}", line!(), r) 526 | } 527 | if let UnsafeValue::O { offset, .. } = value { 528 | try!(incr_rc(rng, txn, offset)) 529 | } 530 | } else { 531 | debug!("line {:?}: not incr {:?}", line!(), r) 532 | } 533 | if middle.is_none() { 534 | debug!("left_bytes = {:?} {:?} {:?}", left_bytes, size, next_size); 535 | // Should we insert next_size into the left page, or as the middle element? 536 | if left_bytes+next_size // Size of the left page if we insert it into the left page. 537 | <= 538 | (size - next_size) / 2 // Size if we use this element as the middle one. 539 | { 540 | // insert in left page. 541 | let off = new_left.can_alloc(next_size); 542 | debug_assert!(off > 0); 543 | debug_assert!(off + next_size <= PAGE_SIZE as u16); 544 | debug!("key -> right: {:?} {:?}", std::str::from_utf8(key), r); 545 | local_insert_at(rng, &mut new_left, key, value, r, off, next_size, &mut left_levels); 546 | left_bytes += next_size; 547 | } else { 548 | middle = Some((key.as_ptr(),key.len(),value,r)) 549 | } 550 | } else { 551 | // insert in right page. 552 | let off = new_right.can_alloc(next_size); 553 | debug_assert!(off > 0); 554 | debug_assert!(off + next_size <= PAGE_SIZE as u16); 555 | local_insert_at(rng, &mut new_right, key, value, r, off, next_size, &mut right_levels); 556 | } 557 | } 558 | 559 | let result = { 560 | // Delete the current entry, insert the new one instead. 
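        // Added note: `middle` is the binding promoted to be the new
        // separator between `new_left` and `new_right`. It is not stored
        // in either child; `check_alloc_local_insert` below writes it into
        // the parent page with the two fresh pages as its left and right
        // children (and may itself split the parent).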
561 | if let Some((key_ptr,key_len,value,r)) = middle { 562 | 563 | new_right.set_right_child(FIRST_HEAD, r); 564 | // unsafe { *((new_right.offset(FIRST_HEAD as isize) as *mut u64).offset(2)) = r.to_le(); } 565 | let key = unsafe { std::slice::from_raw_parts(key_ptr, key_len) }; 566 | debug!("middle = {:?}", std::str::from_utf8(key)); 567 | // The following call might split. 568 | unsafe { 569 | check_alloc_local_insert(rng, txn, page, 570 | key, value, new_left.page_offset(), new_right.page_offset(), &mut levels, 571 | page_will_be_dup) 572 | } 573 | } else { 574 | unreachable!() 575 | } 576 | }; 577 | debug!("result = {:?}", result); 578 | // 579 | debug!("freeing right: {:?} {:?}", right_child.page_offset(), page_will_be_dup); 580 | if !page_will_be_dup { 581 | try!(free(rng, txn, right_child.page_offset())); 582 | debug!("freeing child: {:?}", child_page.page_offset()); 583 | try!(free(rng, txn, child_page.page_offset())); 584 | } 585 | result 586 | } 587 | 588 | 589 | 590 | 591 | /// If the levels have already been found, compact or split the page 592 | /// if necessary, and inserts the input (key, value) into the result, 593 | /// at the input levels. 594 | unsafe fn check_alloc_local_insert(rng:&mut R, txn:&mut MutTxn, page:Cow, key:&[u8], value:UnsafeValue, left_page: u64, right_page:u64, levels:&mut [u16], page_will_be_dup:bool) -> Result { 595 | debug!("check_alloc_local_insert, levels {:?}, left={:?}, right={:?}", levels, left_page, right_page); 596 | let size = record_size(key.len(), value.len() as usize); 597 | let mut new_levels = [NIL;N_LEVELS]; 598 | let off = page.can_alloc(size); 599 | if off > 0 { 600 | 601 | debug!("check_alloc_local_insert: non-split"); 602 | let mut page = 603 | if page_will_be_dup { 604 | try!(copy_page(rng, txn, &page.as_page(), &levels, &mut new_levels, true, false, left_page, true)) 605 | } else { 606 | debug_assert!(get_rc(txn, page.page_offset()) <= 1); 607 | if off + size < PAGE_SIZE as u16 { 608 | // No need to copy nor compact the page, the value can be written right away. 609 | debug!("check_alloc, no compaction, levels={:?}", levels); 610 | try!(cow_pinpointing(rng, txn, page, levels, &mut new_levels, true, false, left_page)) 611 | } else { 612 | // Here, we need to compact the page, which is equivalent to considering it non mutable and CoW it. 
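            // Added note: `as_nonmut()` below forces the copy-on-write
            // branch of `cow_pinpointing`, which re-inserts the surviving
            // bindings one by one into a freshly allocated page. That
            // rewrite is the compaction: afterwards the free space is
            // contiguous, and the `can_alloc` call below is guaranteed to
            // find room for `size`.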
613 | debug!("check_alloc, compaction, levels={:?}", levels); 614 | let page = try!(cow_pinpointing(rng, txn, page.as_nonmut(), levels, &mut new_levels, true, false, left_page)); 615 | page 616 | } 617 | }; 618 | let off = page.can_alloc(size); 619 | debug_assert!(off+size < PAGE_SIZE as u16); 620 | debug!("new_levels:{:?}", new_levels); 621 | local_insert_at(rng, &mut page, key, value, right_page, off, size, &mut new_levels); 622 | std::ptr::copy_nonoverlapping(new_levels.as_ptr(), levels.as_mut_ptr(), N_LEVELS); 623 | Ok(Res::Ok { page:page }) 624 | } else { 625 | debug!("check_alloc_local_insert: split"); 626 | let next = u16::from_le(*(page.offset(levels[0] as isize) as *const u16)); 627 | Ok(try!(split_page(rng, txn, &page, key, value, right_page, page_will_be_dup, next, levels[0], left_page))) 628 | } 629 | } 630 | -------------------------------------------------------------------------------- /src/put.rs: -------------------------------------------------------------------------------- 1 | use super::txn::*; 2 | use super::transaction::{PAGE_SIZE,Error}; 3 | use std; 4 | use std::cmp::Ordering; 5 | use super::transaction; 6 | use rand::{Rng}; 7 | 8 | extern crate log; 9 | 10 | #[derive(Debug)] 11 | pub enum Res { 12 | Ok { page: MutPage }, 13 | Underfull { 14 | page: Cow, // The page where we want to delete something. 15 | delete: [u16;N_LEVELS], // The binding before the one we want to delete. 16 | merged: u64, // The updated left child of the deleted binding. 17 | must_be_dup: bool // This page is referenced at least twice (used when rebalancing fails) 18 | }, 19 | Split { 20 | key_ptr:*const u8, 21 | key_len:usize, 22 | value: UnsafeValue, 23 | left: MutPage, 24 | right: MutPage, 25 | free_page: u64, // Former version of the page, before the 26 | // split. Free after the split is performed. Might be 0 if no page 27 | // needs to be freed / decremented. 28 | }, 29 | Nothing { page:Cow } 30 | } 31 | 32 | 33 | pub fn fork_db(rng:&mut R, txn:&mut MutTxn, off:u64) -> Result<(),Error> { 34 | try!(incr_rc(rng,txn,off)); 35 | Ok(()) 36 | } 37 | 38 | /// Increase the reference count of a page. 39 | pub fn incr_rc(rng:&mut R, txn:&mut MutTxn, off:u64)->Result<(),Error> { 40 | debug!(">>>>>>>>>>>> incr_rc"); 41 | let mut rc = if let Some(rc) = txn.rc() { rc } else { try!(txn.create_db()) }; 42 | let count = txn.get_u64(&rc, off).unwrap_or(1); 43 | debug!("incrementing page {:?} to {:?}", off, count+1); 44 | try!(txn.replace_u64(rng, &mut rc, off, count+1)); 45 | txn.set_rc(rc); 46 | debug!("<<<<<<<<<<<< incr_rc"); 47 | Ok(()) 48 | } 49 | 50 | /// Increase the reference count of a page. 51 | pub fn decr_rc(rng:&mut R, txn:&mut MutTxn, off:u64)->Result<(),Error> { 52 | let mut rc = if let Some(rc) = txn.rc() { rc } else { try!(txn.create_db()) }; 53 | let count = txn.get_u64(&rc, off).unwrap_or(1); 54 | debug!(">>>>>>>>>>>> decr_rc {:?} {:?}", off, count); 55 | if count-1 <= 1 { 56 | try!(txn.del_u64(rng, &mut rc, off)); 57 | } else { 58 | try!(txn.replace_u64(rng, &mut rc, off, count-1)); 59 | } 60 | txn.set_rc(rc); 61 | debug!("<<<<<<<<<<<< decr_rc"); 62 | Ok(()) 63 | } 64 | 65 | /// Get the reference count of a page. Returns 0 if the page is not reference-counted. 66 | pub fn get_rc(txn:&T, off:u64) -> u64 { 67 | if let Some(rc) = txn.rc() { 68 | txn.get_u64(&rc, off).unwrap_or(1) 69 | } else { 70 | 0 71 | } 72 | } 73 | 74 | 75 | /// Decrease the reference count of a page, freeing it if it's no longer referenced. 
76 | pub fn free<R:Rng, T>(rng:&mut R, txn:&mut MutTxn<T>, off:u64) -> Result<(),Error> {
77 |     //println!("freeing {:?}", off);
78 |     debug_assert!(off != 0);
79 |     let really_free = {
80 |         if let Some(mut rc) = txn.rc() {
81 |             if let Some(count) = txn.get_u64(&rc, off) {
82 |                 if count>1 {
83 |                     debug!("rc: {:?}, off: {:?}, count: {:?}", rc, off, count);
84 |                     if count > 2 {
85 |                         try!(txn.replace_u64(rng, &mut rc, off, count-1));
86 |                     } else {
87 |                         try!(txn.del_u64(rng, &mut rc, off));
88 |                     };
89 |                     txn.set_rc(rc);
90 |                     false
91 |                 } else {
92 |                     try!(txn.del_u64(rng,&mut rc,off));
93 |                     txn.set_rc(rc);
94 |                     true
95 |                 }
96 |             } else {
97 |                 true
98 |             }
99 |         } else {
100 |             true
101 |         }
102 |     };
103 |     if really_free {
104 |         let mut index = 3;
105 |         if txn.protected_pages[0] == off {
106 |             index = 0
107 |         } else if txn.protected_pages[1] == off {
108 |             index = 1
109 |         }
110 |         if index < 3 {
111 |             debug!("not freeing protected {:?}", off);
112 |             txn.free_protected[index] = true
113 |         } else {
114 |             debug!("really freeing {:?}", off);
115 |             unsafe { transaction::free(&mut txn.txn, off) }
116 |         }
117 |     }
118 |     Ok(())
119 | }
120 | 
121 | 
122 | 
123 | /// Allocate one large value, spanning at least one page.
124 | pub fn alloc_value<T>(txn:&mut MutTxn<T>, value: &[u8]) -> Result<UnsafeValue, Error> {
125 |     debug!("alloc_value");
126 |     let mut len = value.len();
127 |     let mut p_value = value.as_ptr();
128 |     let mut page = try!(txn.alloc_page());
129 |     let first_page = page.page_offset();
130 |     unsafe {
131 |         loop {
132 |             if len <= PAGE_SIZE {
133 |                 std::ptr::copy_nonoverlapping(p_value, page.offset(0), len);
134 |                 break
135 |             } else {
136 |                 std::ptr::copy_nonoverlapping(p_value, page.offset(8), PAGE_SIZE-8);
137 |                 p_value = p_value.offset((PAGE_SIZE-8) as isize);
138 |                 len -= PAGE_SIZE - 8;
139 |                 let next_page = try!(txn.alloc_page());
140 |                 *(page.offset(0) as *mut u64) = next_page.page_offset().to_le();
141 |                 page = next_page
142 |             }
143 |         }
144 |     }
145 |     debug_assert!(first_page > 0);
146 |     debug!("/alloc_value");
147 |     Ok(UnsafeValue::O { offset: first_page, len: value.len() as u32 })
148 | }
149 | 
150 | 
151 | 
152 | pub fn free_value<R:Rng, T>(rng:&mut R, txn:&mut MutTxn<T>, mut offset:u64, mut len:u32)->Result<(),Error> {
153 |     debug!(">>>>>>>>>>>>>>>>>>>>> freeing value {:?}", offset);
154 |     let really_free =
155 |         if let Some(mut rc) = txn.rc() {
156 |             if let Some(count) = txn.get_u64(&mut rc, offset) {
157 |                 debug!("count = {:?}", count);
158 |                 if count>1 {
159 |                     try!(txn.replace_u64(rng, &mut rc, offset, count-1));
160 |                     txn.set_rc(rc);
161 |                     false
162 |                 } else {
163 |                     try!(txn.del_u64(rng, &mut rc, offset));
164 |                     txn.set_rc(rc);
165 |                     true
166 |                 }
167 |             } else {
168 |                 true
169 |             }
170 |         } else {
171 |             true
172 |         };
173 |     if (!cfg!(feature="no_free")) && really_free {
174 |         debug!("really freeing value {:?}", offset);
175 |         unsafe {
176 |             loop {
177 |                 if len <= PAGE_SIZE as u32 {
178 |                     transaction::free(&mut txn.txn, offset);
179 |                     break
180 |                 } else {
181 |                     let page = txn.load_cow_page(offset).data();
182 |                     let next_offset = u64::from_le(*(page as *const u64));
183 |                     transaction::free(&mut txn.txn, offset);
184 | 
185 |                     len -= (PAGE_SIZE-8) as u32;
186 |                     offset = next_offset;
187 |                 }
188 |             }
189 |         }
190 |     }
191 |     debug!("<<<<<<<<<<<<<<<<<<<<< free_value");
192 |     Ok(())
193 | }
194 | 
195 | 
196 | /// Returns a mutable copy of the page, possibly forgetting the next binding (and then possibly also freeing the associated value), and possibly incrementing the reference counts of child pages.
197 | /// If translate_right > 0, replaces the next child page by translate_right. 198 | /// 199 | /// For performance reasons, we don't copy anything on the way to the 200 | /// leaves, instead copying on the way back. 201 | /// 202 | /// Therefore, we might need to copy pages without freeing the 203 | /// previous one, since their reference count is not yet updated. 204 | /// 205 | pub fn copy_page(rng:&mut R, txn:&mut MutTxn, p:&Page, old_levels:&[u16], pinpoints:&mut [u16], 206 | forgetting_next: bool, forgetting_value:bool, 207 | translate_right: u64, incr_children_rc:bool) -> Result { 208 | unsafe { 209 | // Reset all pinpoints. 210 | for i in 0.. N_LEVELS { 211 | pinpoints[i] = FIRST_HEAD; 212 | } 213 | // 214 | 215 | let forget = if forgetting_next { 216 | u16::from_le(*(p.offset(old_levels[0] as isize) as *const u16)) 217 | } else { 218 | NIL 219 | }; 220 | 221 | let mut page = try!(txn.alloc_page()); 222 | debug!("copy_page: allocated {:?}", page.page_offset()); 223 | page.init(); 224 | let mut n = 0; 225 | let mut levels:[u16;N_LEVELS] = [FIRST_HEAD;N_LEVELS]; 226 | 227 | let right_page = 228 | if old_levels[0]==FIRST_HEAD && translate_right > 0 { 229 | translate_right 230 | } else { 231 | let r = u64::from_le(*((p.offset(FIRST_HEAD as isize) as *mut u64).offset(2))); 232 | if incr_children_rc && r > 0 { 233 | try!(incr_rc(rng, txn, r)) 234 | } 235 | r 236 | }; 237 | *((page.offset(FIRST_HEAD as isize) as *mut u64).offset(2)) = right_page.to_le(); 238 | 239 | for (current, key, value, right) in PageIterator::new(p, 0) { 240 | 241 | let right = if current == old_levels[0] && translate_right > 0 { 242 | translate_right 243 | } else { 244 | right 245 | }; 246 | if current != forget { 247 | 248 | if right > 0 && right != translate_right && incr_children_rc { 249 | debug!("copy, incr {:?}", right); 250 | try!(incr_rc(rng, txn, right)) 251 | } 252 | 253 | // Increase count of value if the previous 254 | // page is not freed at the end of this 255 | // function. 256 | if incr_children_rc { 257 | if let UnsafeValue::O { offset,.. } = value { 258 | try!(incr_rc(rng, txn, offset)) 259 | } 260 | } 261 | debug!("copy_page: {:?}", std::str::from_utf8(key)); 262 | let size = record_size(key.len(), value.len() as usize); 263 | let off = page.can_alloc(size); 264 | debug!("size={:?}, off = {:?}", size, off); 265 | debug_assert!(off > 0); 266 | page.reset_pointers(off); 267 | page.alloc_key_value(off, size, key.as_ptr(), key.len(), value); 268 | *((page.offset(off as isize) as *mut u64).offset(2)) = right.to_le(); 269 | 270 | for level in 0..N_LEVELS { 271 | if n & ((1 << level)-1) == 0 { // always true for level = 0 272 | debug!("link from {:?} to {:?} at level {:?}", levels[level], off, level); 273 | *((page.offset(levels[level] as isize) as *mut u16).offset(level as isize)) = off.to_le(); 274 | levels[level] = off; 275 | // If the pinpointed offset has not passed yet, update the pinpoint at this level. 
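                    // Added note: the test above links binding number n at
                    // level l exactly when n is a multiple of 2^l, so the
                    // copy rebuilds a deterministic skip list; e.g. the
                    // binding with n == 4 is linked at levels 0, 1 and 2.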
276 | if pinpoints[0] == FIRST_HEAD && level > 0 && old_levels[0] != FIRST_HEAD { 277 | pinpoints[level] = off 278 | } 279 | } 280 | } 281 | 282 | if old_levels[0] == current { 283 | pinpoints[0] = off 284 | } 285 | } else { 286 | debug!("copy: forgetting"); 287 | if forgetting_value { 288 | // Here, maybe we need to forget 289 | if let UnsafeValue::O { offset, len } = value { 290 | //println!("cow_pinpointing: freeing value {:?}", offset); 291 | try!(free_value(rng, txn, offset, len)) 292 | } 293 | } 294 | } 295 | n+=1; 296 | } 297 | Ok(page) 298 | } 299 | } 300 | 301 | /// Turn a Cow into a MutPage, copying it if it's not already mutable. In the case a copy is needed, and argument 'pinpoint' is non-zero, a non-zero offset (in bytes) to the equivalent element in the new page is returned. This can happen for instance because of compaction. 302 | pub fn cow_pinpointing(rng:&mut R, txn:&mut MutTxn, page:Cow, old_levels:&[u16], pinpoints:&mut [u16], 303 | forgetting_next: bool, forgetting_value:bool, 304 | translate_right:u64) -> Result { 305 | unsafe { 306 | match page.cow { 307 | transaction::Cow::Page(p0) => { 308 | let p0_offset = p0.offset; 309 | let page_rc = get_rc(txn, p0_offset); 310 | let p = Page { page:p0 }; 311 | 312 | let page = try!(copy_page(rng, txn, &p, old_levels, pinpoints, forgetting_next, 313 | forgetting_value, translate_right, false)); // never increase the counter of child pages 314 | if page_rc <= 1 { 315 | if page_rc == 1 { 316 | let mut rc = txn.rc().unwrap(); 317 | try!(txn.del_u64(rng, &mut rc, p0_offset)); 318 | txn.set_rc(rc); 319 | } 320 | //println!("free cow: {:?}", page_offset); 321 | if !cfg!(feature="no_free") { 322 | transaction::free(&mut(txn.txn), p0_offset) 323 | } 324 | } else { 325 | let mut rc = txn.rc().unwrap(); 326 | try!(txn.replace_u64(rng, &mut rc, p0_offset, page_rc-1)); 327 | txn.set_rc(rc); 328 | } 329 | Ok(page) 330 | } 331 | transaction::Cow::MutPage(p) => { 332 | let p = MutPage { page:p }; 333 | std::ptr::copy_nonoverlapping(old_levels.as_ptr(), pinpoints.as_mut_ptr(), old_levels.len()); 334 | if forgetting_next { 335 | let next = u16::from_le(*(p.offset(old_levels[0] as isize) as *const u16)); 336 | debug!("next = {:?}", next); 337 | debug_assert!(next > 0); 338 | // We forget an entry, register the freed memory. 339 | let (key,value) = read_key_value(p.offset(next as isize)); 340 | if forgetting_value { 341 | if let UnsafeValue::O { offset, len } = value { 342 | // println!("cow_pinpointing: freeing value {:?}", offset); 343 | try!(free_value(rng, txn, offset, len)) 344 | } 345 | } 346 | // Mark the freed space on the page. 347 | let size = record_size(key.len(),value.len() as usize); 348 | *(p.p_occupied()) = (p.occupied() - size).to_le(); 349 | 350 | 351 | // Now, really delete! 352 | for l in 0..N_LEVELS { 353 | debug_assert!(old_levels[l] != NIL); 354 | let next_l = u16::from_le(*((p.offset(old_levels[l] as isize) as *const u16).offset(l as isize))); 355 | if next_l == next && next != NIL { 356 | // Replace the next one with the next-next-one, at this level. 
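                        // Added sketch of the splice performed at level l:
                        //   before: old_levels[l] -> next -> next_next
                        //   after:  old_levels[l] -> next_next
                        // Levels whose successor is not `next` never
                        // pointed at the deleted binding and stay as they
                        // are.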
357 | let next_next = u16::from_le(*((p.offset(next_l as isize) as *const u16).offset(l as isize))); 358 | debug!("copy {:?}, creating {:?} -> {:?} at level {:?}", 359 | p.page_offset(), 360 | old_levels[l], 361 | next_next, l); 362 | *((p.offset(old_levels[l] as isize) as *mut u16).offset(l as isize)) = 363 | next_next.to_le() 364 | } else { 365 | debug!("copy {:?}, no link at level {:?}, old_levels[l]={:?}, next_l={:?}, next={:?}", 366 | p.page_offset(), 367 | l, old_levels[l], next_l, next); 368 | } 369 | } 370 | } 371 | if translate_right > 0 { 372 | // Translate the right page. 373 | *((p.offset(old_levels[0] as isize) as *mut u64).offset(2)) = translate_right.to_le(); 374 | } 375 | Ok(p) 376 | } 377 | } 378 | } 379 | } 380 | 381 | 382 | #[cfg(test)] 383 | fn test_insert(value_size:usize) { 384 | extern crate tempdir; 385 | extern crate rand; 386 | extern crate env_logger; 387 | use super::{Env, Transaction}; 388 | 389 | use rand::{Rng}; 390 | let mut rng = rand::thread_rng(); 391 | 392 | env_logger::init().unwrap_or(()); 393 | let dir = tempdir::TempDir::new("pijul").unwrap(); 394 | let env = Env::new(dir.path(), 1000).unwrap(); 395 | let mut txn = env.mut_txn_begin().unwrap(); 396 | 397 | let mut page = txn.alloc_page().unwrap(); 398 | page.init(); 399 | 400 | let mut random:Vec<(String,String)> = Vec::new(); 401 | 402 | for i in 0..200 { 403 | println!("i={:?}", i); 404 | let key: String = rng 405 | .gen_ascii_chars() 406 | .take(200) 407 | .collect(); 408 | println!("key = {:?}", key); 409 | let value: String = rng 410 | .gen_ascii_chars() 411 | .take(value_size) 412 | .collect(); 413 | { 414 | let key = key.as_bytes(); 415 | let value = value.as_bytes(); 416 | let value = if value.len() > VALUE_SIZE_THRESHOLD { 417 | alloc_value(&mut txn,value).unwrap() 418 | } else { 419 | UnsafeValue::S { p:value.as_ptr(), len:value.len() as u32 } 420 | }; 421 | 422 | match insert(&mut rng, &mut txn, Cow::from_mut_page(page), key, value, 0, false) { 423 | Ok(Res::Ok { page:page_,.. }) => { 424 | page = page_ 425 | }, 426 | Ok(Res::Nothing { page:page_ }) => { 427 | //println!("already present"); 428 | page = page_.unwrap_mut() 429 | }, 430 | Ok(x) => { 431 | page = root_split(&mut rng, &mut txn, x).unwrap() 432 | }, 433 | _ => panic!("") 434 | } 435 | 436 | let db = Db { root_num: -1, root: page.page_offset() }; 437 | debug!("debugging"); 438 | txn.debug(&[&db], format!("/tmp/after_{}",i), false, false); 439 | for &(ref key, _) in random.iter() { 440 | assert!(txn.get(&db, key.as_bytes(), None).is_some()) 441 | } 442 | 443 | } 444 | random.push((key,value)); 445 | } 446 | 447 | let db = Db { root_num: -1, root: page.page_offset() }; 448 | txn.debug(&[&db], format!("/tmp/debug"), false, false); 449 | for &(ref key, _) in random.iter() { 450 | assert!(txn.get(&db, key.as_bytes(), None).is_some()) 451 | } 452 | } 453 | 454 | 455 | #[test] 456 | fn test_insert_small() { 457 | test_insert(50) 458 | } 459 | 460 | #[test] 461 | fn test_insert_large() { 462 | test_insert(2000) 463 | } 464 | 465 | 466 | 467 | /// Changes the value of levels and eq, so that all items in levels are offsets to the largest entry in the list strictly smaller than (key,value). 
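// Added example: on a page holding the keys [b, d, f], a search for key e
// leaves every entry of `levels` at the offset of d (the largest binding
// strictly smaller than e) and `eq` stays false; a search for key d with
// `value == None` stops at b and sets `eq = true`.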
468 | pub fn set_levels(txn:&MutTxn, page:&P, key:&[u8], value:Option<UnsafeValue>, levels:&mut [u16], eq:&mut bool) {
469 |     let mut level = N_LEVELS-1;
470 |     let mut current_off = FIRST_HEAD;
471 |     let mut current = page.offset(FIRST_HEAD as isize) as *const u16;
472 |     let mut last_compared_offset = 0;
473 |     loop {
474 |         // advance in the list until there's nothing more to do.
475 |         loop {
476 |             let next = u16::from_le(unsafe { *(current.offset(level as isize)) }); // next in the list at the current level.
477 |             //println!("first loop, next = {:?}", next);
478 |             if next == NIL {
479 |                 debug!("next=NIL, current_off={:?}", current_off);
480 |                 levels[level] = current_off;
481 |                 break
482 |             } else {
483 |                 debug_assert!(next!=0);
484 |                 if next == last_compared_offset {
485 |                     // We're going to get the same result as last
486 |                     // time, and this wasn't Ordering::Greater. If it
487 |                     // was Ordering::Equal, we already set eq.
488 |                     break
489 |                 } else {
490 |                     last_compared_offset = next;
491 |                     let next_ptr = page.offset(next as isize);
492 |                     let (next_key,next_value) = unsafe { read_key_value(next_ptr) };
493 |                     // debug!("compare: {:?} {:?}", std::str::from_utf8(key), std::str::from_utf8(next_key));
494 |                     match key.cmp(next_key) {
495 |                         Ordering::Less => break,
496 |                         Ordering::Equal =>
497 |                             if let Some(value) = value {
498 |                                 /*if cfg!(test) {
499 |                                     unsafe {
500 |                                         if (Value::from_unsafe(&value, txn)).cmp(Value::from_unsafe(&next_value, txn)) != Ordering::Equal {
501 |                                             debug!("differ on value {:?}", next_value);
502 |                                             let mut s0 = Vec::new();
503 |                                             for i in Value::from_unsafe(&value, txn) {
504 |                                                 s0.extend(i)
505 |                                             }
506 |                                             let mut s1 = Vec::new();
507 |                                             for i in Value::from_unsafe(&next_value, txn) {
508 |                                                 s1.extend(i)
509 |                                             }
510 |                                             debug!("{:?}", std::str::from_utf8(&s0));
511 |                                             debug!("{:?}", std::str::from_utf8(&s1));
512 |                                         }
513 |                                     }
514 |                                 }*/
515 |                                 match unsafe { (Value::from_unsafe(&value, txn)).cmp(Value::from_unsafe(&next_value, txn)) } {
516 |                                     Ordering::Less => break,
517 |                                     Ordering::Equal => {
518 |                                         *eq = true;
519 |                                         break
520 |                                     },
521 |                                     Ordering::Greater => {
522 |                                         current_off = next;
523 |                                         current = page.offset(current_off as isize) as *const u16;
524 |                                     }
525 |                                 }
526 |                             } else {
527 |                                 // If no value was given, stop at the smallest matching binding, hence here.
528 |                                 *eq = true;
529 |                                 break
530 |                             },
531 |                         Ordering::Greater => {
532 |                             current_off = next;
533 |                             current = page.offset(current_off as isize) as *const u16;
534 |                         }
535 |                     }
536 |                 }
537 |             }
538 |         }
539 |         levels[level] = current_off;
540 |         if level == 0 {
541 |             break
542 |         } else {
543 |             level -= 1;
544 |             levels[level] = levels[level+1]
545 |         }
546 |     }
547 | }
548 | 
549 | 
550 | 
551 | pub fn insert<R:Rng, T>(rng:&mut R, txn:&mut MutTxn<T>, page:Cow, key:&[u8], value:UnsafeValue, right_page:u64, parent_will_be_dup:bool) -> Result<Res, Error> {
552 |     debug!("insert page = {:?}", page.page_offset());
553 |     let mut eq = false;
554 |     let mut levels = [0;N_LEVELS];
555 |     set_levels(txn, &page, key, Some(value), &mut levels[..], &mut eq);
556 |     debug!("levels={:?}", levels);
557 |     if eq {
558 |         Ok(Res::Nothing{page:page})
559 |     } else {
560 |         let child_page = page.right_child(levels[0]);
561 |         let page_rc = get_rc(txn, page.page_offset());
562 |         let page_will_be_dup = parent_will_be_dup || (page_rc > 1);
563 |         debug!("page_rc = {:?} {:?}", parent_will_be_dup, page_rc);
564 |         if child_page > 0 && right_page == 0 {
565 |             debug!("inserting in child page {:?}", child_page);
566 |             // Insert in the page below.
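            // Added note: nothing is copied on the way down to the leaves;
            // the child is rewritten first, and only then is this page
            // patched (or copied) so that its binding points at the
            // child's new offset; see the comment on `copy_page` above.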
567 | let next_page = txn.load_cow_page(child_page); 568 | 569 | match try!(insert(rng, txn, next_page, key, value, right_page, page_will_be_dup)) { 570 | Res::Nothing{..} => Ok(Res::Nothing { page:page }), 571 | Res::Ok { page:next_page } => { 572 | debug!("Child returned ok: {:?}", next_page); 573 | 574 | // The page below was updated. Update the reference in the current page 575 | let mut new_levels = [0;N_LEVELS]; 576 | 577 | if !page_will_be_dup { 578 | let page = try!(cow_pinpointing(rng, txn, page, &levels[..], &mut new_levels[..], false, false, 579 | next_page.page_offset())); 580 | Ok(Res::Ok { page:page }) 581 | } else { 582 | // Decrement the counter for the first page with RC>1 on the path from the root. 583 | if !parent_will_be_dup && page_rc > 1 { 584 | try!(decr_rc(rng, txn, page.page_offset())) 585 | } 586 | let page = 587 | try!(copy_page(rng, txn, &page.as_page(), &levels[..], &mut new_levels[..], false, false, 588 | next_page.page_offset(), true)); 589 | Ok(Res::Ok { page: page }) 590 | } 591 | }, 592 | Res::Split { key_ptr,key_len,value:value_,left,right,free_page } => { 593 | debug_assert!(free_page == child_page || free_page == 0); 594 | // The page below split. Update the child to the 595 | // left half of the split, and insert the middle 596 | // element returned by the split in the current 597 | // page. 598 | 599 | // Now reinsert the element here. 600 | let key_ = unsafe {std::slice::from_raw_parts(key_ptr, key_len)}; 601 | let result = unsafe { 602 | full_local_insert(rng, txn, page, key_, value_, right.page_offset(), 603 | &mut levels, left.page_offset(), parent_will_be_dup, 604 | page_will_be_dup) 605 | }; 606 | if !page_will_be_dup && free_page > 0 { 607 | try!(free(rng, txn, free_page)); 608 | } 609 | result 610 | }, 611 | Res::Underfull {..} => unreachable!() 612 | } 613 | } else { 614 | debug!("inserting here"); 615 | // No child page, insert on this page. 616 | unsafe { 617 | full_local_insert(rng, txn, page, key, value, right_page, &mut levels, 0, parent_will_be_dup, page_will_be_dup) 618 | } 619 | } 620 | } 621 | } 622 | 623 | pub unsafe fn full_local_insert(rng:&mut R, txn:&mut MutTxn, page:Cow, key:&[u8], value:UnsafeValue, right_page:u64, levels:&mut [u16], left_page:u64, parent_will_be_dup: bool, page_will_be_dup:bool) -> Result { 624 | let size = record_size(key.len(), value.len() as usize); 625 | let mut new_levels = [0;N_LEVELS]; 626 | if !page_will_be_dup { 627 | 628 | let off = page.can_alloc(size); 629 | if off > 0 { 630 | let (mut page,off) = 631 | if off + size < PAGE_SIZE as u16 && get_rc(txn, page.page_offset()) <= 1 { 632 | // No need to copy nor compact the page, the value can be written right away. 633 | (try!(cow_pinpointing(rng, txn, page, &levels, &mut new_levels, 634 | false, false, left_page)), 635 | off) 636 | } else { 637 | // Here, we need to compact the page, which is equivalent to considering it non mutable and CoW it. 
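                // Added note on `can_alloc` as used here: it returns 0 when
                // the record cannot fit even after compaction (the caller
                // must split), and a non-zero offset otherwise. This branch
                // is taken when the tail free space is too fragmented
                // (off + size >= PAGE_SIZE) or the page is shared
                // (get_rc > 1): the copy below both compacts the page and
                // gives this transaction its own version of it.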
638 | 639 | let page = try!(cow_pinpointing(rng, txn, page.as_nonmut(), 640 | &levels[..], 641 | &mut new_levels[..], false, false, 642 | left_page)); 643 | let off = page.can_alloc(size); 644 | (page, off) 645 | }; 646 | local_insert_at(rng, &mut page, key, value, right_page, 647 | off, size, &mut new_levels[..]); 648 | Ok(Res::Ok { page:page }) 649 | } else { 650 | debug!("splitting, key = {:?}", std::str::from_utf8(key)); 651 | if left_page > 0 { 652 | Ok(try!(split_page(rng, txn, &page, key, value, right_page, page_will_be_dup, NIL, levels[0], left_page))) 653 | } else { 654 | Ok(try!(split_page(rng, txn, &page, key, value, right_page, page_will_be_dup, NIL, NIL, 0))) 655 | } 656 | } 657 | 658 | } else { 659 | if !parent_will_be_dup { 660 | try!(decr_rc(rng, txn, page.page_offset())) 661 | } 662 | let off = page.can_alloc(size); 663 | if off > 0 { 664 | let p = txn.load_page(page.page_offset()); 665 | let mut page = try!(copy_page(rng, txn, &p, levels, &mut new_levels, false, false, left_page, true)); 666 | local_insert_at(rng, &mut page, key, value, right_page, 667 | off, size, &mut new_levels[..]); 668 | Ok(Res::Ok { page:page }) 669 | } else { 670 | debug!("splitting, key = {:?}", std::str::from_utf8(key)); 671 | if left_page > 0 { 672 | Ok(try!(split_page(rng, txn, &page, key, value, right_page, page_will_be_dup, NIL, levels[0], left_page))) 673 | } else { 674 | Ok(try!(split_page(rng, txn, &page, key, value, right_page, page_will_be_dup, NIL, NIL, 0))) 675 | } 676 | } 677 | } 678 | } 679 | 680 | 681 | 682 | /// If the "levels" (pointers to the current elements of each of the 683 | /// lists) are known, allocate an element of size size at offset off, 684 | /// updates the lists on the page, and update the levels accordingly. 685 | pub fn local_insert_at(rng:&mut R, page:&mut MutPage, key:&[u8], value:UnsafeValue, right_page:u64, off:u16, size:u16, levels:&mut [u16]) { 686 | debug!("entering local_insert_at"); 687 | debug_assert!(off + size <= PAGE_SIZE as u16); 688 | page.reset_pointers(off); 689 | page.alloc_key_value(off, size, key.as_ptr(), key.len(), value); 690 | page.set_right_child(off, right_page); 691 | for i in 0..N_LEVELS { 692 | let next = page.level(levels[i], i); 693 | debug!("{:?} levels[{:?}]={:?}, next={:?}", page.page_offset(), i, levels[i], next); 694 | // debug_assert!(next != 0); 695 | /*if let UnsafeValue::O { ref offset,.. } = value { 696 | debug!("local_insert_at: UnsafeValue::O {:?}", offset); 697 | }*/ 698 | page.set_level(off, i, next); 699 | // *((page.offset(off as isize) as *mut u16).offset(i as isize)) = next; 700 | page.set_level(levels[i], i, off); 701 | // *((page.offset(levels[i] as isize) as *mut u16).offset(i as isize)) = off.to_le(); 702 | debug!("local_insert_at: link from {:?}.{:?} to {:?}, at level {:?}", page.page_offset(), levels[i], off, i); 703 | levels[i] = off; 704 | if rng.gen() { 705 | break 706 | } 707 | } 708 | debug!("exiting local_insert_at"); 709 | } 710 | 711 | 712 | /// The arguments to split_page are non-trivial. This function takes a 713 | /// page and an element to insert, too large to fit in the page. It 714 | /// splits the page, inserts the new element, and returns the middle 715 | /// element of the split as a Res::Split { .. }. 716 | /// 717 | /// Moreover, this function guarantees that before reinserting the 718 | /// binding given as argument, each of the two sides of the split can 719 | /// hold at least two more bindings (this is required for deletions). 
720 | pub unsafe fn split_page(rng:&mut R, txn:&mut MutTxn,page:&Cow, 721 | // (key, value, right_page) of the record to insert. 722 | key:&[u8], value:UnsafeValue, right_page:u64, 723 | // Sometimes, a split propagates upwards: 724 | // more precisely, inserting the middle 725 | // element into the page upwards causes it 726 | // to split. If the page upwards was 727 | // non-mutable, we could not write the 728 | // page to the left of the middle element 729 | // before the split (without copying the 730 | // whole soon-to-be-freed page, of 731 | // course). translate_index and 732 | // translate_right_page are meant for this 733 | // purpose: the pointer to the page that 734 | // split is "translated" to a pointer to the 735 | // left page of the split. 736 | page_will_be_dup:bool, 737 | forgetting:u16, 738 | translate_index:u16, translate_right_page:u64)->Result { 739 | 740 | debug!("split {:?} {:?}", page.page_offset(), page_will_be_dup); 741 | debug!("split {:?}", std::str::from_utf8(key)); 742 | let mut left = try!(txn.alloc_page()); 743 | left.init(); 744 | let mut right = try!(txn.alloc_page()); 745 | right.init(); 746 | debug!("split allocated {:?} {:?}", left.page_offset(), right.page_offset()); 747 | *((left.offset(FIRST_HEAD as isize) as *mut u64).offset(2)) = 748 | if translate_index == 0 { 749 | translate_right_page.to_le() 750 | } else { 751 | let r = u64::from_le(*((page.offset(FIRST_HEAD as isize) as *const u64).offset(2))); 752 | if page_will_be_dup && r > 0 { try!(incr_rc(rng, txn, r)) } 753 | r.to_le() 754 | }; 755 | 756 | // Loop through the values of the page, in order, and insert them to left in order. 757 | // Stop whenever both pages can include one extra entry after inserting the input entry to this function. 758 | 759 | let mut left_bytes = 24; 760 | let mut left_levels = [FIRST_HEAD;N_LEVELS]; 761 | let mut right_levels = [FIRST_HEAD;N_LEVELS]; 762 | let mut middle = None; 763 | 764 | let mut extra_on_lhs = false; 765 | 766 | for (current, key_, value_, r) in PageIterator::new(page,0) { 767 | debug!("split key_ = {:?} {:?}", current, std::str::from_utf8(key_)); 768 | if current == forgetting { 769 | // Only used in rebalance, which already frees values. 770 | /*if !page_will_be_dup { 771 | if let UnsafeValue::O { offset, len } = value_ { 772 | try!(free_value(rng, txn, offset, len)); 773 | } 774 | }*/ 775 | continue 776 | } 777 | let r = if current == translate_index { 778 | translate_right_page 779 | } else { 780 | if page_will_be_dup && r > 0 { 781 | try!(incr_rc(rng, txn, r)) 782 | } 783 | r 784 | }; 785 | if page_will_be_dup { 786 | if let UnsafeValue::O { offset, .. } = value_ { 787 | try!(incr_rc(rng, txn, offset)) 788 | } 789 | } 790 | let next_size = record_size(key_.len(),value_.len() as usize); 791 | if middle.is_none() { // Insert in left page. 792 | if left_bytes + next_size <= (PAGE_SIZE as u16) / 2 { 793 | // insert in left page. 794 | let off = left.can_alloc(next_size); 795 | local_insert_at(rng, &mut left, key_, value_, r, off, next_size, &mut left_levels); 796 | left_bytes += next_size; 797 | } else { 798 | // Maybe we won't insert the new key here, in which case we can go one more step. 
799 | if left_bytes <= (PAGE_SIZE as u16) / 2 { 800 | extra_on_lhs = match key.cmp(key_) { 801 | Ordering::Less => true, 802 | Ordering::Greater => false, 803 | Ordering::Equal => 804 | match (Value::from_unsafe(&value, txn)).cmp(Value::from_unsafe(&value_, txn)) { 805 | Ordering::Less | Ordering::Equal => true, 806 | Ordering::Greater => false 807 | } 808 | }; 809 | debug!("one more key ? {:?}", extra_on_lhs); 810 | if !extra_on_lhs { 811 | // The next key is larger than all elements on 812 | // the left page, but smaller than the extra key. 813 | // This is the separator. 814 | middle = Some((key_.as_ptr(),key_.len(),value_,r)) 815 | } else { 816 | // We insert the extra key on the left-hand side now. and save (key_,value_) for later. 817 | let mut levels = [0;N_LEVELS]; 818 | let mut eq = false; 819 | set_levels(txn, &left, key, Some(value), &mut levels[..], &mut eq); 820 | 821 | let size = record_size(key.len(), value.len() as usize); 822 | let off = left.can_alloc(size); 823 | local_insert_at(rng, &mut left, key, value, right_page, off, size, &mut levels); 824 | left_bytes += size; 825 | middle = Some((key_.as_ptr(),key_.len(),value_,r)) 826 | } 827 | } else { 828 | middle = Some((key_.as_ptr(),key_.len(),value_,r)) 829 | } 830 | } 831 | } else { 832 | // insert in right page. 833 | let off = right.can_alloc(next_size); 834 | local_insert_at(rng, &mut right, key_, value_, r, off, next_size, &mut right_levels); 835 | } 836 | } 837 | 838 | // If the extra entry was not added to the left-hand side, add it to the right-hand side. 839 | debug!("extra_on_lhs: {:?}", extra_on_lhs); 840 | if !extra_on_lhs { 841 | 842 | if cfg!(test) { 843 | if let Some((key_ptr, key_len, _, _)) = middle { 844 | // check that we're inserting on the right side. 845 | let key_ = std::slice::from_raw_parts(key_ptr, key_len); 846 | debug_assert!( key >= key_ ) 847 | } 848 | } 849 | 850 | let mut levels = [0;N_LEVELS]; 851 | let mut eq = false; 852 | set_levels(txn, &right, key, Some(value), &mut levels[..], &mut eq); 853 | 854 | let size = record_size(key.len(), value.len() as usize); 855 | let off = right.can_alloc(size); 856 | local_insert_at(rng, &mut right, key, value, right_page, off, size, &mut levels); 857 | } 858 | if let Some((key_ptr, key_len, value_, right_child)) = middle { 859 | *((right.offset(FIRST_HEAD as isize) as *mut u64).offset(2)) = right_child.to_le(); 860 | Ok(Res::Split { 861 | key_ptr: key_ptr, 862 | key_len: key_len, 863 | value: value_, 864 | left: left, 865 | right: right, 866 | free_page: if page_will_be_dup { 0 } else { page.page_offset() } 867 | }) 868 | } else { 869 | unreachable!() 870 | } 871 | } 872 | 873 | 874 | 875 | // This function deals with the case where the main page split, either during insert, or during delete. 
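// The `put` entry point at the end of this file ties the pieces together.
// Added, hypothetical usage sketch (the function name is invented; the
// setup mirrors `test_insert` above, and `mut_txn_begin`, `create_db` and
// `get` are the entry points already used in this file):
#[cfg(test)]
fn put_example() {
    extern crate tempdir;
    extern crate rand;
    use super::{Env, Transaction};
    let mut rng = rand::thread_rng();
    let dir = tempdir::TempDir::new("pijul").unwrap();
    let env = Env::new(dir.path(), 1000).unwrap();
    let mut txn = env.mut_txn_begin().unwrap();
    let mut db = txn.create_db().unwrap();
    // A fresh binding: `put` returns Ok(true).
    assert!(put(&mut rng, &mut txn, &mut db, b"key", b"value").unwrap());
    // Re-inserting the same binding is a no-op and returns Ok(false).
    assert!(!put(&mut rng, &mut txn, &mut db, b"key", b"value").unwrap());
    assert!(txn.get(&db, b"key", None).is_some());
}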
876 | pub fn root_split(rng:&mut R, txn: &mut MutTxn, x:Res) -> Result { 877 | debug!("ROOT SPLIT"); 878 | if let Res::Split { left,right,key_ptr,key_len,value,free_page } = x { 879 | let mut page = try!(txn.alloc_page()); 880 | page.init(); 881 | page.set_right_child(FIRST_HEAD, left.page_offset()); 882 | let mut levels = [0;N_LEVELS]; 883 | let size = record_size(key_len, value.len() as usize); 884 | let off = page.can_alloc(size); 885 | let key = unsafe { std::slice::from_raw_parts(key_ptr, key_len) }; 886 | local_insert_at(rng, &mut page, key, value, right.page_offset(), off, size, &mut levels); 887 | debug!("root split, freeing {:?}", free_page); 888 | try!(free(rng, txn, free_page)); 889 | Ok(page) 890 | } else { 891 | unreachable!() 892 | } 893 | } 894 | 895 | 896 | pub fn put(rng:&mut R, txn: &mut MutTxn, db: &mut Db, key: &[u8], value: &[u8])->Result { 897 | assert!(key.len() < MAX_KEY_SIZE); 898 | let root_page = Cow { cow: txn.txn.load_cow_page(db.root) }; 899 | let value = if value.len() > VALUE_SIZE_THRESHOLD { 900 | try!(alloc_value(txn,value)) 901 | } else { 902 | UnsafeValue::S { p:value.as_ptr(), len:value.len() as u32 } 903 | }; 904 | debug!("key = {:?}", std::str::from_utf8(key)); 905 | unsafe { debug!("value = {:?}", Value::from_unsafe(&value, txn)) } 906 | match try!(insert(rng, txn, root_page, key, value, 0, false)) { 907 | Res::Nothing { .. } => Ok(false), 908 | Res::Ok { page,.. } => { db.root = page.page_offset(); Ok(true) } 909 | x => { 910 | db.root = try!(root_split(rng,txn,x)).page_offset(); 911 | Ok(true) 912 | } 913 | } 914 | } 915 | -------------------------------------------------------------------------------- /src/txn.rs: -------------------------------------------------------------------------------- 1 | use super::transaction; 2 | use std; 3 | use std::path::Path; 4 | use super::transaction::{PAGE_SIZE,PAGE_SIZE_16, PAGE_SIZE_64}; 5 | use std::fs::File; 6 | use std::io::BufWriter; 7 | use std::collections::HashSet; 8 | use std::ptr::copy_nonoverlapping; 9 | use std::io::Write; 10 | use std::fmt; 11 | use std::cmp::Ordering; 12 | #[cfg(debug_assertions)] 13 | use rustc_serialize::hex::ToHex; 14 | 15 | // Guarantee: there are at least 4 bindings per page. 16 | const BINDING_HEADER_SIZE: usize = 16; // each binding on B tree pages requires 16 bytes of header. 17 | 18 | pub const MAX_KEY_SIZE: usize = (PAGE_SIZE >> 3); 19 | pub const VALUE_SIZE_THRESHOLD: usize = (PAGE_SIZE >> 3) - BINDING_HEADER_SIZE - 6; // 6 is the page header size (24) divided by 4. 20 | 21 | pub const NIL:u16 = 0xffff; 22 | pub const FIRST_HEAD:u16 = 0; 23 | pub const N_LEVELS:usize = 5; 24 | pub const VALUE_HEADER_LEN:usize = 8; 25 | 26 | #[derive(Debug)] 27 | /// A database identifier. A `Db` can be reused in any number of transactions belonging to the same environment. 
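/// A `Db` is just the offset of its root page (plus the root slot it
/// was loaded from), so it is cheap to copy and to store: a child
/// database saved inside another database is nothing more than this
/// offset written as a little-endian u64 (see `from_value` below and
/// `open_db_` in this file). A minimal usage sketch, hedged: it reuses
/// the `Env`/`mut_txn_begin`/`create_db` surface seen elsewhere in this
/// crate, `put` from src/put.rs, and assumes a `commit` method that is
/// not shown in this file.
///
/// ```ignore
/// // A sketch only; `commit` is assumed, not shown in this file.
/// let dir = tempdir::TempDir::new("example").unwrap();
/// let env = Env::new(dir.path(), 1000).unwrap(); // as in the tests in del.rs
/// let mut rng = rand::thread_rng();
/// let mut txn = env.mut_txn_begin().unwrap();
/// let mut db: Db = txn.create_db().unwrap();     // fresh empty root page
/// put(&mut rng, &mut txn, &mut db, b"key", b"value").unwrap();
/// txn.commit().unwrap();
/// ```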
28 | pub struct Db { 29 | #[doc(hidden)] 30 | pub root: u64, 31 | #[doc(hidden)] 32 | pub root_num: isize 33 | } 34 | 35 | impl Db { 36 | pub unsafe fn clone(&self) -> Db { 37 | Db { root:self.root, root_num:self.root_num } 38 | } 39 | pub unsafe fn from_value(v:&[u8]) -> Db { 40 | let root = u64::from_le(*(v.as_ptr() as *const u64)); 41 | Db { root:root, root_num: -1 } 42 | } 43 | } 44 | 45 | 46 | /// Mutable transaction 47 | pub struct MutTxn<'env,T> { 48 | #[doc(hidden)] 49 | pub txn: transaction::MutTxn<'env,T>, 50 | #[doc(hidden)] 51 | pub protected_pages: [u64;2], 52 | #[doc(hidden)] 53 | pub free_protected: [bool;2] 54 | } 55 | 56 | impl<'env,T> Drop for MutTxn<'env,T> { 57 | fn drop(&mut self) { 58 | debug!("dropping muttxn"); 59 | std::mem::drop(&mut self.txn) 60 | } 61 | } 62 | 63 | 64 | /// Immutable transaction 65 | pub struct Txn<'env> { 66 | pub txn: transaction::Txn<'env>, 67 | } 68 | 69 | type Error = transaction::Error; 70 | 71 | const REFERENCE_COUNTS:isize = 0; 72 | // pub const MAIN_ROOT:usize = 1; 73 | 74 | impl<'env,T> MutTxn<'env,T> { 75 | #[doc(hidden)] 76 | pub fn alloc_page(&mut self) -> Result { 77 | let page = try!(self.txn.alloc_page()); 78 | // debug!("txn.alloc_page: {:?}", page.offset); 79 | Ok(MutPage { page: page }) 80 | } 81 | #[doc(hidden)] 82 | pub fn load_cow_page(&mut self, off: u64) -> Cow { 83 | Cow { cow: self.txn.load_cow_page(off) } 84 | } 85 | #[doc(hidden)] 86 | pub fn set_rc(&mut self, db:Db) { 87 | self.txn.set_root(REFERENCE_COUNTS, db.root) 88 | } 89 | 90 | 91 | #[cfg(debug_assertions)] 92 | #[doc(hidden)] 93 | pub fn debug>(&self, db: &[&Db], p: P, keys_hex:bool, values_hex:bool) { 94 | debug(self, db, p, keys_hex, values_hex) 95 | } 96 | #[cfg(debug_assertions)] 97 | #[doc(hidden)] 98 | pub fn debug_concise>(&self, db: &[&Db], p: P) { 99 | debug_concise(self, db, p) 100 | } 101 | } 102 | 103 | impl<'env> Txn<'env> { 104 | #[cfg(test)] 105 | #[doc(hidden)] 106 | pub fn debug>(&self, db: &[&Db], p: P, keys_hex:bool, values_hex:bool) { 107 | debug(self, db, p, keys_hex, values_hex) 108 | } 109 | #[cfg(debug_assertions)] 110 | #[doc(hidden)] 111 | pub fn debug_concise>(&self, db: &[&Db], p: P) { 112 | debug_concise(self, db, p) 113 | } 114 | } 115 | 116 | 117 | /// The following structure is meant to iterate through the skip list 118 | /// in a page. More specifically, it goes through all bindings at the 119 | /// specified level. 
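///
/// Level 0 is the dense list linking every binding of the page in key
/// order, so `PageIterator::new(&page, 0)` visits all of them; higher
/// levels are the sparser skip-list lanes used to cut searches short.
/// Each item is `(offset, key, value, right_child)` for one binding.
/// A minimal sketch of the usual pattern, as in `split_page` and
/// `drop_page`:
///
/// ```ignore
/// for (off, key, value, right_child) in PageIterator::new(&page, 0) {
///     // `key` is a slice borrowed from the page, `value` an
///     // UnsafeValue, and `right_child` is 0 when there is no page
///     // below this binding.
///     debug!("{:?} {:?} {:?} {:?}", off, key.len(), value.len(), right_child);
/// }
/// ```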
120 | #[doc(hidden)] 121 | pub struct PageIterator<'a,P:super::txn::P + 'a> { 122 | pub page:&'a P, 123 | pub level:usize, 124 | pub current:u16 125 | } 126 | impl<'a,P:super::txn::P + 'a> PageIterator<'a,P> { 127 | #[doc(hidden)] 128 | pub fn new(page:&'a P, level:usize) -> Self { 129 | unsafe { 130 | // Skip the first pointer (has no key/value) 131 | let current = u16::from_le(*(page.offset(FIRST_HEAD as isize) as *const u16)); 132 | PageIterator { page:page, level:level, current:current } 133 | } 134 | } 135 | } 136 | impl<'a,P:super::txn::P + 'a> Iterator for PageIterator<'a,P> { 137 | type Item = (u16, &'a [u8], UnsafeValue, u64); 138 | fn next(&mut self) -> Option { 139 | if self.current == NIL { 140 | None 141 | } else { 142 | unsafe { 143 | let current = self.current; 144 | let (key,value) = read_key_value(self.page.offset(self.current as isize)); 145 | let right_child = u64::from_le(*((self.page.offset(self.current as isize) as *const u64).offset(2))); 146 | self.current = u16::from_le(*(self.page.offset(self.current as isize) as *const u16)); 147 | Some((current,key,value,right_child)) 148 | } 149 | } 150 | } 151 | } 152 | 153 | 154 | #[derive(Clone,Copy,Debug)] 155 | pub enum UnsafeValue { 156 | S { p:*const u8, 157 | len:u32 }, 158 | O { offset: u64, 159 | len: u32 } 160 | } 161 | 162 | /// Iterator over parts of a value. On values of size at most 4096 bytes, the iterator will run exactly once. On larger values, it returns all parts of the value, in order. 163 | #[derive(Clone)] 164 | pub enum Value<'a,T:'a> { 165 | S { p:*const u8, 166 | len:u32 }, 167 | O { txn:&'a T, 168 | offset: u64, 169 | len: u32 } 170 | } 171 | 172 | impl <'a,T:LoadPage>fmt::Debug for Value<'a,T> { 173 | fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { 174 | // let it = Value { txn:self.txn, value:self.value.clone() }; 175 | let it:Value<_> = self.clone(); 176 | try!(write!(f,"Value ({:?}) {{ value: [", self.len())); 177 | let mut first = true; 178 | for x in it { 179 | if !first { 180 | try!(write!(f, ", {:?}", std::str::from_utf8(x))) 181 | } else { 182 | try!(write!(f, "{:?}", std::str::from_utf8(x))); 183 | first = false; 184 | } 185 | } 186 | try!(write!(f,"] }}")); 187 | Ok(()) 188 | } 189 | } 190 | impl <'a,T:LoadPage> Iterator for Value<'a,T> { 191 | type Item = &'a [u8]; 192 | fn next(&mut self)->Option<&'a [u8]> { 193 | match self { 194 | &mut Value::O { ref txn, ref mut offset, ref mut len } => { 195 | debug!("iterator: {:?}, {:?}", offset, len); 196 | if *len == 0 { 197 | None 198 | } else { 199 | if *len <= PAGE_SIZE as u32 { 200 | unsafe { 201 | let page = txn.load_page(*offset).offset(0); 202 | let slice=std::slice::from_raw_parts(page.offset(0), *len as usize); 203 | *len = 0; 204 | Some(slice) 205 | } 206 | } else { 207 | unsafe { 208 | let page = txn.load_page(*offset).offset(0); 209 | // change the pointer of "current page" to the next page 210 | *offset = u64::from_le(*(page as *const u64)); 211 | let l = PAGE_SIZE-VALUE_HEADER_LEN; 212 | *len -= l as u32; 213 | Some(std::slice::from_raw_parts(page.offset(VALUE_HEADER_LEN as isize), l as usize)) 214 | } 215 | } 216 | } 217 | }, 218 | &mut Value::S{ref mut p,ref mut len} => { 219 | if *len == 0 { 220 | None 221 | } else { 222 | if *len <= PAGE_SIZE as u32 { 223 | let l = *len; 224 | *len = 0; 225 | unsafe { 226 | Some(std::slice::from_raw_parts(*p,l as usize)) 227 | } 228 | } else { 229 | let pp = *p; 230 | unsafe { 231 | let l = PAGE_SIZE - VALUE_HEADER_LEN; 232 | *p = ((*p) as *mut u8).offset(l as isize); 233 | *len -= l as 
u32; 234 | Some(std::slice::from_raw_parts(pp,l as usize)) 235 | } 236 | } 237 | } 238 | } 239 | } 240 | } 241 | } 242 | 243 | 244 | 245 | impl UnsafeValue { 246 | pub fn len(&self) -> u32 { 247 | match self { 248 | &UnsafeValue::S{len,..} => len, 249 | &UnsafeValue::O{len,..} => len, 250 | } 251 | } 252 | } 253 | impl<'a,T> Value<'a,T> { 254 | pub fn len(&self) -> u32 { 255 | match self { 256 | &Value::S{len,..} => len, 257 | &Value::O{len,..} => len, 258 | } 259 | } 260 | 261 | pub fn clone(&self) -> Value<'a,T> { 262 | match self { 263 | &Value::S{ref p, ref len} => Value::S { len:*len, p:*p }, 264 | &Value::O{ref offset, ref len, ref txn} => Value::O { len:*len, offset:*offset, txn:*txn }, 265 | } 266 | } 267 | 268 | pub unsafe fn from_unsafe(u:&UnsafeValue, txn: &'a T) -> Value<'a,T> { 269 | match u { 270 | &UnsafeValue::S{ref p, ref len} => Value::S { len:*len, p:*p }, 271 | &UnsafeValue::O{ref offset, ref len} => Value::O { len:*len, offset:*offset, txn:txn }, 272 | } 273 | } 274 | pub fn from_slice(slice:&'a[u8]) -> Value<'a,T> { 275 | Value::S { p:slice.as_ptr(), len:slice.len() as u32 } 276 | // Value { txn: None, value: UnsafeValue::S { p:slice.as_ptr(), len:slice.len() as u32 } } 277 | } 278 | } 279 | 280 | 281 | // Difference between mutpage and mutpages: mutpages might also contain just one page, but it is unmapped whenever it goes out of scope, whereas P belongs to the main map. Useful for 32-bits platforms. 282 | 283 | 284 | #[derive(Debug)] 285 | pub struct MutPage { 286 | pub page: transaction::MutPage, 287 | } 288 | #[derive(Debug)] 289 | pub struct Page { 290 | pub page: transaction::Page, 291 | } 292 | 293 | 294 | pub unsafe fn read_key_value<'a>(p: *const u8) -> (&'a [u8], UnsafeValue) { 295 | let key_len = u16::from_le(*(p as *const u16).offset(5)); 296 | let val_len = u32::from_le(*(p as *const u32).offset(3)); 297 | 298 | if (val_len as usize) < VALUE_SIZE_THRESHOLD { 299 | let padding = (8 - (val_len & 7)) & 7; 300 | (std::slice::from_raw_parts((p as *const u8).offset((24 + val_len + padding) as isize), key_len as usize), 301 | UnsafeValue::S { p:(p as *const u8).offset(24), len:val_len }) 302 | } else { 303 | (std::slice::from_raw_parts((p as *const u8).offset(32), key_len as usize), 304 | { 305 | let offset = u64::from_le(*((p as *const u64).offset(3))); 306 | UnsafeValue::O { 307 | offset: offset, 308 | len: val_len, 309 | } 310 | }) 311 | } 312 | } 313 | 314 | pub trait LoadPage:Sized { 315 | fn length(&self) -> u64; 316 | 317 | fn root_db_(&self,num:isize) -> Option; 318 | 319 | fn open_db_<'a>(&'a self, root:&Db, key: &[u8]) -> Option { 320 | let page = self.load_page(root.root); 321 | unsafe { 322 | let db = self.get_(page, key, None); 323 | if let Some(UnsafeValue::S{p,..}) = db { 324 | Some(Db { root_num: -1, root: u64::from_le(*(p as *const u64)) }) 325 | } else { 326 | None 327 | } 328 | } 329 | } 330 | 331 | fn load_page(&self, off: u64) -> Page; 332 | 333 | fn get_u64(&self, db: &Db, key: u64) -> Option { 334 | let page = self.load_page(db.root); 335 | self.get_u64_(page, key) 336 | } 337 | 338 | fn get_u64_(&self, page:Page, key: u64) -> Option { 339 | unsafe { 340 | let mut key_:[u8;8] = [0;8]; 341 | *(key_.as_mut_ptr() as *mut u64) = key.to_le(); 342 | self.get_(page, &key_[..], None).and_then( 343 | |x| { 344 | if let UnsafeValue::S { p,.. 
} = x { 345 | Some(u64::from_le(*(p as *const u64))) 346 | } else { 347 | None 348 | } 349 | }) 350 | } 351 | } 352 | 353 | unsafe fn get_(&self, page:Page, key: &[u8], value:Option) -> Option { 354 | debug!("sanakirja::get_"); 355 | //println!("get from page {:?}", page); 356 | let mut current_off = FIRST_HEAD; 357 | let mut current = page.offset(current_off as isize) as *const u16; 358 | let mut level = N_LEVELS-1; 359 | let next_page; 360 | let mut equal:Option = None; 361 | 362 | let mut last_compared_offset = 0; 363 | 364 | loop { 365 | // advance in the list until there's nothing more to do. 366 | loop { 367 | debug!("current = {:?}", current); 368 | let next = u16::from_le(*(current.offset(level as isize))); // next in the list at the current level. 369 | if next == NIL { 370 | break 371 | } else { 372 | if next == last_compared_offset { 373 | // If we didn't move forward in the previous 374 | // list, and we're still comparing with the 375 | // same key/value, this key/value is <= to the 376 | // next one also in this list. 377 | break 378 | } else { 379 | last_compared_offset = next; 380 | let next_ptr = page.offset(next as isize); 381 | let (next_key,next_value) = read_key_value(next_ptr); 382 | debug!("next_value={:?}", next_value); 383 | /*println!("cmp {:?} {:?}", 384 | std::str::from_utf8_unchecked(key), 385 | std::str::from_utf8_unchecked(next_key));*/ 386 | match key.cmp(next_key) { 387 | Ordering::Less => break, 388 | Ordering::Equal => 389 | if let Some(value) = value { 390 | match (Value::from_unsafe(&value, self)).cmp(Value::from_unsafe(&next_value, self)) { 391 | Ordering::Less => break, 392 | Ordering::Equal => { 393 | equal = Some(next_value); 394 | break 395 | }, 396 | Ordering::Greater => { 397 | current_off = next; 398 | current = page.offset(current_off as isize) as *const u16; 399 | } 400 | } 401 | } else { 402 | equal = Some(next_value); 403 | break 404 | }, 405 | Ordering::Greater => { 406 | current_off = next; 407 | current = page.offset(current_off as isize) as *const u16; 408 | } 409 | } 410 | } 411 | } 412 | } 413 | if level == 0 { 414 | next_page = u64::from_le(*((current as *const u64).offset(2))); 415 | break 416 | } else { 417 | level -= 1 418 | } 419 | } 420 | debug!("next_page = {:?}", next_page); 421 | if next_page > 0 { 422 | let next_page_ = self.load_page(next_page); 423 | self.get_(next_page_, key, value).or(equal) 424 | } else { 425 | equal 426 | } 427 | } 428 | 429 | // In iterators, the page stack stores a list of pages from the 430 | // top of the tree down, where each page is stored as a full u64: 431 | // the least significant 12 bits encode the offset in the current 432 | // page, given by the other bits. 433 | unsafe fn iter_<'a,'b>(&'a self, 434 | initial_page: &Page, 435 | key:&[u8], 436 | value:Option) -> Iter<'a, Self> { 437 | 438 | let mut iter = Iter { txn:self, page_stack:[0;52], stack_pointer: 0 }; 439 | // page_stack.clear(); 440 | iter.push(initial_page.page_offset() | (FIRST_HEAD as u64)); 441 | 442 | loop { 443 | let next_page; 444 | { 445 | let (page_offset, current_off):(u64,u16) = offsets(iter.page_stack[iter.stack_pointer-1]); 446 | 447 | let page:Page = self.load_page(page_offset); 448 | let mut current:*const u16 = page.offset(current_off as isize) as *const u16; 449 | let mut level = N_LEVELS-1; 450 | 451 | // First mission: find first element. 452 | loop { 453 | // advance in the list until there's nothing more to do. 454 | // Notice that we never push NIL. 
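            // A worked example of this encoding (a sketch, assuming
            // PAGE_SIZE == 4096): a binding at in-page offset 0x28 on
            // the page starting at byte 0x3000 is pushed as
            // 0x3000 | 0x28 == 0x3028, and `offsets(0x3028)` recovers
            // (0x3000, 0x28) using the mask PAGE_SIZE_64 - 1 == 0xfff.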
455 | loop { 456 | let next = u16::from_le(*(current.offset(level as isize))); // next in the list at the current level. 457 | if next == NIL { 458 | break 459 | } else { 460 | let next_ptr = page.offset(next as isize); 461 | let (next_key,next_value) = read_key_value(next_ptr); 462 | match key.cmp(next_key) { 463 | Ordering::Less => break, 464 | Ordering::Equal => 465 | if let Some(value) = value { 466 | match (Value::from_unsafe(&value, self)).cmp(Value::from_unsafe(&next_value, self)) { 467 | Ordering::Less => break, 468 | Ordering::Equal => break, 469 | Ordering::Greater => { 470 | iter.page_stack[iter.stack_pointer-1] = page_offset | (next as u64); 471 | current = page.offset(next as isize) as *const u16; 472 | } 473 | } 474 | } else { 475 | break 476 | }, 477 | Ordering::Greater => { 478 | iter.page_stack[iter.stack_pointer-1] = page_offset | (next as u64); 479 | current = page.offset(next as isize) as *const u16; 480 | } 481 | } 482 | } 483 | } 484 | if level == 0 { 485 | let next = u16::from_le(*(current.offset(level as isize))); // next in the list at the current level. 486 | iter.page_stack[iter.stack_pointer-1] = page_offset | (next as u64); 487 | next_page = u64::from_le(*((current as *const u64).offset(2))); 488 | break 489 | } else { 490 | level -= 1 491 | } 492 | } 493 | } 494 | if next_page == 0 { 495 | break 496 | } else { 497 | iter.push(next_page | (FIRST_HEAD as u64)); 498 | } 499 | } 500 | iter 501 | } 502 | 503 | fn rc(&self) -> Option; 504 | } 505 | 506 | pub struct Iter<'a, T:'a> { 507 | txn:&'a T, 508 | page_stack:[u64;52], 509 | stack_pointer:usize 510 | } 511 | 512 | impl<'a,T:'a> Iter<'a,T> { 513 | fn push(&mut self, x:u64) { 514 | self.page_stack[self.stack_pointer] = x; 515 | self.stack_pointer += 1 516 | } 517 | fn pop(&mut self) -> u64 { 518 | self.stack_pointer -= 1; 519 | self.page_stack[self.stack_pointer] 520 | } 521 | } 522 | 523 | fn offsets(x:u64) -> (u64, u16) { 524 | let mask:u64 = PAGE_SIZE_64-1; 525 | (x & !mask, (x&mask) as u16) 526 | } 527 | 528 | impl<'a,'b,T:LoadPage+'a> Iterator for Iter<'a, T> { 529 | type Item = (&'a[u8], Value<'a,T>); 530 | fn next(&mut self) -> Option { 531 | if self.stack_pointer == 0 { 532 | None 533 | } else { 534 | unsafe { 535 | let (page_off, current_off):(u64,u16) = offsets(self.page_stack[self.stack_pointer-1]); 536 | // println!("page_off = {:?} {:?}", page_off, current_off); 537 | // the binding at current_off is the next one to be sent. 538 | if current_off >= 4095 { 539 | // println!("pop"); 540 | self.pop(); 541 | self.next() 542 | } else { 543 | let page = self.txn.load_page(page_off); 544 | let current:*const u16 = page.offset(current_off as isize) as *const u16; 545 | 546 | // We set the page stack to the next binding, and return the current one. 547 | 548 | // Move the top of the stack to the next binding. 549 | { 550 | let next = u16::from_le(*(current as *const u16)); 551 | let next = std::cmp::min(next, 4095); // Avoid overflow. 552 | self.page_stack[self.stack_pointer-1] = page_off | (next as u64); 553 | } 554 | // If there's a page below, push it: the next element is there. 555 | let next_page = u64::from_le(*((current as *const u64).offset(2))); 556 | if next_page != 0 { 557 | // println!("push"); 558 | self.push(next_page | (FIRST_HEAD as u64)); 559 | } 560 | 561 | // Now, return the current element. If we're inside the page, there's an element to return. 
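                    // (FIRST_HEAD is the sentinel head of the skip
                    // list: it has level pointers and a leftmost
                    // child, but no key/value of its own, so it is
                    // never returned.)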
562 | if current_off > FIRST_HEAD { 563 | let (key,value) = read_key_value(current as *const u8); 564 | Some((key, Value::from_unsafe(&value, self.txn))) 565 | } else { 566 | // Else, we're at the beginning of the page, 567 | // the element is either in the page we just 568 | // pushed, or (if there's no page below) the 569 | // next element. 570 | self.next() 571 | } 572 | } 573 | } 574 | } 575 | } 576 | } 577 | 578 | 579 | pub trait P:std::fmt::Debug { 580 | /// offset of the page in the file. 581 | fn page_offset(&self) -> u64; 582 | 583 | /// pointer to the first word of the page. 584 | fn data(&self) -> *const u64; 585 | 586 | /// 0 if cannot alloc, valid offset else (offset in bytes from the start of the page) 587 | fn can_alloc(&self, size: u16) -> u16 { 588 | assert!(size & 7 == 0); // 64 bits aligned. 589 | if self.occupied() + size < PAGE_SIZE as u16 { 590 | self.first_free() 591 | } else { 592 | 0 593 | } 594 | } 595 | // First free spot in this page (head of the linked list, number of |u32| from the last glue. 596 | fn first_free(&self) -> u16 { 597 | unsafe { 598 | let first_free = u16::from_le(*(self.p_first_free())); 599 | if first_free > 0 { 600 | first_free 601 | } else { 602 | FIRST_HEAD + 24 603 | } 604 | } 605 | } 606 | fn p_first_free(&self) -> *mut u16 { 607 | unsafe { ((self.data() as *mut u8).offset(FIRST_HEAD as isize + 10) as *mut u16) } 608 | } 609 | 610 | fn occupied(&self) -> u16 { 611 | unsafe { 612 | let occupied = u16::from_le(*(self.p_occupied())); 613 | if occupied > 0 { 614 | occupied 615 | } else { 616 | FIRST_HEAD + 24 617 | } 618 | } 619 | } 620 | fn p_occupied(&self) -> *mut u16 { 621 | unsafe { ((self.data() as *mut u8).offset(FIRST_HEAD as isize + 12) as *mut u16) } 622 | } 623 | 624 | // offset in u32. 625 | fn offset(&self, off: isize) -> *mut u8 { 626 | unsafe { 627 | let p = self.data() as *mut u8; 628 | p.offset(off) 629 | } 630 | } 631 | fn right_child(&self, off:u16) -> u64 { 632 | assert!(off < PAGE_SIZE_16); 633 | unsafe { 634 | u64::from_le(*((self.offset(off as isize) as *const u64).offset(2))) 635 | } 636 | } 637 | fn level(&mut self, off:u16, level:usize) -> u16 { 638 | assert!(off <= PAGE_SIZE_16); 639 | unsafe { 640 | u16::from_le(*((self.offset(off as isize) as *mut u16).offset(level as isize))) 641 | } 642 | } 643 | } 644 | 645 | impl P for Cow { 646 | fn data(&self) -> *const u64 { 647 | match self.cow { 648 | transaction::Cow::Page(ref p) => p.data as *const u64, 649 | transaction::Cow::MutPage(ref p) => p.data as *const u64, 650 | } 651 | } 652 | fn page_offset(&self) -> u64 { 653 | match self.cow { 654 | transaction::Cow::Page(ref p) => p.offset, 655 | transaction::Cow::MutPage(ref p) => p.offset, 656 | } 657 | } 658 | } 659 | 660 | impl P for Page { 661 | fn page_offset(&self) -> u64 { 662 | self.page.offset 663 | } 664 | fn data(&self) -> *const u64 { 665 | self.page.data as *mut u64 666 | } 667 | } 668 | 669 | impl P for MutPage { 670 | fn page_offset(&self) -> u64 { 671 | self.page.offset 672 | } 673 | fn data(&self) -> *const u64 { 674 | self.page.data as *mut u64 675 | } 676 | } 677 | 678 | 679 | impl MutPage { 680 | pub fn init(&mut self) { 681 | debug!("mut page init: {:?}",self); 682 | unsafe { 683 | std::ptr::write_bytes(self.page.data as *mut u8, 0, FIRST_HEAD as usize); 684 | let ptr = (self.page.data as *mut u8).offset(FIRST_HEAD as isize) as *mut u16; 685 | *(ptr as *mut u16) = NIL.to_le(); 686 | *((ptr as *mut u16).offset(1)) = NIL.to_le(); 687 | *((ptr as *mut u16).offset(2)) = NIL.to_le(); 688 | *((ptr as *mut 
u16).offset(3)) = NIL.to_le(); 689 | *((ptr as *mut u16).offset(4)) = NIL.to_le(); 690 | *((ptr as *mut u16).offset(5)) = 0; 691 | *((ptr as *mut u16).offset(6)) = 0; 692 | *((ptr as *mut u16).offset(7)) = 0; 693 | *((ptr as *mut u64).offset(2)) = 0; // next_page 694 | } 695 | } 696 | 697 | /// Takes a size in bytes, returns an offset from the word before 698 | /// the beginning of the contents (0 is invalid, 1 is the first 699 | /// offset). 700 | pub fn alloc(&mut self, first_free: u16, size: u16) { 701 | unsafe { 702 | debug_assert!(size & 7 == 0); // 64 bits aligned. 703 | *(self.p_first_free()) = (first_free + size).to_le(); 704 | } 705 | } 706 | 707 | // allocate and write key, value, left and right neighbors. 708 | pub fn alloc_key_value(&mut self, 709 | off_ptr: u16, 710 | size: u16, 711 | key_ptr:*const u8, 712 | key_len:usize, 713 | value: UnsafeValue) { 714 | unsafe { 715 | *(self.p_occupied()) = (self.occupied() + size).to_le(); 716 | self.alloc(off_ptr, size); 717 | self.write_key_value(off_ptr, key_ptr, key_len, value) 718 | } 719 | } 720 | // allocate and write key, value, left and right neighbors. 721 | pub fn write_key_value(&mut self, 722 | off_ptr: u16, 723 | key_ptr:*const u8, 724 | key_len:usize, 725 | value: UnsafeValue) { 726 | unsafe { 727 | let ptr = self.offset(off_ptr as isize) as *mut u8; 728 | *((ptr as *mut u16).offset(5)) = (key_len as u16).to_le(); 729 | let target_key_ptr = match value { 730 | UnsafeValue::S { p,len } => { 731 | debug_assert!(len < VALUE_SIZE_THRESHOLD as u32); 732 | *((ptr as *mut u32).offset(3)) = len.to_le(); 733 | copy_nonoverlapping(p,(ptr as *mut u8).offset(24), len as usize); 734 | 735 | let padding = (8 - (len & 7)) & 7; 736 | (ptr as *mut u8).offset((24 + len + padding) as isize) 737 | }, 738 | UnsafeValue::O { offset,len } => { 739 | debug!("write_key_value: {:?}", offset); 740 | *((ptr as *mut u32).offset(3)) = len.to_le(); 741 | *((ptr as *mut u64).offset(3)) = offset.to_le(); 742 | (ptr as *mut u8).offset(32) 743 | } 744 | }; 745 | copy_nonoverlapping(key_ptr, target_key_ptr, key_len); 746 | } 747 | } 748 | pub fn reset_pointers(&mut self, off_ptr:u16) { 749 | assert!(off_ptr + 24 < PAGE_SIZE as u16); 750 | // println!("resetting pointers for {:?} at {:?}", self.page_offset(), off_ptr); 751 | unsafe { 752 | let ptr = self.offset(off_ptr as isize) as *mut u8; 753 | *(ptr as *mut u16) = NIL; 754 | *((ptr as *mut u16).offset(1)) = NIL; 755 | *((ptr as *mut u16).offset(2)) = NIL; 756 | *((ptr as *mut u16).offset(3)) = NIL; 757 | *((ptr as *mut u16).offset(4)) = NIL; 758 | *((ptr as *mut u64).offset(2)) = 0; 759 | } 760 | } 761 | pub fn set_right_child(&self, off:u16, right_child:u64) { 762 | assert!(off < PAGE_SIZE_16); 763 | unsafe { 764 | *((self.offset(off as isize) as *mut u64).offset(2)) = right_child.to_le(); 765 | } 766 | } 767 | pub fn set_level(&mut self, off:u16, level:usize, next:u16) { 768 | assert!(off <= PAGE_SIZE_16 - 16); 769 | unsafe { 770 | *((self.offset(off as isize) as *mut u16).offset(level as isize)) = next.to_le(); 771 | } 772 | } 773 | } 774 | 775 | 776 | #[derive(Debug)] 777 | pub struct Cow { 778 | pub cow: transaction::Cow, 779 | } 780 | 781 | impl Cow { 782 | 783 | pub fn from_mut_page(p:MutPage)->Cow { 784 | Cow{cow:transaction::Cow::MutPage(p.page)} 785 | } 786 | 787 | #[cfg(test)] 788 | pub fn unwrap_mut(self) -> MutPage { 789 | match self.cow { 790 | transaction::Cow::MutPage(p) => MutPage { page: p }, 791 | transaction::Cow::Page(_) => panic!("unwrap") 792 | } 793 | } 794 | 795 | pub fn 
as_nonmut(self) -> Cow { 796 | match self.cow { 797 | transaction::Cow::MutPage(p) => Cow { cow: transaction::Cow::Page(p.as_page()) }, 798 | x => Cow { cow: x } 799 | } 800 | } 801 | pub fn as_page(self) -> Page { 802 | match self.cow { 803 | transaction::Cow::Page(p) => Page { page: p }, 804 | transaction::Cow::MutPage(p) => Page { page: p.as_page() }, 805 | } 806 | } 807 | } 808 | 809 | impl<'env,T> LoadPage for MutTxn<'env,T> { 810 | fn length(&self) -> u64 { 811 | self.txn.env.length 812 | } 813 | fn root_db_(&self,num:isize) -> Option { 814 | let root = self.txn.root(num); 815 | if root == 0 { 816 | None 817 | } else { 818 | Some(Db { root_num:num, root: self.txn.root(num) }) 819 | } 820 | } 821 | fn load_page(&self, off: u64) -> Page { 822 | Page { page: self.txn.load_page(off) } 823 | } 824 | 825 | fn rc(&self) -> Option { 826 | let rc = self.txn.root(REFERENCE_COUNTS); 827 | if rc == 0 { 828 | None 829 | } else { 830 | Some(Db { root_num:REFERENCE_COUNTS, root: rc }) 831 | } 832 | } 833 | } 834 | impl<'env> LoadPage for Txn<'env> { 835 | fn length(&self) -> u64 { 836 | self.txn.env.length 837 | } 838 | fn root_db_(&self,num:isize) -> Option { 839 | let root = self.txn.root(num); 840 | if root == 0 { 841 | None 842 | } else { 843 | Some(Db { root_num:num, root: self.txn.root(num) }) 844 | } 845 | } 846 | fn load_page(&self, off: u64) -> Page { 847 | Page { page: self.txn.load_page(off) } 848 | } 849 | 850 | fn rc(&self) -> Option { 851 | let rc = self.txn.root(REFERENCE_COUNTS); 852 | if rc == 0 { 853 | None 854 | } else { 855 | Some(Db { root_num:REFERENCE_COUNTS, root: rc }) 856 | } 857 | } 858 | } 859 | 860 | #[cfg(debug_assertions)] 861 | fn debug, T: LoadPage + super::Transaction>(t: &T, db: &[&Db], p: P, keys_hex:bool, values_hex:bool) { 862 | let f = File::create(p.as_ref()).unwrap(); 863 | let mut buf = BufWriter::new(f); 864 | writeln!(&mut buf, "digraph{{").unwrap(); 865 | let mut h = HashSet::new(); 866 | fn print_page(txn: &T, 867 | keys_hex:bool,values_hex:bool, 868 | pages: &mut HashSet, 869 | buf: &mut BufWriter, 870 | p: &Page, 871 | print_children: bool) { 872 | if !pages.contains(&p.page.offset) { 873 | pages.insert(p.page.offset); 874 | if print_children { 875 | 876 | let rc = if let Some(rc) = txn.rc() { 877 | txn.get_u64(&rc, p.page.offset).unwrap_or(1) 878 | } else { 879 | 0 880 | }; 881 | 882 | writeln!(buf, 883 | "subgraph cluster{} {{\nlabel=\"Page {}, first_free {}, occupied {}, rc {}\";\ncolor=black;", 884 | p.page.offset, 885 | p.page.offset, 886 | p.first_free(), 887 | p.occupied(), 888 | rc 889 | ) 890 | .unwrap(); 891 | } 892 | let root = FIRST_HEAD; 893 | //debug!("print_page: page {:?}", p.page.offset); 894 | let mut h = Vec::new(); 895 | let mut edges = Vec::new(); 896 | let mut hh = HashSet::new(); 897 | print_tree(txn, keys_hex, values_hex, &mut hh, buf, &mut edges, &mut h, p, root); 898 | if print_children { 899 | writeln!(buf, "}}").unwrap(); 900 | } 901 | for p in edges.iter() { 902 | writeln!(buf, "{}", p).unwrap() 903 | } 904 | if print_children { 905 | for p in h.iter() { 906 | print_page(txn, keys_hex, values_hex, pages, buf, p, true) 907 | } 908 | } 909 | } 910 | } 911 | 912 | fn print_tree(txn: &T, 913 | keys_hex:bool,values_hex:bool, 914 | nodes: &mut HashSet, 915 | buf: &mut BufWriter, 916 | edges: &mut Vec, 917 | pages: &mut Vec, 918 | p: &Page, 919 | off: u16) { 920 | unsafe { 921 | //debug!("print tree:{:?}, off={:?}",p, off); 922 | let ptr = p.offset(off as isize) as *const u32; 923 | let (key,value) = { 924 | if off == FIRST_HEAD 
{ 925 | ("root".to_string(),"".to_string()) 926 | } else { 927 | let (key, value) = read_key_value(ptr as *const u8); 928 | // debug!("key,value = ({:?},{:?})", key.as_ptr(), value); 929 | let key = 930 | if keys_hex { 931 | key.to_hex() 932 | } else { 933 | let key = std::str::from_utf8_unchecked(&key[0..(std::cmp::min(20,key.len()))]); 934 | key.to_string() 935 | }; 936 | let value = { 937 | if let UnsafeValue::O { ref offset, .. } = value { 938 | format!("{:?}(rc = {:?})", offset, super::put::get_rc(txn, *offset)) 939 | } else { 940 | let mut value_=Vec::new(); 941 | let mut value = Value::from_unsafe(&value, txn); 942 | if values_hex { 943 | for i in value { 944 | value_.extend(i) 945 | } 946 | value_.to_hex() 947 | } else { 948 | let value = if value.len() > 20 { 949 | let contents = value.next().unwrap(); 950 | value_.extend(&contents[0..20]); 951 | value_.extend(b"..."); 952 | &value_[..] 953 | } else { 954 | value.next().unwrap() 955 | }; 956 | std::str::from_utf8_unchecked(value).to_string() 957 | } 958 | } 959 | }; 960 | (key,value) 961 | } 962 | }; 963 | //debug!("key,value={:?},{:?}",key,value); 964 | writeln!(buf, 965 | "n_{}_{}[label=\"{}: '{}'->'{}'\"];", 966 | p.page.offset, 967 | off, 968 | off, 969 | key, 970 | value) 971 | .unwrap(); 972 | if !nodes.contains(&off) { 973 | let next_page = u64::from_le(*((ptr as *const u64).offset(2))); 974 | if next_page>0 { 975 | //debug!("print_tree, page = {:?}, next_page = {:?}", p.page.offset, next_page); 976 | pages.push(txn.load_page(next_page)); 977 | edges.push(format!( 978 | "n_{}_{}->n_{}_{}[color=\"red\"];", 979 | p.page.offset, 980 | off, 981 | next_page, 982 | FIRST_HEAD)) 983 | }; 984 | nodes.insert(off); 985 | for i in 0..5 { 986 | let left = u16::from_le(*((ptr as *const u16).offset(i))); 987 | //debug!("{:?}",((ptr as *const u16).offset(i))); 988 | if left != NIL { 989 | writeln!(buf, 990 | "n_{}_{}->n_{}_{}[color=\"blue\", label=\"{}\"];", 991 | p.page.offset, 992 | off, 993 | p.page.offset, 994 | left,i) 995 | .unwrap(); 996 | //debug!("print_tree: recursive call"); 997 | print_tree(txn,keys_hex, values_hex, nodes,buf,edges,pages,p,left) 998 | } 999 | } 1000 | } 1001 | //debug!("/print tree:{:?}",p); 1002 | } 1003 | } 1004 | for db in db { 1005 | let page = t.load_page(db.root); 1006 | print_page(t, keys_hex, values_hex, &mut h, &mut buf, &page, true /* print children */); 1007 | } 1008 | writeln!(&mut buf, "}}").unwrap(); 1009 | } 1010 | 1011 | 1012 | 1013 | 1014 | 1015 | 1016 | 1017 | #[cfg(debug_assertions)] 1018 | fn debug_concise, T: LoadPage>(t: &T, db: &[&Db], p: P) { 1019 | let f = File::create(p.as_ref()).unwrap(); 1020 | let mut buf = BufWriter::new(f); 1021 | writeln!(&mut buf, "digraph{{").unwrap(); 1022 | let mut h = HashSet::new(); 1023 | fn print_page(txn: &T, 1024 | pages: &mut HashSet, 1025 | buf: &mut BufWriter, 1026 | p: &Page) { 1027 | if !pages.contains(&p.page.offset) { 1028 | pages.insert(p.page.offset); 1029 | let rc = if let Some(rc) = txn.rc() { 1030 | txn.get_u64(&rc, p.page.offset).unwrap_or(1) 1031 | } else { 1032 | 0 1033 | }; 1034 | writeln!(buf, 1035 | "page_{}[label=\"{}, ff {}, occ {}, rc {}\"];", 1036 | p.page.offset, 1037 | p.page.offset, 1038 | p.first_free(), 1039 | p.occupied(), 1040 | rc 1041 | ).unwrap(); 1042 | 1043 | let root = FIRST_HEAD; 1044 | //debug!("print_page: page {:?}", p.page.offset); 1045 | let mut h = Vec::new(); 1046 | let mut edges = Vec::new(); 1047 | let mut hh = HashSet::new(); 1048 | print_tree(txn, &mut hh, &mut edges, buf, &mut h, p, root); 1049 | for edge 
in edges.iter() { 1050 | writeln!(buf, "{}", edge).unwrap(); 1051 | } 1052 | for p in h.iter() { 1053 | print_page(txn, pages, buf, p) 1054 | } 1055 | } 1056 | } 1057 | 1058 | fn print_tree(txn: &T, 1059 | nodes: &mut HashSet, 1060 | edges:&mut Vec, 1061 | buf: &mut BufWriter, 1062 | pages: &mut Vec, 1063 | p: &Page, 1064 | off: u16) { 1065 | unsafe { 1066 | //debug!("print tree:{:?}, off={:?}",p, off); 1067 | let ptr = p.offset(off as isize) as *const u32; 1068 | //debug!("key,value={:?},{:?}",key,value); 1069 | if !nodes.contains(&off) { 1070 | let next_page = u64::from_le(*((ptr as *const u64).offset(2))); 1071 | if next_page>0 { 1072 | //debug!("print_tree, page = {:?}, next_page = {:?}", p.page.offset, next_page); 1073 | pages.push(txn.load_page(next_page)); 1074 | edges.push(format!( 1075 | "page_{}->page_{}[color=\"red\"];", 1076 | p.page.offset, 1077 | next_page)) 1078 | }; 1079 | nodes.insert(off); 1080 | let next = u16::from_le(*((ptr as *const u16).offset(0))); 1081 | //debug!("{:?}",((ptr as *const u16).offset(i))); 1082 | if next != NIL { 1083 | print_tree(txn, nodes, edges, buf, pages, p, next) 1084 | } 1085 | } 1086 | //debug!("/print tree:{:?}",p); 1087 | } 1088 | } 1089 | for db in db { 1090 | let page = t.load_page(db.root); 1091 | print_page(t, &mut h, &mut buf, &page); 1092 | } 1093 | writeln!(&mut buf, "}}").unwrap(); 1094 | } 1095 | 1096 | 1097 | 1098 | 1099 | 1100 | pub fn record_size(key: usize, value: usize) -> u16 { 1101 | if value < VALUE_SIZE_THRESHOLD { 1102 | let key_padding = (8 - (key & 7)) & 7; 1103 | let value_padding = (8 - (value & 7)) & 7; 1104 | (24 + key + key_padding + value + value_padding) as u16 1105 | } else { 1106 | let key_padding = (8 - (key & 7)) & 7; 1107 | (24 + key + 8 + key_padding) as u16 1108 | } 1109 | } 1110 | -------------------------------------------------------------------------------- /src/del.rs: -------------------------------------------------------------------------------- 1 | use super::txn::*; 2 | use super::transaction::{PAGE_SIZE,Error}; 3 | use std; 4 | use rand::{Rng}; 5 | use super::Transaction; 6 | 7 | extern crate log; 8 | use super::put::*; 9 | use super::merge; 10 | use super::rebalance; 11 | 12 | // This type is an instruction to page_delete below. 13 | #[derive(Copy,Clone,Debug)] 14 | enum C<'a> { 15 | KV { key:&'a [u8], value:UnsafeValue }, // delete by comparing the key and value. 16 | K { key:&'a[u8] }, // delete the smallest binding of that key. 17 | Smallest // delete the smallest element of a B-tree (used to replace the root of a B-tree). 18 | } 19 | 20 | // Return type of the smallest (key,value). 21 | #[derive(Debug)] 22 | pub struct Smallest { 23 | pub key_ptr:*const u8, 24 | pub key_len:usize, 25 | pub value:UnsafeValue, 26 | pub page:u64 27 | } 28 | 29 | 30 | // Handle an Res::Underfull result from the current page. 31 | // 32 | // - child_page is the page that was just edited. We still need to 33 | // delete according to the "delete" argument, and replace the left 34 | // child of that element with "merged". 35 | // 36 | // - The levels are at the element whose right child is child_page. 
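// - The repair below is a fallback chain: first try to merge
//   child_page with its right sibling; if that fails and child_page is
//   the leftmost child (levels[0] == FIRST_HEAD), rebalance with that
//   sibling instead; otherwise move the levels back one binding and
//   try merging child_page with its left sibling, then rebalancing.
//   The handle_failed_*_rebalancing functions are the last resort when
//   neither a merge nor a rebalance fits.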
37 | // 38 | 39 | fn handle_underfull(rng:&mut R, txn:&mut MutTxn, mut page:Cow, levels:[u16;N_LEVELS], 40 | child_page:Cow, child_must_be_dup:bool, 41 | delete:[u16;N_LEVELS], merged:u64, 42 | page_will_be_dup:bool) -> Result { 43 | debug!("handle_underfull {:?}", page_will_be_dup); 44 | let mut new_levels = [0;N_LEVELS]; 45 | unsafe { 46 | std::ptr::copy_nonoverlapping(levels.as_ptr(), new_levels.as_mut_ptr(), N_LEVELS) 47 | } 48 | 49 | // First try to merge with our right sibling. 50 | let next_offset = unsafe { u16::from_le(*(page.offset(levels[0] as isize) as *const u16)) }; 51 | 52 | if next_offset != NIL { 53 | match try!(merge::merge_children_right(rng, txn, page, levels, &child_page, child_must_be_dup, 54 | &delete, 55 | merged, 56 | page_will_be_dup)) { 57 | 58 | Res::Nothing { page:page_ } => { 59 | // If we couldn't merge: 60 | debug!("merged failed, page={:?}, levels={:?}", page_.page_offset(), levels); 61 | if levels[0] == FIRST_HEAD { 62 | // If we're at the first binding, and there's no 63 | // left sibling to try to merge with. In this 64 | // case, the child page is the left child of the 65 | // key to rebalance. 66 | debug!("first case of rebalancing: {:?}", levels[0]); 67 | let forgetting = u16::from_le(unsafe { *(child_page.offset(delete[0] as isize) as *const u16) }); 68 | let mut new_levels = [0;N_LEVELS]; 69 | unsafe { 70 | std::ptr::copy_nonoverlapping(levels.as_ptr(), new_levels.as_mut_ptr(), N_LEVELS) 71 | } 72 | match try!(rebalance::rebalance_left(rng, txn, page_, levels, &child_page, child_must_be_dup, 73 | forgetting, merged, 74 | page_will_be_dup)) { 75 | Res::Nothing { page:page_ } => { 76 | let result = try!(rebalance::handle_failed_left_rebalancing(rng, txn, page_, levels, child_page, child_must_be_dup, delete, merged, false, page_will_be_dup)); 77 | // Only in this case will the page containing the smallest element be kept alive. 78 | return Ok(result) 79 | }, 80 | x => { 81 | return Ok(x) 82 | } 83 | } 84 | } else { 85 | // Or there's a left sibling to merge with, and 86 | // it's appropriate to merge to the left 87 | // (i.e. we've not deleted here). 88 | page = page_ 89 | } 90 | }, 91 | res => return Ok(res) 92 | } 93 | } 94 | // If we haven't found a solution so far, move to the previous element, and merge the child page with its left sibling. 95 | 96 | // Move back by one 97 | debug!("trying to merge to left"); 98 | set_pred(&page, &mut new_levels); 99 | match try!(merge::merge_children_left(rng, txn, page, new_levels, &child_page, child_must_be_dup, 100 | &delete, merged, 101 | page_will_be_dup)) { 102 | Res::Nothing { page } => { 103 | // we couldn't merge. rebalance. 104 | debug!("second case of rebalancing: {:?}", child_page); 105 | let forgetting = u16::from_le(unsafe { *(child_page.offset(delete[0] as isize) as *const u16) }); 106 | let result = match try!(rebalance::rebalance_right(rng, txn, page, new_levels, None, &child_page, 107 | child_must_be_dup, 108 | forgetting, merged, 109 | page_will_be_dup)) { 110 | Res::Nothing { page:page_ } => { 111 | debug!("failed rebalancing"); 112 | // Only in this case will the page containing the smallest element be kept alive. 113 | Ok(try!(rebalance::handle_failed_right_rebalancing(rng, txn, page_, new_levels, None, 114 | child_page, child_must_be_dup, 115 | delete, merged, false, 116 | page_will_be_dup))) 117 | }, 118 | x => Ok(x) 119 | }; 120 | debug!("rebalancing done"); 121 | result 122 | }, 123 | res => Ok(res) 124 | } 125 | } 126 | /// Move back to the predecessor of levels[0]. 
If levels[0] appears in 127 | /// other lists, move back on them too. 128 | fn set_pred(page:&Cow, levels:&mut [u16]) { 129 | //trace!("set_pred"); 130 | let level0 = levels[0]; 131 | debug_assert!(level0 != FIRST_HEAD && level0 != NIL); 132 | let mut l = 1; 133 | // Go up in levels until we find an entry different from level0. 134 | while l < N_LEVELS && levels[l] == level0 { 135 | l += 1 136 | } 137 | //trace!("!set_pred"); 138 | // Now we go down by one step, staying at the same (non-level0) place in that new list. 139 | if l == N_LEVELS { 140 | l -=1; 141 | levels[l] = FIRST_HEAD; 142 | } else { 143 | l -=1; 144 | levels[l] = levels[l+1] 145 | } 146 | //trace!("?set_pred"); 147 | // and advance in all the lists until we find level0 (level0 is in all the lists of level l or below). 148 | loop { 149 | loop { 150 | let next = u16::from_le(unsafe { *((page.offset(levels[l] as isize) as *const u16).offset(l as isize)) }); 151 | if next == level0 { 152 | break 153 | } else { 154 | levels[l] = next 155 | } 156 | } 157 | if l==0 { 158 | break 159 | } else { 160 | l-=1; 161 | levels[l] = levels[l+1] 162 | } 163 | } 164 | //trace!("/set_pred"); 165 | } 166 | 167 | 168 | 169 | 170 | 171 | fn handle_underfull_replace(rng:&mut R, txn:&mut MutTxn, page:Cow, levels:[u16;N_LEVELS], 172 | child_page:Cow, 173 | child_must_be_dup:bool, 174 | replacement:&Smallest, 175 | delete:[u16;N_LEVELS], merged:u64, 176 | page_will_be_dup:bool) -> Result { 177 | debug!("handle_underfull_replace"); 178 | // First try to merge with our right sibling. 179 | match try!(merge::merge_children_replace( 180 | rng, txn, page, levels, &child_page, child_must_be_dup, 181 | replacement, 182 | &delete, merged, page_will_be_dup)) { 183 | 184 | Res::Nothing { page:page_ } => { 185 | // If we couldn't merge: 186 | debug!("rebalancing: {:?}", levels[0]); 187 | let forgetting = u16::from_le(unsafe { *(child_page.offset(delete[0] as isize) as *const u16) }); 188 | match try!(rebalance::rebalance_right(rng, txn, page_, levels, Some(replacement), &child_page, 189 | child_must_be_dup, 190 | forgetting, merged, 191 | page_will_be_dup)) { 192 | Res::Nothing { page:page_} => { 193 | return rebalance::handle_failed_right_rebalancing(rng, txn, page_, levels, Some(replacement), child_page, 194 | child_must_be_dup, 195 | delete, merged, false, page_will_be_dup) 196 | }, 197 | x => Ok(x) 198 | } 199 | }, 200 | res => Ok(res) 201 | } 202 | } 203 | 204 | 205 | fn get_smallest_binding(txn:&mut MutTxn, mut current:u64) -> Smallest { 206 | loop { 207 | let page = txn.load_page(current); 208 | current = unsafe { u64::from_le(*(page.offset(FIRST_HEAD as isize + 16) as *const u64)) }; 209 | if current == 0 { 210 | let (next_key,next_value) = { 211 | let cur_ptr = page.offset(0) as *const u16; 212 | let next_off = u16::from_le(unsafe { *cur_ptr }); 213 | debug_assert!(next_off > 0 && next_off != NIL); 214 | let next_ptr = page.offset(next_off as isize); 215 | unsafe { read_key_value(next_ptr) } 216 | }; 217 | return Smallest { 218 | key_ptr: next_key.as_ptr(), 219 | key_len: next_key.len(), 220 | value: next_value, 221 | page: page.page_offset() 222 | } 223 | } 224 | } 225 | } 226 | 227 | 228 | fn delete_at_internal_node(rng:&mut R, txn:&mut MutTxn, page:Cow, levels:[u16;N_LEVELS], page_will_be_dup:bool) -> Result { 229 | debug!("delete_at_internal_node {:?}", page); 230 | // Not found below, but we can delete something here. 231 | 232 | // Find the matching element, and the page to its right. 
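    // This is the classic deletion at an internal node: the binding
    // separates two children, so it cannot simply be unlinked. Instead
    // we fetch the smallest binding of its right subtree, delete that
    // binding down below (C::Smallest), and re-insert it here in place
    // of the deleted one.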
233 | let next_off = unsafe { u16::from_le(*(page.offset(levels[0] as isize) as *const u16)) }; 234 | let next = page.offset(next_off as isize); 235 | let child_page = unsafe { u64::from_le(*((next as *const u64).offset(2))) }; 236 | let child_page = txn.load_cow_page(child_page); 237 | 238 | // First get the smallest binding, replace here. 239 | let smallest = get_smallest_binding(txn, child_page.page_offset()); 240 | debug!("protecting {:?}", smallest.page); 241 | let mut protected_index = 0; 242 | if txn.protected_pages[0] != 0 { 243 | protected_index = 1 244 | } 245 | txn.protected_pages[protected_index] = smallest.page; 246 | txn.free_protected[protected_index] = false; 247 | 248 | 249 | { 250 | let key = unsafe { std::slice::from_raw_parts(smallest.key_ptr, smallest.key_len) }; 251 | debug!("smallest: {:?} {:?}", std::str::from_utf8(key), smallest.page); 252 | } 253 | let result = match try!(delete(rng,txn, child_page, C::Smallest, page_will_be_dup)) { 254 | Res::Ok { page: child_page } => { 255 | debug!("internal: ok"); 256 | // Set the child page here, regardless of whether a merge is coming after this. 257 | debug!("not underfull"); 258 | 259 | let smallest_key = unsafe { std::slice::from_raw_parts(smallest.key_ptr, smallest.key_len) }; 260 | let size = record_size(smallest.key_len, smallest.value.len() as usize); 261 | 262 | let (key,value) = unsafe { read_key_value(next) }; 263 | let deleted_size = record_size(key.len(), value.len() as usize); 264 | let result = if (page.occupied() + size) - deleted_size <= PAGE_SIZE as u16 { 265 | let mut new_levels = [0;N_LEVELS]; 266 | let mut page = 267 | if page_will_be_dup { 268 | debug!("copying"); 269 | try!(copy_page(rng, txn, &page.as_page(), &levels, &mut new_levels, true, false, 0, true)) 270 | } else { 271 | let off = page.can_alloc(size); 272 | debug!("off = {:?}", off); 273 | if off > 0 && off + size <= PAGE_SIZE as u16 { 274 | debug!("pinpointing, levels[0]={:?}", levels[0]); 275 | try!(cow_pinpointing(rng, txn, page, &levels, &mut new_levels, true, true, 0)) 276 | } else { 277 | debug!("compacting"); 278 | try!(cow_pinpointing(rng, txn, page.as_nonmut(), &levels, &mut new_levels, true, true, 0)) 279 | } 280 | }; 281 | let off = page.can_alloc(size); 282 | debug!("off = {:?}, size={:?}", off, size); 283 | debug_assert!(off + size <= PAGE_SIZE as u16); 284 | local_insert_at(rng, &mut page, smallest_key, smallest.value, child_page.page_offset(), off, size, &mut new_levels); 285 | Res::Ok { page:page } 286 | 287 | } else { 288 | // split page. 289 | // Decrement value, except if the page is duplicated 290 | if !page_will_be_dup { 291 | if let UnsafeValue::O { offset, len } = value { 292 | try!(free_value(rng, txn, offset, len)) 293 | } 294 | } 295 | unsafe { 296 | try!(split_page(rng, txn, &page, 297 | smallest_key, smallest.value, child_page.page_offset(), 298 | page_will_be_dup, next_off, 299 | NIL, 0)) 300 | } 301 | }; 302 | Ok(result) 303 | }, 304 | Res::Underfull { page: child_page, delete, merged, must_be_dup, .. 
} => { 305 | 306 | if !page_will_be_dup { 307 | let (_,value) = unsafe { read_key_value(next) }; 308 | if let UnsafeValue::O { offset, len } = value { 309 | try!(free_value(rng, txn, offset, len)) 310 | } 311 | } 312 | 313 | debug!("internal: underfull"); 314 | handle_underfull_replace(rng, txn, page, levels, child_page, 315 | must_be_dup, 316 | &smallest, delete, merged, 317 | page_will_be_dup) 318 | }, 319 | Res::Split { key_len,key_ptr,value, left, right, free_page } => { 320 | 321 | debug!("internal: split"); 322 | let middle_key = unsafe { std::slice::from_raw_parts(key_ptr, key_len) }; 323 | let middle_size = record_size(key_len, value.len() as usize); 324 | 325 | let smallest_key = unsafe { std::slice::from_raw_parts(smallest.key_ptr, smallest.key_len) }; 326 | let smallest_size = record_size(smallest.key_len, smallest.value.len() as usize); 327 | 328 | // We need to insert middle_key -> right and smallest_key -> left to the page. 329 | let deleted_size = unsafe { 330 | let (key,value) = read_key_value(next); 331 | 332 | if !page_will_be_dup { 333 | if let UnsafeValue::O { offset, len } = value { 334 | try!(free_value(rng, txn, offset, len)) 335 | } 336 | } 337 | 338 | record_size(key.len(), value.len() as usize) 339 | }; 340 | 341 | let result = if (page.occupied() + middle_size + smallest_size) - deleted_size <= PAGE_SIZE as u16 { 342 | 343 | let mut new_levels = [0;N_LEVELS]; 344 | // Delete the current element. 345 | let mut page = if page_will_be_dup { 346 | try!(copy_page(rng, txn, &page.as_page(), &levels, &mut new_levels, true, false, 0, true)) 347 | } else { 348 | if page.first_free() + middle_size + smallest_size <= PAGE_SIZE as u16 { 349 | try!(cow_pinpointing(rng, txn, page, &levels, &mut new_levels, true, false, 0)) 350 | } else { 351 | try!(cow_pinpointing(rng, txn, page.as_nonmut(), &levels, &mut new_levels, true, false, 0)) 352 | } 353 | }; 354 | // Reinsert the left page with the smallest key. 355 | let middle_off = page.can_alloc(middle_size); 356 | debug_assert!(middle_off + middle_size <= PAGE_SIZE as u16); 357 | local_insert_at(rng, &mut page, middle_key, value, right.page_offset(), middle_off, middle_size, &mut new_levels); 358 | 359 | let smallest_off = page.can_alloc(smallest_size); 360 | debug_assert!(smallest_off + smallest_size <= PAGE_SIZE as u16); 361 | local_insert_at(rng, &mut page, smallest_key, smallest.value, left.page_offset(), smallest_off, smallest_size, &mut new_levels); 362 | 363 | Ok(Res::Ok { page:page }) 364 | } else { 365 | // split. 366 | unsafe { 367 | split_page(rng, txn, &page, 368 | middle_key, value, right.page_offset(), 369 | page_will_be_dup, NIL, 370 | levels[0], left.page_offset()) 371 | } 372 | }; 373 | if !page_will_be_dup && free_page > 0 { 374 | try!(free(rng, txn, free_page)); 375 | } else { 376 | // incrementing value: already done in split_page 377 | /* 378 | if let UnsafeValue::O { offset, .. } = value { 379 | try!(incr_rc(rng, txn, offset)) 380 | } 381 | */ 382 | } 383 | result 384 | }, 385 | Res::Nothing { .. 
} => { 386 | if cfg!(debug_assertions) { 387 | panic!("Child page {:?} was empty when trying to remove its smallest element.", page) 388 | } else { 389 | unreachable!() 390 | } 391 | } 392 | }; 393 | debug!("protected: {:?}", txn.protected_pages); 394 | if txn.free_protected[protected_index] { 395 | debug!("freeing previously protected {:?}", smallest.page); 396 | unsafe { super::transaction::free(&mut txn.txn, smallest.page) } 397 | } 398 | txn.protected_pages[protected_index] = 0; 399 | txn.free_protected[protected_index] = false; 400 | result 401 | } 402 | 403 | 404 | fn delete(rng:&mut R, txn:&mut MutTxn, page:Cow, comp:C, 405 | parent_will_be_dup:bool) -> Result { 406 | 407 | debug!("delete = {:?}", page); 408 | let mut levels:[u16;N_LEVELS] = [FIRST_HEAD;N_LEVELS]; 409 | let mut eq = false; 410 | match comp { 411 | C::KV { key, value } => set_levels(txn, &page, key, Some(value), &mut levels, &mut eq), 412 | C::K { key } => set_levels(txn, &page, key, None, &mut levels, &mut eq), 413 | C::Smallest => { eq = true } 414 | } 415 | let child_page = u64::from_le(unsafe { *((page.offset(levels[0] as isize) as *const u64).offset(2)) }); 416 | debug!("next_page = {:?}, {:?}", child_page, eq); 417 | let page_rc = get_rc(txn, page.page_offset()); 418 | let this_will_be_dup = parent_will_be_dup || (page_rc > 1); 419 | debug!("needs_dup={:?} {:?}", parent_will_be_dup, page_rc); 420 | 421 | // If the reference count of the current page is n > 1, we need to 422 | // decrement it, as it will no longer be referenced from its 423 | // current reference. 424 | 425 | let del = if child_page > 0 { 426 | let next_page = txn.load_cow_page(child_page); 427 | Some(try!(delete(rng, txn, next_page, comp, this_will_be_dup))) 428 | } else { 429 | None 430 | }; 431 | match del { 432 | None if eq => { 433 | debug!("deleting here, rc={:?}", page_rc); 434 | let (next_key,next_value) = { 435 | let cur_ptr = page.offset(levels[0] as isize) as *const u16; 436 | let next_off = u16::from_le(unsafe { *cur_ptr }); 437 | debug_assert!(next_off > 0 && next_off != NIL); 438 | let next_ptr = page.offset(next_off as isize); 439 | unsafe { read_key_value(next_ptr) } 440 | }; 441 | let deleted_size = record_size(next_key.len(), next_value.len() as usize); 442 | 443 | let will_be_underfull = page.occupied() - deleted_size < (PAGE_SIZE as u16)/2; 444 | 445 | debug!("will_be_underfull = {:?} {:?}", will_be_underfull, levels); 446 | if will_be_underfull { 447 | if let UnsafeValue::O { offset, len } = next_value { 448 | if let C::Smallest = comp { 449 | if this_will_be_dup { 450 | debug!("incr_rc"); 451 | try!(incr_rc(rng, txn, offset)) 452 | } 453 | } else { 454 | if !this_will_be_dup { 455 | debug!("free_value"); 456 | try!(free_value(rng, txn, offset, len)) 457 | } 458 | } 459 | } 460 | Ok(Res::Underfull { page:page, delete: levels, merged:0, must_be_dup: page_rc > 1 }) 461 | } else { 462 | let mut new_levels = [0;N_LEVELS]; 463 | 464 | if !parent_will_be_dup && page_rc > 1 { 465 | // The parent contained a pointer to this page, 466 | // which will be dropped since the parent is not duplicated. 467 | try!(decr_rc(rng, txn, page.page_offset())) 468 | } 469 | 470 | let page = 471 | if this_will_be_dup { 472 | // After this page is copied, if we're in case 473 | // C::Smallest, there will be one more 474 | // reference to the value. 475 | match (comp,next_value) { 476 | (C::Smallest, UnsafeValue::O { offset, .. }) => { 477 | try!(incr_rc(rng, txn, offset)); 478 | }, 479 | _ => { } 480 | } 481 | // Never free the value here. 
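                        // The old (shared) page keeps its own
                        // reference to the value, so freeing it would
                        // leave that page dangling; the rc bump above
                        // accounts for the extra reference created in
                        // the C::Smallest case.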
482 | try!(copy_page(rng, txn, &page.as_page(), &levels, &mut new_levels, true, false, 0, true)) 483 | } else { 484 | let free_value = match comp { 485 | C::Smallest => false, 486 | _ => true 487 | }; 488 | try!(cow_pinpointing(rng, txn, page, &levels, &mut new_levels, true, free_value, 0)) 489 | }; 490 | debug!("page={:?}", page); 491 | Ok(Res::Ok { page:page }) 492 | } 493 | }, 494 | Some(Res::Nothing { .. }) if eq => { 495 | // Find smallest, etc. 496 | let page_offset = page.page_offset(); 497 | let result = try!(delete_at_internal_node(rng, txn, page, levels, this_will_be_dup)); 498 | match result { 499 | Res::Underfull { .. } => { 500 | // This case will be handled by the parent. 501 | }, 502 | _ if page_rc > 1 && !parent_will_be_dup => // decrease the RC of the first page on the path referenced at least twice. 503 | try!(decr_rc(rng, txn, page_offset)), 504 | _ => {} 505 | } 506 | Ok(result) 507 | 508 | }, 509 | Some(Res::Underfull { page:child_page, delete, merged, must_be_dup }) => { 510 | 511 | debug!("delete: underfull {:?}", child_page); 512 | let page_offset = page.page_offset(); 513 | let result = try!(handle_underfull(rng, txn, page, levels, child_page, 514 | must_be_dup, 515 | delete, merged, 516 | this_will_be_dup)); 517 | match result { 518 | Res::Underfull { .. } => { 519 | // This case will be handled by the parent. 520 | }, 521 | _ if page_rc > 1 && !parent_will_be_dup => // decrease the RC of the first page on the path referenced at least twice. 522 | try!(decr_rc(rng, txn, page_offset)), 523 | _ => {} 524 | } 525 | Ok(result) 526 | }, 527 | Some(Res::Ok { page:child_page }) => { 528 | debug!("ok, back to page {:?} with child {:?}", page.page_offset(), child_page.page_offset()); 529 | if page_rc > 1 && !parent_will_be_dup { 530 | // decrease the RC of the first page on the path referenced at least twice. 531 | try!(decr_rc(rng, txn, page.page_offset())) 532 | } 533 | // Update the pointer here 534 | let mut new_levels = [0;N_LEVELS]; 535 | let page = 536 | if this_will_be_dup { 537 | try!(copy_page(rng, txn, &page.as_page(), &levels, &mut new_levels, 538 | false, false, child_page.page_offset(), true)) 539 | } else { 540 | try!(cow_pinpointing(rng, txn, page, &levels, &mut new_levels, false, false, child_page.page_offset())) 541 | }; 542 | Ok(Res::Ok { page:page }) 543 | }, 544 | Some(Res::Nothing {.. }) | None => { 545 | Ok(Res::Nothing { page:page }) 546 | }, 547 | 548 | Some(Res::Split { key_ptr,key_len,value:value_,left,right,free_page }) => { 549 | // Now reinsert the element here. 550 | if page_rc > 1 && !parent_will_be_dup { 551 | // decrease the RC of the first page on the path referenced at least twice. 552 | try!(decr_rc(rng, txn, page.page_offset())) 553 | } 554 | let key_ = unsafe {std::slice::from_raw_parts(key_ptr, key_len)}; 555 | let result = unsafe { 556 | try!(full_local_insert(rng, txn, page, key_, value_, right.page_offset(), &mut levels, left.page_offset(), 557 | parent_will_be_dup, 558 | this_will_be_dup)) 559 | }; 560 | if !this_will_be_dup && free_page > 0 { 561 | try!(free(rng, txn, free_page)); 562 | } else { 563 | // incrementing value: already done in split_page 564 | /*if let UnsafeValue::O { offset, .. 
} = value_ { 565 | try!(incr_rc(rng, txn, offset)) 566 | }*/ 567 | } 568 | Ok(result) 569 | }, 570 | } 571 | 572 | } 573 | 574 | pub fn del(rng:&mut R, txn:&mut MutTxn, db:&mut Db, key:&[u8], value:Option<&[u8]>)->Result { 575 | 576 | assert!(key.len() < MAX_KEY_SIZE); 577 | let root_page = Cow { cow: txn.txn.load_cow_page(db.root) }; 578 | 579 | let comp = if let Some(value) = value { 580 | C::KV { key: key, 581 | value: UnsafeValue::S { p:value.as_ptr(), len:value.len() as u32 } } 582 | } else { 583 | C::K { key:key } 584 | }; 585 | unsafe { 586 | debug!("root: {:?}", root_page); 587 | match try!(delete(rng,txn, root_page, comp, false)) { 588 | Res::Ok { page } => { 589 | // Maybe the root is empty. Check 590 | let next = u16::from_le(*(page.offset(FIRST_HEAD as isize) as *const u16)); 591 | let next_page = u64::from_le(*((page.offset(FIRST_HEAD as isize) as *const u64).offset(2))); 592 | if next == NIL && next_page != 0 { 593 | db.root = next_page; 594 | try!(free(rng, txn, page.page_offset())); 595 | } else { 596 | db.root = page.page_offset(); 597 | } 598 | Ok(true) 599 | }, 600 | Res::Underfull { page, delete, merged, must_be_dup } => { 601 | let mut new_levels = [0;N_LEVELS]; 602 | 603 | debug!("del: must_be_dup = {:?}", must_be_dup); 604 | let page = 605 | if must_be_dup { 606 | try!(decr_rc(rng, txn, page.page_offset())); 607 | try!(copy_page( rng, txn, &page.as_page(), 608 | &delete, 609 | &mut new_levels, 610 | true, false, merged, true)) 611 | } else { 612 | try!(cow_pinpointing( rng, txn, page, 613 | &delete[..], 614 | &mut new_levels[..], 615 | true, false, 616 | merged)) 617 | }; 618 | 619 | // If this page is empty, replace with next page. 620 | let next = u16::from_le(*(page.offset(FIRST_HEAD as isize) as *const u16)); 621 | let next_page = u64::from_le(*((page.offset(FIRST_HEAD as isize) as *const u64).offset(2))); 622 | if next == NIL && next_page != 0 { 623 | db.root = next_page; 624 | try!(free(rng, txn, page.page_offset())); 625 | } else { 626 | db.root = page.page_offset(); 627 | } 628 | Ok(true) 629 | }, 630 | Res::Nothing { .. 
648 |
649 |
650 | fn drop_page<R:Rng>(rng:&mut R, txn: &mut MutTxn, page:u64)->Result<(),Error> {
651 |     let mut rc = if let Some(rc) = txn.rc() { rc } else { try!(txn.create_db()) };
652 |     let count = txn.get_u64(&rc, page).unwrap_or(1);
653 |     if count > 1 {
654 |         if count == 2 {
655 |             try!(txn.del_u64(rng, &mut rc, page));
656 |         } else {
657 |             try!(txn.replace_u64(rng, &mut rc, page, count-1));
658 |         }
659 |     } else {
660 |         let page = txn.load_page(page);
661 |         for (_, _, value, r) in PageIterator::new(&page, 0) {
662 |             if let UnsafeValue::O { offset, len } = value {
663 |                 try!(free_value(rng, txn, offset, len))
664 |             }
665 |             try!(drop_page(rng, txn, r))
666 |         }
667 |         unsafe {
668 |             super::transaction::free(&mut txn.txn, page.page_offset())
669 |         }
670 |     }
671 |     Ok(())
672 | }
673 |
674 |
675 | pub fn drop<R:Rng>(rng:&mut R, txn: &mut MutTxn, db: Db)->Result<(),Error> {
676 |     drop_page(rng, txn, db.root)
677 | }
678 |
679 | pub fn clear<R:Rng>(rng:&mut R, txn: &mut MutTxn, db: &mut Db)->Result<(),Error> {
680 |     if get_rc(txn, db.root) > 1 {
681 |         decr_rc(rng, txn, db.root)
682 |     } else {
683 |         let page = txn.load_cow_page(db.root);
684 |         for (_, _, value, r) in PageIterator::new(&page, 0) {
685 |             if let UnsafeValue::O { offset, len } = value {
686 |                 try!(free_value(rng, txn, offset, len))
687 |             }
688 |             try!(drop_page(rng, txn, r))
689 |         }
690 |         match page.cow {
691 |             super::transaction::Cow::Page(p0) => {
692 |                 unsafe { super::transaction::free(&mut txn.txn, p0.offset) }
693 |                 db.root = { let mut p = try!(txn.alloc_page()); p.init(); p.page_offset() }; // a fresh root must be initialised, as in the MutPage branch
694 |             }
695 |             super::transaction::Cow::MutPage(p0) => {
696 |                 (MutPage { page:p0 }).init()
697 |             }
698 |         }
699 |         Ok(())
700 |     }
701 | }
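`drop` and `clear` differ only in what remains usable afterwards: both recurse through `drop_page`, which decrements the reference count of a shared page and only frees its values and children once the count falls to one, but `drop` consumes the `Db` while `clear` leaves `db` pointing at an empty root. A hedged sketch of the calling pattern (`scratch` and `old` are hypothetical handles, with `rng` and `txn` as in the example above):

    // Sketch only, following the signatures above.
    try!(clear(&mut rng, &mut txn, &mut scratch)); // `scratch` stays valid, now empty
    try!(put(&mut rng, &mut txn, &mut scratch, b"k", b"v"));
    try!(drop(&mut rng, &mut txn, old));           // this module's `drop`: `old` is
                                                   // moved; its pages are freed, or
                                                   // their RC decremented if shared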
702 |
703 |
704 | ///////////////////////////////////////////////////////////// Tests
705 |
706 | #[test]
707 | fn test_delete_leaf() {
708 |     extern crate tempdir;
709 |     extern crate rand;
710 |     extern crate env_logger;
711 |     use super::{Env};
712 |
713 |     use rand::{Rng};
714 |     let mut rng = rand::thread_rng();
715 |
716 |     env_logger::init().unwrap_or(());
717 |     let dir = tempdir::TempDir::new("pijul").unwrap();
718 |     let tmp = tempdir::TempDir::new("pijul").unwrap();
719 |     {
720 |         let tmp_path = tmp.path();
721 |         debug!("tmp_path: {:?}", tmp_path);
722 |         let env = Env::new(dir.path(), 1000).unwrap();
723 |         let mut txn = env.mut_txn_begin().unwrap();
724 |
725 |         let mut page = txn.alloc_page().unwrap();
726 |         page.init();
727 |         let mut insertions = Vec::new();
728 |         for _ in 0..200 {
729 |             let key_: String = rng
730 |                 .gen_ascii_chars()
731 |                 .take(20)
732 |                 .collect();
733 |             let value_: String = rng
734 |                 .gen_ascii_chars()
735 |                 .take(20)
736 |                 .collect();
737 |             {
738 |                 let key = key_.as_bytes();
739 |                 let value = value_.as_bytes();
740 |                 let value = if value.len() > VALUE_SIZE_THRESHOLD {
741 |                     super::put::alloc_value(&mut txn, value).unwrap()
742 |                 } else {
743 |                     UnsafeValue::S { p:value.as_ptr(), len:value.len() as u32 }
744 |                 };
745 |                 match insert(&mut rng, &mut txn, Cow::from_mut_page(page), key, value, 0, false) {
746 |                     Ok(Res::Ok { page:page_ }) => {
747 |                         page = page_
748 |                     },
749 |                     Ok(Res::Underfull { page:page_, .. }) => {
750 |                         page = page_.unwrap_mut();
751 |                     },
752 |                     Ok(Res::Nothing { page:page_ }) => {
753 |                         page = page_.unwrap_mut()
754 |                     },
755 |                     Ok(x) => {
756 |                         page = root_split(&mut rng, &mut txn, x).unwrap()
757 |                     },
758 |                     _ => panic!("")
759 |                 }
760 |             }
761 |             insertions.push((key_, value_))
762 |         }
763 |         insertions.sort();
764 |
765 |         let db = Db { root_num: -1, root: page.page_offset() };
766 |         txn.debug(&[&db], tmp_path.join("before"), false, false);
767 |         // Delete the 11th smallest entry (index 10 after sorting).
768 |         {
769 |             let (ref key_, ref value_) = insertions[10];
770 |             let key = key_.as_bytes();
771 |             let value = value_.as_bytes();
772 |             let value = UnsafeValue::S { p:value.as_ptr(), len:value.len() as u32 };
773 |             match delete(&mut rng, &mut txn, Cow::from_mut_page(page), C::KV { key:key, value:value }, false) {
774 |                 Ok(Res::Ok { page:page_, .. }) => {
775 |                     page = page_
776 |                 },
777 |                 _ => panic!("")
778 |             }
779 |         }
780 |         let db = Db { root_num: -1, root: page.page_offset() };
781 |         txn.debug(&[&db], tmp_path.join("after"), false, false);
782 |         println!("tmp: {:?}", tmp_path);
783 |     }
784 |     std::mem::forget(tmp);
785 | }
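The empty-root check performed by `del` above, and repeated in `test_delete_all` below, is compact enough to misread: the `u16` at `FIRST_HEAD` is the offset of the first skip-list entry on the page, and the `u64` two words later is the child pointer that the comments call the "next page". When the entry list is empty but that pointer is non-zero, the root can be collapsed onto its child, shrinking the tree by one level. A hypothetical helper, not in the crate, making the test explicit:

    // Hypothetical helper mirroring the repeated `next == NIL && next_page != 0`
    // check: returns the child to promote when the root has become empty.
    unsafe fn collapsible_root(page: &MutPage) -> Option<u64> {
        let head = page.offset(FIRST_HEAD as isize);
        let next = u16::from_le(*(head as *const u16));
        let next_page = u64::from_le(*((head as *const u64).offset(2)));
        if next == NIL && next_page != 0 { Some(next_page) } else { None }
    }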
786 |
787 |
788 | #[test]
789 | fn test_delete_root() {
790 |     extern crate tempdir;
791 |     extern crate rand;
792 |     extern crate env_logger;
793 |     use super::{Env};
794 |
795 |     use rand::{Rng};
796 |     let mut rng = rand::thread_rng();
797 |
798 |     env_logger::init().unwrap_or(());
799 |     let dir = tempdir::TempDir::new("pijul").unwrap();
800 |     let env = Env::new(dir.path(), 1000).unwrap();
801 |     let mut txn = env.mut_txn_begin().unwrap();
802 |
803 |     let mut page = txn.alloc_page().unwrap();
804 |     page.init();
805 |     let tmp = tempdir::TempDir::new("pijul").unwrap();
806 |     unsafe {
807 |         let tmp_path = tmp.path();
808 |         debug!("tmp_path: {:?}", tmp_path);
809 |         let mut insertions = Vec::new();
810 |         for _ in 0..200 {
811 |             //println!("i={:?}", i);
812 |             let key_: String = rng
813 |                 .gen_ascii_chars()
814 |                 .take(20)
815 |                 .collect();
816 |             //println!("key = {:?}", key);
817 |             let value_: String = rng
818 |                 .gen_ascii_chars()
819 |                 .take(20)
820 |                 .collect();
821 |             {
822 |                 let key = key_.as_bytes();
823 |                 let value = value_.as_bytes();
824 |                 let value = if value.len() > VALUE_SIZE_THRESHOLD {
825 |                     super::put::alloc_value(&mut txn, value).unwrap()
826 |                 } else {
827 |                     UnsafeValue::S { p:value.as_ptr(), len:value.len() as u32 }
828 |                 };
829 |                 match insert(&mut rng, &mut txn, Cow::from_mut_page(page), key, value, 0, false) {
830 |                     Ok(Res::Ok { page:page_ }) => {
831 |                         page = page_
832 |                     },
833 |                     Ok(Res::Underfull { page:page_, .. }) => {
834 |                         page = page_.unwrap_mut();
835 |                     },
836 |                     Ok(Res::Nothing { page:page_ }) => {
837 |                         //println!("already present");
838 |                         page = page_.unwrap_mut()
839 |                     },
840 |                     Ok(x) => {
841 |                         page = root_split(&mut rng, &mut txn, x).unwrap()
842 |                     },
843 |                     _ => panic!("")
844 |                 }
845 |             }
846 |             insertions.push((key_, value_))
847 |         }
848 |         let db = Db { root_num: -1, root: page.page_offset() };
849 |         txn.debug(&[&db], tmp_path.join("before"), false, false);
850 |         // Delete an entry in the root.
851 |         {
852 |             debug!("now deleting from the root page");
853 |             let current = page.offset(0) as *mut u16;
854 |             let next_off = u16::from_le(*(current.offset(0)));
855 |             let next = page.offset(next_off as isize);
856 |             let (key, value) = read_key_value(next as *const u8);
857 |             debug!("deleting key {:?}", std::str::from_utf8(key).unwrap());
858 |             match delete(&mut rng, &mut txn, Cow::from_mut_page(page), C::KV { key:key, value:value }, false) {
859 |                 Ok(Res::Ok { page:page_, .. }) => {
860 |                     page = page_
861 |                 },
862 |                 _ => panic!("")
863 |             }
864 |         }
865 |         debug!("delete done, debugging");
866 |         let db = Db { root_num: -1, root: page.page_offset() };
867 |         txn.debug(&[&db], tmp_path.join("after"), false, false);
868 |     }
869 |     std::mem::forget(tmp);
870 | }
871 |
872 | #[cfg(test)]
873 | enum Sorted {
874 |     No, Incr, Decr
875 | }
876 |
877 | #[cfg(test)]
878 | fn test_delete_all(n:usize, keysize:usize, valuesize:usize, sorted:Sorted) {
879 |     extern crate tempdir;
880 |     extern crate rand;
881 |     extern crate env_logger;
882 |     use super::{Env};
883 |
884 |     use rand::{Rng};
885 |     let mut rng = rand::thread_rng();
886 |
887 |     env_logger::init().unwrap_or(());
888 |     let dir = tempdir::TempDir::new("pijul").unwrap();
889 |     let env = Env::new(dir.path(), 1000).unwrap();
890 |     let mut txn = env.mut_txn_begin().unwrap();
891 |
892 |     let mut page = txn.alloc_page().unwrap();
893 |     page.init();
894 |     let tmp = tempdir::TempDir::new("pijul").unwrap();
895 |     let tmp_path = tmp.path().to_path_buf();
896 |     std::mem::forget(tmp);
897 |     unsafe {
898 |         debug!("tmp_path: {:?}", tmp_path);
899 |         let mut insertions = Vec::new();
900 |         for i in 0..n {
901 |             //println!("i={:?}", i);
902 |             let key_: String = rng
903 |                 .gen_ascii_chars()
904 |                 .take(keysize)
905 |                 .collect();
906 |             //println!("key = {:?}", key);
907 |             let value_: String = rng
908 |                 .gen_ascii_chars()
909 |                 .take(valuesize)
910 |                 .collect();
911 |             let value = {
912 |                 let key = key_.as_bytes();
913 |                 let value = value_.as_bytes();
914 |                 let value = if value.len() > VALUE_SIZE_THRESHOLD {
915 |                     super::put::alloc_value(&mut txn, value).unwrap()
916 |                 } else {
917 |                     UnsafeValue::S { p:value.as_ptr(), len:value.len() as u32 }
918 |                 };
919 |                 match insert(&mut rng, &mut txn, Cow::from_mut_page(page), key, value, 0, false).unwrap() {
920 |                     Res::Ok { page:page_ } => {
921 |                         page = page_
922 |                     },
923 |                     Res::Underfull { .. } => {
924 |                         unreachable!()
925 |                     },
926 |                     Res::Nothing { page:page_ } => {
927 |                         //println!("already present");
928 |                         page = page_.unwrap_mut()
929 |                     },
930 |                     x => {
931 |                         debug!("root split");
932 |                         page = root_split(&mut rng, &mut txn, x).unwrap()
933 |                     },
934 |                 }
935 |                 value
936 |             };
937 |             debug!("put i = {:?}", i);
938 |             debug!("key = {:?}", key_);
939 |
940 |             let db = Db { root_num: -1, root: page.page_offset() };
941 |             txn.debug(&[&db], (&tmp_path).join(format!("before_{}", i)), false, false);
942 |
943 |             insertions.push((key_, value_, value))
944 |         }
945 |         let db = Db { root_num: -1, root: page.page_offset() };
946 |         txn.debug(&[&db], (&tmp_path).join("before"), false, false);
947 |
948 |         match sorted {
949 |             Sorted::No => {},
950 |             Sorted::Incr => {
951 |                 insertions.sort_by(|&(ref a,_,_), &(ref b,_,_)| a.cmp(b))
952 |             },
953 |             Sorted::Decr => {
954 |                 insertions.sort_by(|&(ref a,_,_), &(ref b,_,_)| b.cmp(a))
955 |             }
956 |         }
957 |         for i in 0..insertions.len() {
958 |
959 |             let (ref key, ref value, ref val) = insertions[i];
960 |
961 |             println!("i = {:?}", i);
962 |             println!("key = {:?}", key);
963 |             debug!("i = {:?}", i);
964 |             debug!("key = {:?}", key);
965 |             debug!("allocated = {:?}", val);
966 |             let key = key.as_bytes();
967 |             let value = value.as_bytes();
968 |             let value = UnsafeValue::S { p:value.as_ptr(), len:value.len() as u32 };
969 |             match delete(&mut rng, &mut txn, Cow::from_mut_page(page), C::KV { key:key, value:value }, false).unwrap() {
970 |                 Res::Ok { page:page_ } => {
971 |                     // If this page is empty, replace with next page.
972 |                     let next = u16::from_le(*(page_.offset(FIRST_HEAD as isize) as *const u16));
973 |                     let next_page = u64::from_le(*((page_.offset(FIRST_HEAD as isize) as *const u64).offset(2)));
974 |                     if next == NIL && next_page != 0 {
975 |                         page = txn.load_cow_page(next_page).unwrap_mut()
976 |                     } else {
977 |                         page = page_
978 |                     }
979 |                 },
980 |                 Res::Underfull { page:page_, delete, merged, .. } => {
981 |                     println!("underfull, deleting {:?}", &delete[..]);
982 |                     let mut new_levels = [0;N_LEVELS];
983 |                     let page_ = cow_pinpointing(&mut rng, &mut txn, page_,
984 |                                                 &delete[..],
985 |                                                 &mut new_levels[..],
986 |                                                 true, false,
987 |                                                 merged).unwrap();
988 |
989 |                     // If this page is empty, replace with next page.
990 |                     let next = u16::from_le(*(page_.offset(FIRST_HEAD as isize) as *const u16));
991 |                     let next_page = u64::from_le(*((page_.offset(FIRST_HEAD as isize) as *const u64).offset(2)));
992 |                     if next == NIL && next_page != 0 {
993 |                         page = txn.load_cow_page(next_page).unwrap_mut()
994 |                     } else {
995 |                         page = page_
996 |                     }
997 |                 },
998 |                 Res::Nothing { .. } => unreachable!(),
999 |                 x => page = root_split(&mut rng, &mut txn, x).unwrap(),
1000 |             }
1001 |             let db = Db { root_num: -1, root: page.page_offset() };
1002 |             txn.debug(&[&db], (&tmp_path).join(format!("after_{}", i)), false, false);
1003 |         }
1004 |         debug!("delete done, debugging");
1005 |
1006 |         let db = Db { root_num: -1, root: page.page_offset() };
1007 |         for _ in txn.iter(&db, b"", None) {
1008 |             panic!("Database not empty")
1009 |         }
1010 |         //txn.debug(&[&db], format!("/tmp/after"), false, false);
1011 |     }
1012 | }
1013 |
1014 | #[test]
1015 | fn test_delete_all_sorted_20_() {
1016 |     test_delete_all(20, 10, 20, Sorted::Incr)
1017 | }
1018 | #[test]
1019 | fn test_delete_all_decr_20_() {
1020 |     test_delete_all(20, 100, 20, Sorted::Decr)
1021 | }
1022 | #[test]
1023 | fn test_delete_all_unsorted_20_() {
1024 |     test_delete_all(20, 200, 200, Sorted::No)
1025 | }
1026 |
1027 | #[test]
1028 | fn test_delete_all_sorted_200() {
1029 |     test_delete_all(200, 100, 200, Sorted::Incr)
1030 | }
1031 | #[test]
1032 | fn test_delete_all_decr_200() {
1033 |     test_delete_all(200, 100, 200, Sorted::Decr)
1034 | }
1035 | #[test]
1036 | fn test_delete_all_unsorted_200() {
1037 |     test_delete_all(200, 200, 200, Sorted::No)
1038 | }
1039 |
1040 | #[test]
1041 | fn test_delete_all_unsorted_800() {
1042 |     test_delete_all(800, 200, 200, Sorted::No)
1043 | }
1044 |
1045 |
1046 | #[test]
1047 | fn test_delete_all_large() {
1048 |     test_delete_all(200, 200, 2000, Sorted::No)
1049 | }
1050 |
1051 | #[test]
1052 | fn test_delete_all_really_large() {
1053 |     test_delete_all(200, 200, 10000, Sorted::No)
1054 | }
1055 |
--------------------------------------------------------------------------------