├── .github └── workflows │ └── rust.yml ├── .gitignore ├── .idea ├── .gitignore ├── badger-rs.iml ├── modules.xml └── vcs.xml ├── Cargo.toml ├── LICENSE ├── README.md ├── benches └── my_benchmark.rs ├── build.rs ├── examples └── badger.rs ├── lock.txt └── src ├── backup.rs ├── compaction.rs ├── doc └── write.md ├── event └── mod.rs ├── iterator.rs ├── kv.rs ├── kv_test.rs ├── level_handler.rs ├── levels.rs ├── lib.rs ├── lock.txt ├── log_file.rs ├── manifest.rs ├── options └── mod.rs ├── pb ├── backup.proto ├── backup.rs ├── badgerpb3.proto ├── badgerpb3.rs └── mod.rs ├── skl ├── alloc.rs ├── arena.rs ├── cursor.rs ├── mod.rs ├── node.rs └── skip.rs ├── st_manager.rs ├── table ├── builder.rs ├── iterator.rs ├── mod.rs ├── table.rs └── tests.rs ├── test_data └── vlog_file.text ├── test_util.rs ├── types.rs ├── value_log.rs ├── value_log_tests.rs └── y ├── codec.rs ├── iterator.rs ├── merge_iterator.rs ├── metrics.rs └── mod.rs /.github/workflows/rust.yml: -------------------------------------------------------------------------------- 1 | name: Rust 2 | 3 | on: 4 | push: 5 | branches: [ "main" ] 6 | pull_request: 7 | branches: [ "main" ] 8 | 9 | env: 10 | CARGO_TERM_COLOR: always 11 | 12 | jobs: 13 | build: 14 | 15 | runs-on: ubuntu-latest 16 | 17 | steps: 18 | - uses: actions/checkout@v3 19 | - name: Install latest nightly 20 | uses: actions-rs/toolchain@v1 21 | with: 22 | toolchain: nightly 23 | override: true 24 | - name: Build 25 | run: cargo build --verbose 26 | - name: Run tests 27 | run: cargo test --verbose 28 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Generated by Cargo 2 | # will have compiled files and executables 3 | /target/ 4 | 5 | # Remove Cargo.lock from gitignore if creating an executable, leave it for libraries 6 | # More information here https://doc.rust-lang.org/cargo/guide/cargo-toml-vs-cargo-lock.html 7 | 
Cargo.lock 8 | 9 | # These are backup files generated by rustfmt 10 | **/*.rs.bk 11 | -------------------------------------------------------------------------------- /.idea/.gitignore: -------------------------------------------------------------------------------- 1 | # Default ignored files 2 | /shelf/ 3 | /workspace.xml 4 | # Editor-based HTTP Client requests 5 | /httpRequests/ 6 | # Datasource local storage ignored files 7 | /dataSources/ 8 | /dataSources.local.xml 9 | -------------------------------------------------------------------------------- /.idea/badger-rs.iml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | -------------------------------------------------------------------------------- /.idea/modules.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /.idea/vcs.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "badger-rs" 3 | version = "0.1.0" 4 | edition = "2021" 5 | 6 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html 7 | 8 | [dependencies] 9 | serde = { version = "1.0.171", features = ["derive"] } 10 | serde_json = { version = "1.0.103", default-features = true, features = ["alloc"] } 11 | anyhow = "1.0.72" 12 | thiserror = "1.0.43" 13 | tokio = { version = "1.29.1", features = ["full", "tracing"] } 14 | byteorder = "1.4.3" 15 | rand = "0.8.5" 16 | maligned = "0.2.1" 17 | atomic = "0.5.3" 18 | tabled = { version = "0.12.2", features = ["ansi-str", "color"] } 19 | memmap = "0.7.0" 20 | bytes = "1.4.0" 21 | 
bloom = "0.3.2" 22 | growable-bloom-filter = { version = "2.0.1", features = ["nightly"] } 23 | filename = "0.1.1" 24 | num_cpus = "1.16.0" 25 | threads_pool = "0.2.6" 26 | crc32fast = "1.3.2" 27 | async-trait = "0.1.71" 28 | fmmap = { version = "0.3.2", features = ["tokio-async"] } 29 | parking_lot = "0.12.1" 30 | bitflags = "2.3.3" 31 | libc = "0.2.147" 32 | log = { version = "0.4.19", features = ["kv_unstable", "kv_unstable_serde", "kv_unstable_sval"] } 33 | async-channel = "1.9.0" 34 | file-guard = "0.1.0" 35 | fs2 = "0.4.3" 36 | awaitgroup = "0.7.0" 37 | range-lock = "0.2.3" 38 | tracing = "0.1.37" 39 | drop_cell = "0.0.0" 40 | walkdir = "2.3.3" 41 | crossbeam-epoch = "0.9.15" 42 | tokio-context = "0.1.3" 43 | dyn-clone = "1.0.12" 44 | eieio = "1.0.0" 45 | either = "1.8.1" 46 | enum-unitary = "0.5.0" 47 | atom_box = "0.1.2" 48 | console-subscriber = "0.1.10" 49 | uuid = { version = "1.4.1", features = ["v5", "v4"] } 50 | winapi = "0.3.9" 51 | itertools = "0.11.0" 52 | tokio-metrics = "0.2.2" 53 | metrics = "0.21.1" 54 | metrics-prometheus = "0.4.1" 55 | prometheus = "0.13.3" 56 | lazy_static = "1.4.0" 57 | getset = "0.1.2" 58 | tokio-stream = "0.1.14" 59 | async-stream = "0.3.5" 60 | futures-core = "0.3.28" 61 | backtrace-on-stack-overflow = "0.3.0" 62 | protobuf = { version = "3.0.0-alpha.2", features = ["with-bytes"] } 63 | [dev-dependencies] 64 | tracing-subscriber = "0.3.17" 65 | tracing-log = "0.1.3" 66 | chrono = "0.4.26" 67 | env_logger = "0.10.0" 68 | console_log = { version = "1.0.0", features = ["color"] } 69 | itertools = "0.11.0" 70 | tokio-metrics = { version = "0.2.2", default-features = false } 71 | tokio = { version = "1.29.1", features = ["full", "rt", "time", "macros", "test-util"] } 72 | criterion = { version = "0.5.1", features = ["tokio"] } 73 | 74 | [build] 75 | rustflags = ["--cfg", "tokio_unstable"] 76 | 77 | [build-dependencies] 78 | protoc-rust = "3.0.0-alpha.2" 79 | 80 | [[bench]] 81 | name = "my_benchmark" 82 | harness = false 83 | 
84 | [profile.dev] 85 | debug-assertions = false 86 | 87 | [profile.release] 88 | codegen-units=1 89 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 
34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. 
Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. 
In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. 
We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # badger-rs 2 | ![example workflow name](https://github.com/laohanlinux/badger-rs/workflows/Rust/badge.svg) 3 | 4 | Badger is based on [WiscKey paper by University of Wisconsin, Madison](https://www.usenix.org/system/files/conference/fast16/fast16-papers-lu.pdf). 
5 | 6 | Thanks to [dgraph-io/badger](https://github.com/dgraph-io/badger) -------------------------------------------------------------------------------- /benches/my_benchmark.rs: -------------------------------------------------------------------------------- 1 | #![allow(unused)] 2 | fn main() { 3 | use criterion::BenchmarkId; 4 | use criterion::Criterion; 5 | use criterion::{criterion_group, criterion_main}; 6 | 7 | // This is a struct that tells Criterion.rs to use the "futures" crate's current-thread executor 8 | use criterion::async_executor::AsyncExecutor; 9 | 10 | // Here we have an async function to benchmark 11 | async fn do_something(size: usize) { 12 | // Do something async with the size 13 | } 14 | 15 | fn from_elem(c: &mut Criterion) { 16 | let size: usize = 1024; 17 | 18 | c.bench_with_input(BenchmarkId::new("input_example", size), &size, |b, &s| { 19 | // Insert a call to `to_async` to convert the bencher to async mode. 20 | // The timing loops are the same as with the normal bencher.
21 | // b.to_async(FuturesExecutor).iter(|| do_something(s)); 22 | }); 23 | } 24 | 25 | criterion_group!(benches, from_elem); 26 | criterion_main!(benches); 27 | } 28 | -------------------------------------------------------------------------------- /build.rs: -------------------------------------------------------------------------------- 1 | extern crate protoc_rust; 2 | 3 | fn main() { 4 | // Code generation is currently disabled (the Codegen call below is commented out). Manual equivalent: protoc --rust_out=src/pb src/pb/badgerpb3.proto 5 | //protoc_rust::Codegen::new() 6 | // .out_dir("src/pb") 7 | //.inputs(&["src/pb/badgerpb3.proto", "src/pb/backup.proto"]) 8 | // .run() 9 | // .expect("Running protoc failed"); 10 | } 11 | -------------------------------------------------------------------------------- /examples/badger.rs: -------------------------------------------------------------------------------- 1 | #[tokio::main] 2 | async fn main() { 3 | let env = tracing_subscriber::EnvFilter::from_default_env(); 4 | tracing_subscriber::FmtSubscriber::builder() 5 | .with_env_filter(env) 6 | .try_init() 7 | .unwrap(); 8 | let opt = badger_rs::Options::default(); 9 | let kv = badger_rs::KV::open(opt).await.unwrap(); 10 | kv.set( 11 | b"hello word".to_vec(), 12 | b">>>>>I LOVE YOU!<<<<<".to_vec(), 13 | 0x0, 14 | ) 15 | .await 16 | .unwrap(); 17 | 18 | let got = kv.get(b"hello word").await.unwrap(); 19 | println!("{}", String::from_utf8_lossy(&got)); 20 | } 21 | -------------------------------------------------------------------------------- /lock.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/laohanlinux/badger-rs/00bbe70da1f4b0fe7d52ffbdf8da91867a147834/lock.txt -------------------------------------------------------------------------------- /src/backup.rs: -------------------------------------------------------------------------------- 1 | use std::io::Write; 2 | use byteorder::{LittleEndian, WriteBytesExt}; 3 | use protobuf::Message; 4 | use crate::pb::backup::KVPair; 5 | 6 | pub fn
write_to(entry: &KVPair, wt: &mut W) -> crate::Result<()> where W: Write { 7 | let buf = entry.write_to_bytes().unwrap(); 8 | wt.write_u64::(buf.len() as u64)?; 9 | wt.write_all(&buf)?; 10 | Ok(()) 11 | } 12 | -------------------------------------------------------------------------------- /src/compaction.rs: -------------------------------------------------------------------------------- 1 | use crate::hex_str; 2 | use crate::levels::CompactDef; 3 | use crate::table::table::Table; 4 | 5 | use log::{error, info, warn}; 6 | use parking_lot::lock_api::{RwLockReadGuard, RwLockWriteGuard}; 7 | use parking_lot::{RawRwLock, RwLock}; 8 | use std::fmt::{Display, Formatter}; 9 | use std::sync::atomic::{AtomicU64, Ordering}; 10 | use std::sync::Arc; 11 | 12 | #[derive(Debug)] 13 | pub(crate) struct CompactStatus { 14 | // One *LevelCompactStatus* per level (indexed by level number); each tracks the *KeyRange*s of compactions currently registered on that level 15 | pub(crate) levels: RwLock>, 16 | } 17 | 18 | impl Default for CompactStatus { 19 | fn default() -> Self { 20 | CompactStatus { 21 | levels: RwLock::new(vec![]), 22 | } 23 | } 24 | } 25 | 26 | impl CompactStatus { 27 | // Check whether this *CompactDef* can run, i.e. that neither of its key ranges overlaps a compaction 28 | // that is already registered. If it can run, its ranges are recorded in this CompactStatus and true is returned; otherwise false. 29 | pub(crate) fn compare_and_add(&self, cd: &CompactDef) -> bool { 30 | let level = cd.this_level.level(); 31 | assert!( 32 | level + 1 < self.rl().len(), 33 | "Got level {}, max level {}", 34 | level, 35 | self.rl().len() 36 | ); 37 | let lc = self.rl(); 38 | let this_level = lc.get(level).unwrap(); 39 | let next_level = lc.get(level + 1).unwrap(); 40 | if this_level.overlaps_with(&cd.this_range) { 41 | return false; 42 | } 43 | if next_level.overlaps_with(&cd.next_range) { 44 | return false; 45 | } 46 | 47 | // Check whether this level really needs compaction or not. Otherwise, we'll end up 48 | // running parallel compactions for the same level.
49 | // *NOTE*: We can directly call this_level.total_size, because we already have acquired a read lock 50 | // over this and the next level. The subtraction saturates so that a del_size larger than the level's total size counts as 0 remaining bytes instead of underflowing u64. 51 | if cd.this_level.get_total_size().saturating_sub(this_level.get_del_size()) 52 | < cd.this_level.get_max_total_size() 53 | { 54 | log::info!( 55 | "skip the compaction, top_size:{}, bot_size:{}, max_size:{}", 56 | cd.this_level.get_total_size(), 57 | cd.next_level.get_total_size(), 58 | cd.this_level.get_max_total_size() 59 | ); 60 | return false; 61 | } 62 | this_level.add(cd.this_range.clone()); 63 | next_level.add(cd.next_range.clone()); 64 | this_level.incr_del_size(cd.this_size.load(Ordering::Relaxed)); 65 | true 66 | } 67 | 68 | // Delete CompactDef: unregister its key ranges from this and the next level and subtract its size from del_size. Panics if the level is out of range or this level's range was never registered. 69 | pub(crate) fn delete(&self, cd: &CompactDef) { 70 | let levels = self.wl(); 71 | let level = cd.this_level.level(); 72 | assert!( 73 | level + 1 < levels.len(), 74 | "Got level {}, Max levels {}", 75 | level, 76 | levels.len() 77 | ); 78 | 79 | let this_level = levels.get(level).unwrap(); 80 | let next_level = levels.get(level + 1).unwrap(); 81 | // Decr delete size after compacted.
82 | this_level.decr_del_size(cd.this_size.load(Ordering::Relaxed)); 83 | let mut found = this_level.remove(&cd.this_range); 84 | // this level must contain the KeyRange, because compare_and_add registers it before a compaction runs 85 | assert!(found, "{}", this_level); 86 | found = next_level.remove(&cd.next_range) && found; 87 | if !found { 88 | let this_kr = &cd.this_range; 89 | let next_kr = &cd.next_range; 90 | warn!("Looking for: [{}] in this level.", this_kr,); 91 | warn!("This Level: {}", level); 92 | warn!("Looking for: [{}] in next level.", next_kr); 93 | warn!("Next Level: {}", level + 1); 94 | warn!("KeyRange not found"); 95 | warn!("Looking for seek k range"); 96 | warn!("{}, {}", cd.this_range, cd.next_range); 97 | } 98 | } 99 | 100 | // Return true if any range registered on `level` overlaps with `this`, otherwise false. Panics if `level` is out of bounds. 101 | pub(crate) fn overlaps_with(&self, level: usize, this: &KeyRange) -> bool { 102 | let cstatus = &self.rl()[level]; 103 | let overlaps = cstatus.overlaps_with(this); 104 | #[cfg(test)] 105 | log::info!( 106 | "level{} compact status compare, {:?}, dst: {:?}, overlaps: {}", 107 | level, 108 | cstatus.rl(), 109 | this, 110 | overlaps 111 | ); 112 | overlaps 113 | } 114 | 115 | // Return the del_size counter of the given level. Panics if `level` is out of bounds. 116 | pub(crate) fn del_size(&self, level: usize) -> u64 { 117 | self.rl()[level].get_del_size() 118 | } 119 | 120 | // Return the per-level compaction statuses behind a write lock guard 121 | pub(crate) fn wl(&self) -> RwLockWriteGuard<'_, RawRwLock, Vec> { 122 | self.levels.write() 123 | } 124 | 125 | // Return the per-level compaction statuses behind a read lock guard 126 | pub(crate) fn rl(&self) -> RwLockReadGuard<'_, RawRwLock, Vec> { 127 | self.levels.read() 128 | } 129 | 130 | pub(crate) fn to_log(&self) { 131 | let status = self.rl(); 132 | info!("Compact levels, count:{}", status.len()); 133 | for level in status.iter().enumerate() { 134 | info!("[{}] {}", level.0, level.1.to_string()) 135 | } 136 | } 137 | } 138 | 139 | // Every level compacted status(ranges).
140 | // del_size: running total of the sizes added by compare_and_add for this level; delete subtracts the same amount again 141 | // when it removes the matching KeyRange, so ranges and del_size shrink together after a compaction. 142 | #[derive(Clone, Debug)] 143 | pub(crate) struct LevelCompactStatus { 144 | ranges: Arc>>, 145 | del_size: Arc, 146 | } 147 | 148 | impl Default for LevelCompactStatus { 149 | fn default() -> Self { 150 | LevelCompactStatus { 151 | ranges: Arc::new(RwLock::new(Vec::new())), 152 | del_size: Arc::new(AtomicU64::new(0)), 153 | } 154 | } 155 | } 156 | 157 | impl Display for LevelCompactStatus { 158 | fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { 159 | let ranges = self 160 | .rl() 161 | .iter() 162 | .map(|kr| kr.to_string()) 163 | .collect::>() 164 | .join(","); 165 | let del_size = self.get_del_size(); 166 | f.debug_struct("LevelCompactStatus") 167 | .field("ranges", &ranges) 168 | .field("del_size", &del_size) 169 | .finish() 170 | } 171 | } 172 | 173 | impl LevelCompactStatus { 174 | // returns true if any range in self.ranges overlaps dst, otherwise returns false 175 | fn overlaps_with(&self, dst: &KeyRange) -> bool { 176 | self.rl().iter().any(|kr| kr.overlaps_with(dst)) 177 | } 178 | 179 | // remove every range equal to dst (per KeyRange's PartialEq) from self.ranges; returns true if at least one was removed 180 | pub(crate) fn remove(&self, dst: &KeyRange) -> bool { 181 | let mut rlock = self.wl(); 182 | let len = rlock.len(); 183 | rlock.retain(|r| r != dst); 184 | len > rlock.len() 185 | } 186 | 187 | // append dst to self.ranges (no overlap or duplicate check is done here) 188 | fn add(&self, dst: KeyRange) { 189 | self.wl().push(dst); 190 | } 191 | 192 | pub(crate) fn get_del_size(&self) -> u64 { 193 | self.del_size.load(Ordering::Acquire) 194 | } 195 | 196 | fn incr_del_size(&self, n: u64) { 197 | self.del_size.fetch_add(n, Ordering::Release); 198 | } 199 | 200 | fn decr_del_size(&self, n: u64) { 201 | self.del_size.fetch_sub(n, Ordering::Release); 202 | } 203 | 204 | fn wl(&self) -> RwLockWriteGuard<'_, RawRwLock, Vec> { 205 | self.ranges.write() 206 | } 207 | 208 | fn rl(&self) -> RwLockReadGuard<'_, RawRwLock, Vec> { 209
| self.ranges.read() 210 | } 211 | } 212 | 213 | // Closed key range [left, right]. When `inf` is `true` the range covers every key and overlaps_with ignores left/right. 214 | #[derive(Clone, Default, Debug)] 215 | pub(crate) struct KeyRange { 216 | pub(crate) left: Vec, 217 | // TODO zero Copy 218 | pub(crate) right: Vec, 219 | pub(crate) inf: bool, 220 | } 221 | 222 | impl PartialEq for KeyRange { 223 | fn eq(&self, other: &Self) -> bool { 224 | self.equals(other) 225 | } 226 | } 227 | 228 | impl Display for KeyRange { 229 | fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { 230 | write!( 231 | f, 232 | "", 233 | hex_str(&self.left), 234 | hex_str(&self.right), 235 | self.inf 236 | ) 237 | } 238 | } 239 | 240 | // The infinite range: includes all keys 241 | pub(crate) const INFO_RANGE: KeyRange = KeyRange { 242 | left: vec![], 243 | right: vec![], 244 | inf: true, 245 | }; 246 | 247 | impl KeyRange { 248 | // Build the KeyRange spanning the smallest and the biggest key over all tables; panics if tables is empty 249 | pub(crate) fn get_range(tables: &Vec) -> KeyRange { 250 | assert!(!tables.is_empty()); 251 | let mut smallest = tables[0].smallest(); 252 | let mut biggest = tables[0].biggest(); 253 | for i in 1..tables.len() { 254 | if tables[i].smallest() < smallest { 255 | smallest = tables[i].smallest(); 256 | } 257 | if tables[i].biggest() > biggest { 258 | biggest = tables[i].biggest(); 259 | } 260 | } 261 | KeyRange { 262 | left: smallest.to_vec(), 263 | right: biggest.to_vec(), 264 | inf: false, 265 | } 266 | } 267 | 268 | // Two ranges are equal only when left, right and inf all match between self and other 269 | pub(crate) fn equals(&self, other: &KeyRange) -> bool { 270 | self.left == other.left && self.right == other.right && self.inf == other.inf 271 | } 272 | 273 | // Check for overlap. *Notice*: if either range is inf, the two ranges always overlap.
274 | pub(crate) fn overlaps_with(&self, other: &KeyRange) -> bool { 275 | if self.inf || other.inf { 276 | return true; 277 | } 278 | 279 | // other lies entirely to the left of self: ---[other_left, other_right]--[self] 280 | if self.left > other.right { 281 | return false; 282 | } 283 | // other lies entirely to the right of self: ---[self]--[other_left, other_right] 284 | if self.right < other.left { 285 | return false; 286 | } 287 | true 288 | } 289 | } 290 | #[cfg(test)] 291 | mod tests { 292 | use crate::compaction::{KeyRange, INFO_RANGE}; 293 | 294 | #[test] 295 | fn key_range() { 296 | let mut v = vec![KeyRange { 297 | left: vec![], 298 | right: vec![], 299 | inf: true, 300 | }]; 301 | let cd = INFO_RANGE; 302 | v.retain(|kr| kr != &cd); 303 | assert!(v.is_empty()); 304 | let tests = vec![vec![2, 20], vec![30, 50], vec![70, 80]]; 305 | 306 | let inputs = vec![ 307 | vec![0, 1], 308 | vec![81, 100], 309 | vec![21, 25], 310 | vec![29, 40], 311 | vec![40, 60], 312 | vec![21, 51], 313 | vec![21, 100], 314 | vec![0, 200], 315 | vec![0, 70], 316 | vec![70, 80], 317 | ]; 318 | 319 | for (i, arg) in inputs.iter().enumerate() { 320 | let left = tests.binary_search_by(|probe| probe[1].cmp(&arg[0])); 321 | let left = left.unwrap_or_else(|n| n); 322 | let right = tests.binary_search_by(|probe| probe[0].cmp(&arg[1])); 323 | let right = right.map(|n| n + 1).unwrap_or_else(|n| n); 324 | println!("{}, {:?}, {:?}", i, left, right); 325 | } 326 | } 327 | } 328 | -------------------------------------------------------------------------------- /src/doc/write.md: -------------------------------------------------------------------------------- 1 | Put Key 2 | 3 | ```mermaid 4 | %% Example of sequence diagram 5 | sequenceDiagram 6 | actor KV 7 | participant WriteCh 8 | actor FlushCh 9 | KV-->>WriteCh: Async Send Req 10 | activate WriteCh 11 | alt Inner Data Transfer 12 | WriteCh-->>WriteCh: 1. Call writeRequests[Mult Reqs] 13 | WriteCh -->>WriteCh: 2. Write Into Vlog, Fill Ptrs 14 | WriteCh -)WriteCh: 3. Check ensureRoomForWrite 15 | WriteCh -->>FlushCh: 4.
Send flushTask{s.mt, s.vptr} to FlushCh 16 | Note right of WriteCh: 1) vlog.sync(): Ensure value log is synced to disk so this memtable's contents wouldn't be lost.
2) s.imm = append(s.imm, s.mt): We manage to push this task. Let's modify imm.
3) s.mt = skl.NewSkiplist(arenaSize(&s.opt)): New memtable is empty. We certainly have room. 17 | WriteCh -->>WriteCh: 5. If not pass 3, writeToLSM 18 | WriteCh-->>WriteCh: 6. updateOffset [update latest Ptr] 19 | end 20 | WriteCh-->> KV: Async Return Req 21 | deactivate WriteCh 22 | activate FlushCh 23 | FlushCh -->> FlushCh: Receive FlushTask From 4 24 | FlushCh -->> FlushCh: ft.mt is nil ? and ft.vptr.IsZero()? Put Offset for replay 25 | FlushCh -->> FlushCh: Create a new table, writeLevel0Table and addLevel0Table 26 | deactivate FlushCh 27 | ``` 28 | 29 | -------------------------------------------------------------------------------- /src/event/mod.rs: -------------------------------------------------------------------------------- 1 | use crate::table::table::Table; 2 | use lazy_static::lazy_static; 3 | use prometheus::{Gauge, IntCounter, IntCounterVec, IntGauge, IntGaugeVec, Opts, Registry}; 4 | use std::fmt; 5 | use std::fmt::Formatter; 6 | use std::time::{Duration, Instant}; 7 | 8 | lazy_static! 
{ 9 | static ref EV: EvMetrics = EvMetrics { 10 | lsm_size: IntGaugeVec::new( 11 | prometheus::Opts::new("badger_lsm_size_bytes", "lsm size bytes by direct"), 12 | &["direct"] 13 | ) 14 | .unwrap(), 15 | vlog_size: IntGauge::new("vlog_size", "vlog size bytes").unwrap(), 16 | pending_writes: IntGauge::new("pending_writes_total", "pending writes total").unwrap(), 17 | num_reads: IntCounter::new("num_reads", "number of reads").unwrap(), 18 | num_writes: IntCounter::new("num_writes", "number of writes").unwrap(), 19 | num_bytes_read: IntCounter::new("num_bytes_read", "bytes of read").unwrap(), 20 | num_bytes_written: IntCounter::new("num_bytes_written", "bytes of written").unwrap(), 21 | num_lsm_gets: IntCounter::new("num_lsm_gets", "number of lsm gets").unwrap(), 22 | num_lsm_bloom_hits: IntCounter::new("num_bloom_hits", "number of bloom hits").unwrap(), 23 | num_blocked_puts: IntCounter::new("num_blocked_hits", "number of blocked hits").unwrap(), 24 | num_mem_tables_gets: IntCounter::new("num_mem_tables", "number of the memtable gets") 25 | .unwrap(), 26 | num_gets: IntCounter::new("num_gets", "number of gets").unwrap(), 27 | num_puts: IntCounter::new("num_puts", "number of puts").unwrap(), 28 | block_hash_calc_cost: IntCounter::new( 29 | "block_hash_calc_cost", 30 | "block hash calc cost for bloom" 31 | ) 32 | .unwrap(), 33 | }; 34 | } 35 | 36 | #[derive(Debug)] 37 | pub struct EvMetrics { 38 | pub lsm_size: IntGaugeVec, 39 | pub vlog_size: IntGauge, 40 | pub pending_writes: IntGauge, 41 | 42 | /// These are cumulative 43 | pub num_reads: IntCounter, 44 | pub num_writes: IntCounter, 45 | pub num_bytes_read: IntCounter, 46 | pub num_bytes_written: IntCounter, 47 | pub num_lsm_gets: IntCounter, 48 | pub num_lsm_bloom_hits: IntCounter, 49 | pub num_gets: IntCounter, 50 | pub num_puts: IntCounter, 51 | pub num_blocked_puts: IntCounter, 52 | /// number of the memtable gets 53 | pub num_mem_tables_gets: IntCounter, 54 | pub block_hash_calc_cost: IntCounter, 55 | } 56 | 57 
| impl fmt::Display for EvMetrics { 58 | fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { 59 | use tabled::{Table, Tabled}; 60 | 61 | #[derive(Tabled)] 62 | struct KeyPair { 63 | label: String, 64 | value: String, 65 | } 66 | let mut kv = vec![]; 67 | kv.push(KeyPair { 68 | label: "num_reads".to_owned(), 69 | value: self.num_reads.get().to_string(), 70 | }); 71 | kv.push(KeyPair { 72 | label: "num_writes".to_owned(), 73 | value: self.num_writes.get().to_string(), 74 | }); 75 | kv.push(KeyPair { 76 | label: "num_bytes_read".to_owned(), 77 | value: self.num_bytes_read.get().to_string(), 78 | }); 79 | kv.push(KeyPair { 80 | label: "num_bytes_written".to_owned(), 81 | value: self.num_bytes_written.get().to_string(), 82 | }); 83 | kv.push(KeyPair { 84 | label: "num_lsm_gets".to_owned(), 85 | value: self.num_lsm_gets.get().to_string(), 86 | }); 87 | kv.push(KeyPair { 88 | label: "num_lsm_bloom_hits".to_owned(), 89 | value: self.num_lsm_bloom_hits.get().to_string(), 90 | }); 91 | kv.push(KeyPair { 92 | label: "num_gets".to_owned(), 93 | value: self.num_gets.get().to_string(), 94 | }); 95 | kv.push(KeyPair { 96 | label: "num_puts".to_owned(), 97 | value: self.num_puts.get().to_string(), 98 | }); 99 | kv.push(KeyPair { 100 | label: "num_blocked_puts".to_owned(), 101 | value: self.num_blocked_puts.get().to_string(), 102 | }); 103 | kv.push(KeyPair { 104 | label: "num_mem_tables_gets".to_owned(), 105 | value: self.num_mem_tables_gets.get().to_string(), 106 | }); 107 | kv.push(KeyPair { 108 | label: "block_hash_calc_cost".to_owned(), 109 | value: self.block_hash_calc_cost.get().to_string(), 110 | }); 111 | let table_str = Table::new(kv).to_string(); 112 | f.write_str(&table_str) 113 | } 114 | } 115 | 116 | pub fn get_metrics() -> &'static EvMetrics { 117 | &EV 118 | } 119 | -------------------------------------------------------------------------------- /src/iterator.rs: -------------------------------------------------------------------------------- 1 | use 
crate::iterator::PreFetchStatus::Prefetched; 2 | use crate::kv::_BADGER_PREFIX; 3 | use crate::types::{ArcRW, Channel, Closer, TArcMx, TArcRW}; 4 | use crate::{hex_str, ValueStruct, KV}; 5 | use crate::{ 6 | value_log::{MetaBit, ValuePointer}, 7 | Decode, MergeIterator, Result, Xiterator, EMPTY_SLICE, 8 | }; 9 | 10 | use atomic::Atomic; 11 | 12 | use std::fmt::{Debug, Display, Formatter, Pointer}; 13 | use std::future::Future; 14 | 15 | use std::pin::{pin, Pin}; 16 | 17 | use std::sync::atomic::Ordering; 18 | use std::sync::Arc; 19 | use std::{io::Cursor, sync::atomic::AtomicU64}; 20 | use tokio::io::AsyncWriteExt; 21 | use tokio::sync::{RwLockReadGuard, RwLockWriteGuard}; 22 | 23 | #[derive(Debug, PartialEq, Copy, Clone)] 24 | pub(crate) enum PreFetchStatus { 25 | Empty, 26 | Prefetched, 27 | } 28 | 29 | #[derive(Clone, Debug)] 30 | pub struct KVItem { 31 | inner: TArcRW, 32 | } 33 | 34 | impl From for KVItem { 35 | fn from(value: KVItemInner) -> Self { 36 | Self { 37 | inner: TArcRW::new(tokio::sync::RwLock::new(value)), 38 | } 39 | } 40 | } 41 | // impl Deref for KVItem { 42 | // type Target = tokio::sync::RwLock; 43 | // 44 | // fn deref(&self) -> &Self::Target { 45 | // self.inner.as_ref() 46 | // } 47 | // } 48 | 49 | impl KVItem { 50 | pub async fn key(&self) -> Vec { 51 | let inner = self.rl().await; 52 | inner.key().to_vec() 53 | } 54 | 55 | pub async fn value(&self) -> Result> { 56 | let inner = self.rl().await; 57 | inner.get_value().await 58 | } 59 | 60 | pub async fn has_value(&self) -> bool { 61 | let inner = self.rl().await; 62 | inner.has_value() 63 | } 64 | 65 | pub async fn counter(&self) -> u64 { 66 | let inner = self.rl().await; 67 | inner.counter() 68 | } 69 | 70 | pub async fn user_meta(&self) -> u8 { 71 | let inner = self.rl().await; 72 | inner.user_meta() 73 | } 74 | 75 | pub(crate) async fn rl(&self) -> RwLockReadGuard<'_, KVItemInner> { 76 | self.inner.read().await 77 | } 78 | 79 | pub(crate) async fn wl(&self) -> RwLockWriteGuard<'_, 
KVItemInner> { 80 | self.inner.write().await 81 | } 82 | } 83 | 84 | // Returned during iteration. Both the key() and value() output is only valid until 85 | // iterator.next() is called. 86 | #[derive(Clone)] 87 | pub(crate) struct KVItemInner { 88 | status: Arc>, 89 | kv: KV, 90 | key: Vec, 91 | // TODO, Opz memory 92 | vptr: Vec, 93 | value: TArcMx>, 94 | meta: u8, 95 | user_meta: u8, 96 | cas_counter: Arc, 97 | wg: Closer, 98 | err: Result<()>, 99 | } 100 | 101 | impl Display for KVItemInner { 102 | fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { 103 | f.debug_struct("kv") 104 | .field("key", &hex_str(&self.key)) 105 | .field("meta", &self.meta) 106 | .field("user_meta", &self.user_meta) 107 | .field("cas", &self.counter()) 108 | .finish() 109 | } 110 | } 111 | 112 | impl Debug for KVItemInner { 113 | fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { 114 | f.debug_struct("kv") 115 | .field("key", &hex_str(&self.key)) 116 | .field("meta", &self.meta) 117 | .field("user_meta", &self.user_meta) 118 | .field("cas", &self.counter()) 119 | .finish() 120 | } 121 | } 122 | 123 | impl KVItemInner { 124 | pub(crate) fn new(key: Vec, value: ValueStruct, kv: KV) -> KVItemInner { 125 | Self { 126 | status: Arc::new(Atomic::new(PreFetchStatus::Empty)), 127 | kv, 128 | key, 129 | value: TArcMx::new(Default::default()), 130 | vptr: value.value, 131 | meta: value.meta, 132 | user_meta: value.user_meta, 133 | cas_counter: Arc::new(AtomicU64::new(value.cas_counter)), 134 | wg: Closer::new("kv".to_owned()), 135 | err: Ok(()), 136 | } 137 | } 138 | 139 | // Returns the key. Remember to copy if you need to access it outside the iteration loop. 
140 | pub(crate) fn key(&self) -> &[u8] { 141 | &self.key 142 | } 143 | 144 | // Return value 145 | pub(crate) async fn get_value(&self) -> Result> { 146 | let ch = Channel::new(1); 147 | self.value(|value| { 148 | let tx = ch.tx(); 149 | let value = value.to_vec(); 150 | Box::pin(async move { 151 | tx.send(value).await.unwrap(); 152 | Ok(()) 153 | }) 154 | }) 155 | .await?; 156 | Ok(ch.recv().await.unwrap()) 157 | } 158 | 159 | // Value retrieves the value of the item from the value log. It calls the 160 | // consumer function with a slice argument representing the value. In case 161 | // of error, the consumer function is not called. 162 | // 163 | // Note that the call to the consumer func happens synchronously. 164 | pub(crate) async fn value( 165 | &self, 166 | mut consumer: impl FnMut(&[u8]) -> Pin> + Send>>, 167 | ) -> Result<()> { 168 | // Wait result 169 | self.wg.wait().await; 170 | if self.status.load(Ordering::Acquire) == Prefetched { 171 | if self.err.is_err() { 172 | return self.err.clone(); 173 | } 174 | let value = self.value.lock().await; 175 | return if value.is_empty() { 176 | consumer(&EMPTY_SLICE).await 177 | } else { 178 | consumer(&value).await 179 | }; 180 | } 181 | return self.kv.yield_item_value(self.clone(), consumer).await; 182 | } 183 | 184 | pub(crate) fn has_value(&self) -> bool { 185 | if self.meta == 0 && self.vptr.is_empty() { 186 | return false; 187 | } 188 | if (self.meta & MetaBit::BIT_DELETE.bits()) > 0 { 189 | return false; 190 | } 191 | true 192 | } 193 | 194 | // async fetch value from value_log. 
195 | pub(crate) async fn pre_fetch_value(&self) -> Result<()> { 196 | let kv = self.kv.clone(); 197 | kv.yield_item_value(self.clone(), |value| { 198 | let status_wl = self.status.clone(); 199 | let value = value.to_vec(); 200 | let value_wl = self.value.clone(); 201 | Box::pin(async move { 202 | status_wl.store(Prefetched, Ordering::Release); 203 | if value.is_empty() { 204 | return Ok(()); 205 | } 206 | let mut value_wl = value_wl.lock().await; 207 | *value_wl = value; 208 | Ok(()) 209 | }) 210 | }) 211 | .await 212 | } 213 | 214 | // Returns approximate size of the key-value pair. 215 | // 216 | // This can be called while iterating through a store to quickly estimate the 217 | // size of a range of key-value pairs (without fetching the corresponding) 218 | // values). 219 | pub(crate) fn estimated_size(&self) -> u64 { 220 | if !self.has_value() { 221 | return 0; 222 | } 223 | if self.meta & MetaBit::BIT_VALUE_POINTER.bits() == 0 { 224 | return (self.key.len() + self.vptr.len()) as u64; 225 | } 226 | let mut vpt = ValuePointer::default(); 227 | vpt.dec(&mut Cursor::new(&self.vptr)).unwrap(); 228 | vpt.len as u64 // includes key length 229 | } 230 | 231 | // Returns the CAS counter associated with the value. 232 | pub(crate) fn counter(&self) -> u64 { 233 | self.cas_counter.load(atomic::Ordering::Acquire) 234 | } 235 | 236 | // Returns the user_meta set by the user. Typically, this byte, optionally set by the user 237 | // is used to interpret the value. 238 | pub(crate) fn user_meta(&self) -> u8 { 239 | self.user_meta 240 | } 241 | 242 | pub(crate) fn meta(&self) -> u8 { 243 | self.meta 244 | } 245 | 246 | pub(crate) fn vptr(&self) -> &[u8] { 247 | &self.vptr 248 | } 249 | } 250 | 251 | // Used to set options when iterating over Badger key-value stores. 252 | #[derive(Debug, Clone, Copy)] 253 | pub struct IteratorOptions { 254 | // Indicates whether we should prefetch values during iteration and store them. 
255 | pub(crate) pre_fetch_values: bool, 256 | // How may KV pairs to prefetch while iterating. Valid only if PrefetchValues is true. 257 | pub(crate) pre_fetch_size: isize, 258 | // Direction of iteration. False is forward, true is backward. 259 | pub(crate) reverse: bool, 260 | } 261 | 262 | impl Default for IteratorOptions { 263 | fn default() -> Self { 264 | DEF_ITERATOR_OPTIONS 265 | } 266 | } 267 | 268 | impl IteratorOptions { 269 | pub fn new(pre_fetch_values: bool, pre_fetch_size: isize, reverse: bool) -> Self { 270 | IteratorOptions { 271 | pre_fetch_values, 272 | pre_fetch_size, 273 | reverse, 274 | } 275 | } 276 | } 277 | 278 | pub(crate) const DEF_ITERATOR_OPTIONS: IteratorOptions = IteratorOptions { 279 | pre_fetch_size: 100, 280 | pre_fetch_values: true, 281 | reverse: false, 282 | }; 283 | 284 | /// Helps iterating over the KV pairs in a lexicographically sorted order. 285 | /// skiplist, sst vlog 286 | /// | | | 287 | /// | | | 288 | /// IteratorExt reference 289 | pub struct IteratorExt { 290 | kv: KV, 291 | itr: MergeIterator, 292 | opt: IteratorOptions, 293 | item: ArcRW>, 294 | // Cache the prefetch keys, not inlcude current value 295 | data: ArcRW>, 296 | has_rewind: ArcRW, 297 | } 298 | 299 | /// TODO FIXME 300 | // impl futures_core::Stream for IteratorExt { 301 | // type Item = KVItem; 302 | // 303 | // fn poll_next( 304 | // mut self: Pin<&mut Self>, 305 | // cx: &mut std::task::Context<'_>, 306 | // ) -> std::task::Poll> { 307 | // let mut has_rewind = self.has_rewind.write(); 308 | // if !*has_rewind { 309 | // *has_rewind = true; 310 | // match Pin::new(&mut pin!(self.rewind())).poll(cx) { 311 | // std::task::Poll::Pending => { 312 | // warn!("<<<>>>>"); 313 | // std::task::Poll::Pending 314 | // } 315 | // std::task::Poll::Ready(None) => std::task::Poll::Ready(None), 316 | // std::task::Poll::Ready(t) => std::task::Poll::Ready(t), 317 | // } 318 | // } else { 319 | // match Pin::new(&mut pin!(self.next())).poll(cx) { 320 | // 
std::task::Poll::Pending => { 321 | // warn!("<<<>>>>"); 322 | // std::task::Poll::Pending 323 | // } 324 | // std::task::Poll::Ready(None) => std::task::Poll::Ready(None), 325 | // std::task::Poll::Ready(t) => std::task::Poll::Ready(t), 326 | // } 327 | // } 328 | // } 329 | // } 330 | 331 | impl IteratorExt { 332 | pub(crate) fn new(kv: KV, itr: MergeIterator, opt: IteratorOptions) -> IteratorExt { 333 | IteratorExt { 334 | kv, 335 | opt, 336 | itr, 337 | data: ArcRW::default(), 338 | item: Arc::new(Default::default()), 339 | has_rewind: ArcRW::default(), 340 | } 341 | } 342 | 343 | // pub(crate) async fn new_async_iterator( 344 | // kv: KV, 345 | // itr: MergeIterator, 346 | // opt: IteratorOptions, 347 | // ) -> Box> { 348 | // let itr = Self::new(kv, itr, opt); 349 | // Box::new(itr) 350 | // } 351 | 352 | // Seek to the provided key if present. If absent, if would seek to the next smallest key 353 | // greater than provided if iterating in the forward direction. Behavior would be reversed is 354 | // iterating backwards. 355 | pub async fn seek(&self, key: &[u8]) -> Option { 356 | while let Some(el) = self.data.write().pop_front() { 357 | el.rl().await.wg.wait().await; 358 | } 359 | while let Some(el) = self.itr.seek(key) { 360 | if el.key().starts_with(_BADGER_PREFIX) { 361 | continue; 362 | } 363 | break; 364 | } 365 | self.pre_fetch().await; 366 | self.item.read().clone() 367 | } 368 | 369 | // Rewind the iterator cursor all the wy to zero-th position, which would be the 370 | // smallest key if iterating forward, and largest if iterating backward. It dows not 371 | // keep track of whether the cursor started with a `seek`. 372 | pub async fn rewind(&self) -> Option { 373 | while let Some(el) = self.data.write().pop_front() { 374 | // Just cleaner to wait before pushing. No ref counting need. 375 | el.rl().await.wg.wait().await; 376 | } 377 | // rewind the iterator 378 | // rewind, next, rewind?, thie item is who! 
379 | let mut item = self.itr.rewind(); 380 | // filter internal data 381 | while item.is_some() && item.as_ref().unwrap().key().starts_with(_BADGER_PREFIX) { 382 | item = self.itr.next(); 383 | } 384 | // Before every rewind, the item will be reset to None 385 | self.item.write().take(); 386 | // prefetch item. 387 | self.pre_fetch().await; 388 | // return the first el. 389 | self.item.read().clone() 390 | } 391 | 392 | // Advance the iterator by one (*NOTICE*: must be rewind when you call self.next()) 393 | pub async fn next(&self) -> Option { 394 | // Ensure current item has load 395 | if let Some(el) = self.item.write().take() { 396 | el.rl().await.wg.wait().await; // Just cleaner to wait before pushing to avoid doing ref counting. 397 | } 398 | // Set next item to current 399 | if let Some(el) = self.data.write().pop_front() { 400 | self.item.write().replace(el); 401 | } 402 | // Advance internal iterator until entry is not deleted 403 | while let Some(el) = self.itr.next() { 404 | if el.key().starts_with(_BADGER_PREFIX) { 405 | continue; 406 | } 407 | if el.value().meta & MetaBit::BIT_DELETE.bits() == 0 { 408 | // Not deleted 409 | break; 410 | } 411 | } 412 | let item = self.itr.peek(); 413 | if item.is_none() { 414 | return None; 415 | } 416 | 417 | let xitem = self.new_item(); 418 | self.fill(xitem.clone()).await; 419 | self.data.write().push_back(xitem.clone()); 420 | Some(xitem) 421 | } 422 | 423 | pub async fn peek(&self) -> Option { 424 | self.item.read().clone() 425 | } 426 | } 427 | 428 | impl IteratorExt { 429 | // Returns false when iteration is done 430 | // or when the current key is not prefixed by the specified prefix. 
431 | async fn valid_for_prefix(&self, prefix: &[u8]) -> bool { 432 | self.item.read().is_some() 433 | && self 434 | .item 435 | .read() 436 | .as_ref() 437 | .unwrap() 438 | .rl() 439 | .await 440 | .key() 441 | .starts_with(prefix) 442 | } 443 | 444 | // Close the iterator, It is important to call this when you're done with iteration. 445 | pub async fn close(&self) -> Result<()> { 446 | // TODO: We could handle this error. 447 | self.kv.vlog.as_ref().unwrap().decr_iterator_count().await?; 448 | Ok(()) 449 | } 450 | 451 | // fill the value 452 | async fn fill(&self, item: KVItem) { 453 | let vs = self.itr.peek().unwrap(); 454 | let vs = vs.value(); 455 | { 456 | let mut item = item.wl().await; 457 | item.meta = vs.meta; 458 | item.user_meta = vs.user_meta; 459 | item.cas_counter.store(vs.cas_counter, Ordering::Release); 460 | item.key.extend(self.itr.peek().as_ref().unwrap().key()); 461 | item.vptr.extend(&vs.value); 462 | item.value.lock().await.clear(); 463 | } 464 | 465 | // need fetch value, use new coroutine to load value. 466 | if self.opt.pre_fetch_values { 467 | item.rl().await.wg.add_running(1); 468 | tokio::spawn(async move { 469 | // FIXME we are not handling errors here. 470 | { 471 | let item = item.rl().await; 472 | if let Err(err) = item.pre_fetch_value().await { 473 | log::error!("Failed to fetch value, {}", err); 474 | } 475 | } 476 | item.rl().await.wg.done(); 477 | }); 478 | } 479 | } 480 | 481 | // Prefetch load items. 
482 | async fn pre_fetch(&self) { 483 | let mut pre_fetch_size = 2; 484 | if self.opt.pre_fetch_values && self.opt.pre_fetch_size > 1 { 485 | pre_fetch_size = self.opt.pre_fetch_size; 486 | } 487 | 488 | let itr = &self.itr; 489 | let mut count = 0; 490 | while let Some(item) = itr.peek() { 491 | if item.key().starts_with(crate::kv::_BADGER_PREFIX) { 492 | itr.next(); 493 | continue; 494 | } 495 | if item.value().meta & MetaBit::BIT_DELETE.bits() > 0 { 496 | itr.next(); 497 | continue; 498 | } 499 | count += 1; 500 | let xitem = self.new_item(); 501 | // fill a el from itr.peek 502 | self.fill(xitem.clone()).await; 503 | if self.item.read().is_none() { 504 | self.item.write().replace(xitem); // store it 505 | } else { 506 | // push prefetch el into cache queue, Notice it not including current item 507 | self.data.write().push_back(xitem); 508 | } 509 | if count == pre_fetch_size { 510 | break; 511 | } 512 | itr.next(); 513 | } 514 | } 515 | 516 | fn new_item(&self) -> KVItem { 517 | let inner_item = KVItemInner { 518 | status: Arc::new(Atomic::new(PreFetchStatus::Empty)), 519 | kv: self.kv.clone(), 520 | key: vec![], 521 | value: TArcMx::new(Default::default()), 522 | vptr: vec![], 523 | meta: 0, 524 | user_meta: 0, 525 | cas_counter: Arc::new(Default::default()), 526 | wg: Closer::new("IteratorExt".to_owned()), 527 | err: Ok(()), 528 | }; 529 | return KVItem::from(inner_item); 530 | } 531 | 532 | // Returns false when iteration is done. 
533 | fn valid(&self) -> bool { 534 | self.item.read().is_some() 535 | } 536 | } 537 | -------------------------------------------------------------------------------- /src/level_handler.rs: -------------------------------------------------------------------------------- 1 | use crate::compaction::KeyRange; 2 | 3 | use crate::table::iterator::{IteratorImpl, IteratorItem}; 4 | use crate::table::table::Table; 5 | use crate::types::XArc; 6 | 7 | use crate::{event, hex_str, Result}; 8 | use core::slice::SlicePattern; 9 | use std::fmt::Display; 10 | 11 | use crate::options::Options; 12 | 13 | use drop_cell::defer; 14 | use log::{debug, info, warn}; 15 | use parking_lot::lock_api::{RwLockReadGuard, RwLockWriteGuard}; 16 | use parking_lot::{RawRwLock, RwLock}; 17 | use std::collections::HashSet; 18 | 19 | use std::sync::atomic::{AtomicI32, AtomicU64, AtomicUsize, Ordering}; 20 | use std::sync::Arc; 21 | 22 | pub(crate) type LevelHandler = XArc; 23 | 24 | impl From for LevelHandler { 25 | fn from(value: LevelHandlerInner) -> Self { 26 | XArc::new(value) 27 | } 28 | } 29 | 30 | impl LevelHandler { 31 | // Check does some sanity check on one level of data or in-memory index. 32 | pub(crate) fn validate(&self) -> Result<()> { 33 | self.lock_exclusive(); 34 | defer! 
{self.unlock_exclusive();} 35 | if self.level() == 0 { 36 | return Ok(()); 37 | } 38 | let tables = self.tables.write(); 39 | let num_tables = tables.len(); 40 | for j in 1..num_tables { 41 | if j >= tables.len() { 42 | return Err(format!( 43 | "Level={}, j={}, number_tables={}", 44 | self.level(), 45 | j, 46 | num_tables 47 | ) 48 | .into()); 49 | } 50 | 51 | // overlap occurs 52 | if tables[j - 1].biggest() >= tables[j].smallest() { 53 | return Err(format!( 54 | "Inter: {} vs {}: level={} j={} numTables={}", 55 | hex_str(tables[j - 1].biggest()), 56 | hex_str(tables[j].smallest()), 57 | self.level(), 58 | j, 59 | num_tables 60 | ) 61 | .into()); 62 | } 63 | if tables[j].smallest() > tables[j].biggest() { 64 | return Err(format!( 65 | "Intra: {} vs {}: level={} j={} numTables={}", 66 | hex_str(tables[j].smallest()), 67 | hex_str(tables[j].biggest()), 68 | self.level(), 69 | j, 70 | num_tables 71 | ) 72 | .into()); 73 | } 74 | } 75 | 76 | Ok(()) 77 | } 78 | 79 | // Returns true if the non-zero level may be compacted. *del_size* provides the size of the tables 80 | // which are currently being compacted so that we treat them as already having started being 81 | // compacted (because they have been, yet their size is already counted in get_total_size). 
82 | pub(crate) fn is_compactable(&self, del_size: u64) -> bool { 83 | let compactable = self.get_total_size() - del_size >= self.get_max_total_size(); 84 | 85 | #[cfg(test)] 86 | debug!( 87 | "trace level{}, does it compactable, total_size:{}, del_size:{}, max_size:{}, yes: {}", 88 | self.level(), 89 | self.get_total_size(), 90 | del_size, 91 | self.get_max_total_size(), 92 | compactable, 93 | ); 94 | 95 | compactable 96 | } 97 | 98 | pub(crate) fn get_total_size(&self) -> u64 { 99 | self.total_size.load(Ordering::Relaxed) 100 | } 101 | 102 | pub(crate) fn incr_total_size(&self, n: u64) { 103 | let old = self.total_size.fetch_add(n, Ordering::Relaxed); 104 | #[cfg(test)] 105 | info!( 106 | "incr level{} total size: {} => {}", 107 | self.level(), 108 | old, 109 | self.get_total_size() 110 | ); 111 | } 112 | 113 | pub(crate) fn decr_total_size(&self, n: u64) { 114 | let old = self.total_size.fetch_sub(n, Ordering::Relaxed); 115 | #[cfg(test)] 116 | info!( 117 | "decr level{} total size: {} => {}", 118 | self.level(), 119 | old, 120 | self.get_total_size() 121 | ); 122 | } 123 | 124 | pub(crate) fn get_max_total_size(&self) -> u64 { 125 | self.max_total_size.load(Ordering::Relaxed) 126 | } 127 | 128 | // delete current level's tables of to_del 129 | pub(crate) fn delete_tables(&self, to_del: Vec) { 130 | let to_del_set = to_del.iter().map(|id| *id).collect::>(); 131 | let level = self.level(); 132 | let mut tb_wl = self.tables_wl(); 133 | let before_tids = tb_wl.iter().map(|tb| tb.id()).collect::>(); 134 | { 135 | tb_wl.retain_mut(|tb| { 136 | if to_del_set.contains(&tb.id()) { 137 | // delete table reference 138 | tb.decr_ref(); 139 | self.decr_total_size(tb.size() as u64); 140 | return false; 141 | } 142 | true 143 | }); 144 | } 145 | let after_tids = tb_wl.iter().map(|tb| tb.id()).collect::>(); 146 | warn!( 147 | "after delete tables level:{}, {:?} => {:?}, to_del: {:?}", 148 | level, before_tids, after_tids, to_del, 149 | ); 150 | } 151 | 152 | // init with 
tables 153 | pub(crate) fn init_tables(&self, tables: Vec

) { 154 | let total_size = tables.iter().fold(0, |acc, table| acc + table.size()); 155 | self.total_size.store(total_size as u64, Ordering::Relaxed); 156 | let mut tb_wl = self.tables_wl(); 157 | (*tb_wl) = tables; 158 | if self.level() == 0 { 159 | // key range will overlap. Just sort by file_id in ascending order 160 | // because newer tables are at the end of level 0. 161 | tb_wl.sort_by_key(|tb| tb.id()); 162 | } else { 163 | // Sort tables by keys. 164 | tb_wl.sort_by_key(|tb| tb.smallest().to_vec()); 165 | } 166 | } 167 | 168 | // Get table write lock guards. 169 | fn tables_wl(&self) -> RwLockWriteGuard<'_, RawRwLock, Vec

> { 170 | self.tables.write() 171 | } 172 | 173 | // Get table read lock guards 174 | fn tables_rd(&self) -> RwLockReadGuard<'_, RawRwLock, Vec

> { 175 | self.tables.read() 176 | } 177 | 178 | pub(crate) fn debug_tables(&self) { 179 | let tw = self.tables_rd(); 180 | info!( 181 | "=============debug tables, level: {}=====================", 182 | self.level() 183 | ); 184 | for tb in tw.iter() { 185 | info!( 186 | "|tid:{}, smallest:{}, biggest:{}, size: {}|", 187 | tb.id(), 188 | hex_str(tb.smallest()), 189 | hex_str(tb.biggest()), 190 | tb.size(), 191 | ); 192 | } 193 | info!("------------------------end-----------------------------"); 194 | } 195 | 196 | // Returns the tables that intersect with key range. Returns a half-interval [left, right). 197 | // This function should already have acquired a read lock, and this is so important the caller must 198 | // pass an empty parameter declaring such. 199 | pub(crate) fn overlapping_tables(&self, key_range: &KeyRange) -> (usize, usize) { 200 | // probe.biggest() >= left 201 | let left = self 202 | .tables_rd() 203 | .binary_search_by(|probe| probe.biggest().cmp(&key_range.left)); 204 | let right = self 205 | .tables_rd() 206 | .binary_search_by(|probe| probe.smallest().cmp(&key_range.right)); 207 | 208 | info!( 209 | "overlapping tables, range: {}, left: {:?}, right: {:?}", 210 | key_range, left, right 211 | ); 212 | let left = left.unwrap_or_else(|n| n); 213 | let right = right.map(|n| n + 1).unwrap_or_else(|n| n); 214 | if left == right { 215 | // simple handle 216 | return (0, 0); 217 | } 218 | (left, right) 219 | } 220 | 221 | pub(crate) fn get_total_siz(&self) -> u64 { 222 | self.total_size.load(Ordering::Relaxed) 223 | } 224 | 225 | // Replace tables[left:right] with new_tables, Note this EXCLUDES tables[right]. 226 | // You must be call decr() to delete the old tables _after_ writing the update to the manifest. 227 | pub(crate) fn replace_tables(&self, new_tables: Vec

) -> Result<()> { 228 | // Need to re-search the range of tables in this level to be replaced as other goroutines might 229 | // be changing it as well. (They can't touch our tables, but if they add/remove other tables, 230 | // the indices get shifted around.) 231 | if new_tables.is_empty() { 232 | info!("No tables need to replace"); 233 | return Ok(()); 234 | } 235 | // TODO Add lock (think of level's sharing lock) 236 | // Increase total_size first. 237 | for tb in &new_tables { 238 | self.incr_total_size(tb.size() as u64); 239 | // add table reference 240 | tb.incr_ref(); 241 | } 242 | let key_range = KeyRange { 243 | left: new_tables.first().unwrap().smallest().to_vec(), 244 | right: new_tables.last().unwrap().biggest().to_vec(), 245 | inf: false, 246 | }; 247 | 248 | // TODO Opz code 249 | { 250 | let level_id = self.level(); 251 | let mut tables_lck = self.tables_wl(); 252 | let old_ids = tables_lck.iter().map(|tb| tb.id()).collect::>(); 253 | // TODO FIXME may be it is error. 254 | tables_lck.retain_mut(|tb| { 255 | let left = tb.biggest() <= key_range.left.as_slice(); 256 | let right = tb.smallest() > key_range.right.as_slice(); 257 | if left || right { 258 | return true; 259 | } else { 260 | // TODO it should be not a good idea decr reference here, slow lock 261 | // decr table reference 262 | tb.decr_ref(); 263 | self.decr_total_size(tb.size() as u64); 264 | false 265 | } 266 | }); 267 | let will_add = new_tables.iter().map(|tb| tb.id()).collect::>(); 268 | tables_lck.extend(new_tables); 269 | // TODO avoid resort 270 | tables_lck.sort_by(|a, b| a.smallest().cmp(b.smallest())); 271 | 272 | let new_ids = tables_lck.iter().map(|tb| tb.id()).collect::>(); 273 | info!( 274 | "after replace tables, level:{}, will_add:{:?}, {:?} => {:?}", 275 | level_id, will_add, old_ids, new_ids 276 | ); 277 | } 278 | Ok(()) 279 | } 280 | 281 | // Return true if ok and no stalling that will hold a new table reference 282 | pub(crate) async fn try_add_level0_table(&self, t: 
Table) -> bool { 283 | assert_eq!(self.get_level(), 0); 284 | let mut tw = self.tables_wl(); 285 | if tw.len() >= self.opt.num_level_zero_tables_stall { 286 | // Too many tables at zero level need compact 287 | return false; 288 | } 289 | t.incr_ref(); 290 | self.incr_total_size(t.size() as u64); 291 | tw.push(t); 292 | true 293 | } 294 | 295 | pub(crate) fn num_tables(&self) -> usize { 296 | self.tables_rd().len() 297 | } 298 | 299 | // Must be call only once 300 | pub(crate) fn close(&self) -> Result<()> { 301 | let tw = self.tables_wl(); 302 | tw.iter().for_each(|tb| tb.decr_ref()); 303 | Ok(()) 304 | } 305 | 306 | // Acquires a read-lock to access s.tables. It returns a list of table_handlers. 307 | pub(crate) fn get_table_for_key(&self, key: &[u8]) -> Option { 308 | return if self.get_level() == 0 { 309 | // For level 0, we need to check every table. Remember to make a copy as self.tables may change 310 | // once we exit this function, and we don't want to lock the self.tables while seeking in tabbles. 311 | // CAUTION: Reverse the tables. 
312 | let tw = self.tables_rd(); 313 | for tb in tw.iter().rev() { 314 | tb.incr_ref(); 315 | // check it by bloom filter 316 | if tb.does_not_have(key) { 317 | //debug!("not contain it, key #{}, st: {}", hex_str(key), tb.id()); 318 | event::get_metrics().num_lsm_bloom_hits.inc(); 319 | tb.decr_ref(); 320 | continue; 321 | } 322 | event::get_metrics().num_lsm_gets.inc(); 323 | let it = IteratorImpl::new(tb.clone(), false); 324 | let item = it.seek(key); 325 | tb.decr_ref(); 326 | if let Some(item) = item { 327 | if item.key() != key { 328 | continue; 329 | } 330 | return Some(item); 331 | } 332 | } 333 | None 334 | } else { 335 | //self.debug_tables(); 336 | let tw = self.tables_rd(); 337 | let ok = tw.binary_search_by(|tb| tb.biggest().cmp(key)); 338 | // #[cfg(test)] 339 | // info!("find key #{} at level{}, {:?}", hex_str(key), self.level(), ok.unwrap_or_else(|n| n)); 340 | 341 | let index = ok.unwrap_or_else(|n| n); 342 | if index >= tw.len() { 343 | // todo add metrics 344 | return None; 345 | } 346 | let tb = tw.get(index).unwrap(); 347 | tb.incr_ref(); 348 | if tb.does_not_have(key) { 349 | //debug!("not contain it, key #{}, st: {}", hex_str(key), tb.id()); 350 | event::get_metrics().num_lsm_bloom_hits.inc(); 351 | tb.decr_ref(); 352 | return None; 353 | } 354 | event::get_metrics().num_lsm_gets.inc(); 355 | let it = IteratorImpl::new(tb.clone(), false); 356 | let item = it.seek(key); 357 | tb.decr_ref(); 358 | if let Some(item) = item { 359 | if item.key() == key { 360 | return Some(item); 361 | } 362 | } 363 | return None; 364 | }; 365 | } 366 | 367 | pub(crate) fn get(&self, key: &[u8]) -> Option { 368 | self.get_table_for_key(key) 369 | } 370 | 371 | // returns current level 372 | pub(crate) fn level(&self) -> usize { 373 | self.level.load(Ordering::Relaxed) as usize 374 | } 375 | 376 | pub(crate) fn to_log(&self) -> String { 377 | format!("{}", self) 378 | } 379 | } 380 | 381 | impl Display for LevelHandler { 382 | fn fmt(&self, f: &mut 
std::fmt::Formatter<'_>) -> std::fmt::Result { 383 | f.debug_struct("LevelHandler") 384 | .field("level", &self.get_level()) 385 | .field("max", &self.max_total_size.load(Ordering::Relaxed)) 386 | .field( 387 | "tables", 388 | &self 389 | .tables_rd() 390 | .iter() 391 | .map(|tb| tb.id()) 392 | .collect::>(), 393 | ) 394 | .finish() 395 | } 396 | } 397 | 398 | pub(crate) struct LevelHandlerInner { 399 | // TODO this lock maybe global, not only for compacted 400 | pub(crate) self_lock: Arc>, 401 | // Guards tables, total_size. 402 | // For level >= 1, *tables* are sorted by key ranges, which do not overlap. 403 | // For level 0, *tables* are sorted by time. 404 | // For level 0, *newest* table are at the back. Compact the oldest one first, which is at the front. 405 | // TODO tables and total_size maybe should be lock with same lock. 406 | pub(crate) tables: Arc>>, 407 | pub(crate) total_size: AtomicU64, 408 | // The following are initialized once and const. 409 | pub(crate) level: AtomicUsize, 410 | str_level: Arc, 411 | pub(crate) max_total_size: AtomicU64, 412 | opt: Options, 413 | } 414 | 415 | impl LevelHandlerInner { 416 | pub(crate) fn new(opt: Options, level: usize) -> LevelHandlerInner { 417 | LevelHandlerInner { 418 | self_lock: Arc::new(Default::default()), 419 | tables: Arc::new(Default::default()), 420 | total_size: Default::default(), 421 | level: AtomicUsize::new(level), 422 | str_level: Arc::new(format!("L{}", level)), 423 | max_total_size: Default::default(), 424 | opt, 425 | } 426 | } 427 | 428 | #[inline] 429 | pub(crate) fn get_level(&self) -> usize { 430 | self.level.load(Ordering::Acquire) 431 | } 432 | 433 | #[inline] 434 | pub(crate) fn lock_shared(&self) { 435 | use parking_lot::lock_api::RawRwLock; 436 | unsafe { self.self_lock.raw().lock_shared() } 437 | } 438 | 439 | #[inline] 440 | pub(crate) fn try_lock_share(&self) -> bool { 441 | use parking_lot::lock_api::RawRwLock; 442 | unsafe { self.self_lock.raw().try_lock_shared() } 443 | } 444 
| 445 | #[inline] 446 | pub(crate) fn unlock_shared(&self) { 447 | use parking_lot::lock_api::RawRwLock; 448 | unsafe { self.self_lock.raw().unlock_shared() } 449 | } 450 | 451 | #[inline] 452 | pub(crate) fn lock_exclusive(&self) { 453 | use parking_lot::lock_api::RawRwLock; 454 | unsafe { self.self_lock.raw().lock_exclusive() } 455 | } 456 | 457 | #[inline] 458 | pub(crate) fn try_lock_exclusive(&self) -> bool { 459 | use parking_lot::lock_api::RawRwLock; 460 | unsafe { self.self_lock.raw().try_lock_exclusive() } 461 | } 462 | 463 | #[inline] 464 | pub(crate) fn unlock_exclusive(&self) { 465 | use parking_lot::lock_api::RawRwLock; 466 | unsafe { self.self_lock.raw().unlock_exclusive() } 467 | } 468 | } 469 | 470 | #[test] 471 | fn raw_lock() { 472 | let lock = LevelHandlerInner::new(Options::default(), 10); 473 | lock.lock_shared(); 474 | lock.lock_shared(); 475 | assert_eq!(false, lock.try_lock_exclusive()); 476 | lock.unlock_shared(); 477 | lock.unlock_shared(); 478 | 479 | assert_eq!(true, lock.try_lock_exclusive()); 480 | assert_eq!(false, lock.try_lock_share()); 481 | lock.unlock_exclusive(); 482 | assert_eq!(true, lock.try_lock_share()); 483 | } 484 | -------------------------------------------------------------------------------- /src/lib.rs: -------------------------------------------------------------------------------- 1 | #![feature(async_iterator)] 2 | #![feature(pointer_byte_offsets)] 3 | #![feature(sync_unsafe_cell)] 4 | #![feature(associated_type_defaults)] 5 | #![feature(type_alias_impl_trait)] 6 | #![feature(strict_provenance_atomic_ptr)] 7 | #![feature(atomic_from_mut)] 8 | #![feature(cursor_remaining)] 9 | #![feature(pattern)] 10 | #![feature(cell_leak)] 11 | #![feature(path_file_prefix)] 12 | #![feature(fs_try_exists)] 13 | #![feature(generic_associated_types)] 14 | #![feature(unwrap_infallible)] 15 | #![feature(slice_pattern)] 16 | #![feature(slice_take)] 17 | #![feature(arc_into_inner)] 18 | #![feature(async_closure)] 19 | 
#![feature(let_chains)] 20 | #![feature(stmt_expr_attributes)] 21 | #![feature(backtrace_frames)] 22 | #![feature(binary_heap_into_iter_sorted)] 23 | #![feature(test)] 24 | #![feature(atomic_from_ptr, pointer_is_aligned)] 25 | 26 | 27 | /// Badger DB is an embedded keyvalue database. 28 | /// 29 | /// Badger DB is a library written in Rust that implements a badger-go [https://github.com/dgraph-io/badger] 30 | /// bager-rs will implements all features of badger-go 31 | use std::mem::align_of; 32 | 33 | mod event; 34 | mod iterator; 35 | pub mod kv; 36 | mod level_handler; 37 | mod log_file; 38 | mod manifest; 39 | mod options; 40 | mod skl; 41 | mod table; 42 | mod types; 43 | mod value_log; 44 | #[cfg(test)] 45 | mod value_log_tests; 46 | mod y; 47 | 48 | mod compaction; 49 | // #[cfg(test)] 50 | // mod kv_test; 51 | #[cfg(test)] 52 | mod kv_test; 53 | mod levels; 54 | mod pb; 55 | mod st_manager; 56 | #[cfg(test)] 57 | mod test_util; 58 | mod backup; 59 | 60 | pub use iterator::*; 61 | pub use kv::*; 62 | pub use options::*; 63 | pub use skl::*; 64 | pub use st_manager::*; 65 | pub use y::*; 66 | 67 | #[allow(dead_code)] 68 | #[inline] 69 | pub(crate) fn must_align(ptr: *const T) { 70 | let actual = (ptr as usize) % align_of::() == 0; 71 | assert!(actual); 72 | } 73 | 74 | #[allow(dead_code)] 75 | #[inline] 76 | pub(crate) fn cals_size_with_align(sz: usize, align_sz: usize) -> usize { 77 | let size = (sz + align_sz) & !align_sz; 78 | size 79 | } -------------------------------------------------------------------------------- /src/lock.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/laohanlinux/badger-rs/00bbe70da1f4b0fe7d52ffbdf8da91867a147834/src/lock.txt -------------------------------------------------------------------------------- /src/log_file.rs: -------------------------------------------------------------------------------- 1 | use crate::types::Closer; 2 | use 
crate::value_log::{Entry, Header, ValuePointer}; 3 | use crate::y::{create_synced_file, Result}; 4 | use crate::y::{is_eof, Decode}; 5 | use std::env::temp_dir; 6 | 7 | use async_channel::Sender; 8 | use byteorder::{BigEndian, ReadBytesExt}; 9 | use drop_cell::defer; 10 | use either::Either; 11 | use log::{debug, info}; 12 | use memmap::{Mmap, MmapMut}; 13 | 14 | use std::fmt::{Debug, Formatter}; 15 | use std::fs::File; 16 | use std::future::Future; 17 | use std::io::{Read, Seek, SeekFrom, Write}; 18 | use std::ops::Deref; 19 | use std::pin::Pin; 20 | use std::sync::atomic::AtomicU64; 21 | 22 | use crate::event; 23 | use tokio::select; 24 | 25 | // MmapType is a Mmap and MmapMut tule 26 | pub(crate) struct MmapType(Either); 27 | 28 | impl MmapType { 29 | pub(crate) fn get_mmap(&self) -> &Mmap { 30 | match self.0 { 31 | Either::Left(ref _mmap) => _mmap, 32 | _ => panic!("It should be not happen"), 33 | } 34 | } 35 | 36 | pub(crate) fn get_mut_mmap(&self) -> &MmapMut { 37 | match self.0 { 38 | Either::Right(ref m) => m, 39 | _ => panic!("It should be not happen"), 40 | } 41 | } 42 | 43 | pub(crate) fn get_mut_mmap_ref(&mut self) -> &mut MmapMut { 44 | match self.0 { 45 | Either::Right(ref mut m) => m, 46 | _ => panic!("It should be not happen"), 47 | } 48 | } 49 | } 50 | 51 | impl Deref for MmapType { 52 | type Target = Either; 53 | 54 | fn deref(&self) -> &Self::Target { 55 | &self.0 56 | } 57 | } 58 | 59 | impl From for MmapType { 60 | fn from(value: Mmap) -> Self { 61 | Self(Either::Left(value)) 62 | } 63 | } 64 | 65 | impl From for MmapType { 66 | fn from(value: MmapMut) -> Self { 67 | Self(Either::Right(value)) 68 | } 69 | } 70 | 71 | pub(crate) struct LogFile { 72 | pub(crate) _path: Box, 73 | pub(crate) fd: Option, 74 | pub(crate) fid: u32, 75 | pub(crate) _mmap: Option, 76 | pub(crate) sz: u32, 77 | } 78 | 79 | impl Debug for LogFile { 80 | fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { 81 | f.debug_struct("LogFile") 82 | .field("path", 
self._path.as_ref()) 83 | .field("fd", &self.fid) 84 | .field("size", &self.sz) 85 | .finish() 86 | } 87 | } 88 | 89 | impl LogFile { 90 | // async read *n* entries 91 | pub(crate) async fn read_entries( 92 | &self, 93 | offset: u32, 94 | n: usize, 95 | ) -> Result<(Vec<(Entry, ValuePointer)>, u32)> { 96 | let m = self.mmap_slice(); 97 | let mut cursor_offset = offset; 98 | let mut v = vec![]; 99 | while cursor_offset < m.len() as u32 && v.len() < n { 100 | let entry = Entry::from_slice(cursor_offset, m)?; 101 | let mut vpt = ValuePointer::default(); 102 | vpt.fid = self.fid; 103 | vpt.len = 104 | Header::encoded_size() as u32 + (entry.key.len() + entry.value.len()) as u32 + 4; 105 | vpt.offset = cursor_offset; 106 | cursor_offset += vpt.len; 107 | v.push((entry, vpt)) 108 | } 109 | Ok((v, cursor_offset)) 110 | } 111 | 112 | pub(crate) async fn async_iterate_by_offset( 113 | &self, 114 | ctx: Closer, 115 | mut offset: u32, 116 | notify: Sender<(Entry, ValuePointer)>, 117 | ) { 118 | defer! {ctx.done()} 119 | defer! {notify.close();} 120 | let has_been_close = ctx.has_been_closed(); 121 | loop { 122 | let (v, next) = self.read_entries(offset, 1).await.unwrap(); 123 | offset = next; 124 | if v.is_empty() { 125 | return; 126 | } else { 127 | // TODO batch sender 128 | for item in v { 129 | select! { 130 | _ = has_been_close.recv() => {}, 131 | _ = notify.send(item) => {}, 132 | } 133 | } 134 | } 135 | } 136 | } 137 | 138 | // async iterate from offset that must be call with thread safety 139 | pub(crate) async fn iterate_by_offset( 140 | &self, 141 | mut offset: u32, 142 | f: &mut impl for<'a> FnMut( 143 | &'a Entry, 144 | &'a ValuePointer, 145 | ) -> Pin> + 'a>>, 146 | ) -> Result<()> { 147 | loop { 148 | let (v, next) = self.read_entries(offset, 1).await?; 149 | if v.is_empty() { 150 | return Ok(()); 151 | } 152 | 153 | for (entry, vptr) in v.iter() { 154 | if !f(entry, vptr).await? 
{
                    return Ok(());
                }
                offset = next;
            }
        }
    }

    // Replays the log file record by record, starting at byte `offset`.
    //
    // Each record is decoded from the file handle as: header, key bytes, value
    // bytes, then a 4-byte big-endian checksum. For every complete record the
    // callback `f` receives the decoded entry and a `ValuePointer` describing
    // where the record lives (file id, offset, encoded length).
    //
    // Iteration stops, returning `Ok(())`, when:
    //   * the end of the file is reached, including a record cut short by EOF;
    //   * `f` resolves to `Ok(false)`.
    // Any other decode/I-O error, or an error returned by `f`, is propagated.
    //
    // The checksum is read but not verified here.
    //
    // Panics if the file handle has not been opened (`self.fd` is `None`).
    //
    // It should be called by one thread.
    pub(crate) async fn iterate(
        &mut self,
        offset: u32,
        f: &mut impl for<'a> FnMut(
            &'a Entry,
            &'a ValuePointer,
        ) -> Pin<Box<dyn Future<Output = Result<bool>> + 'a>>,
    ) -> Result<()> {
        let mut fd = self.fd.as_mut().unwrap();
        fd.seek(SeekFrom::Start(offset as u64))?;
        // One entry is reused for all records so its buffers are recycled.
        let mut entry = Entry::default();
        let mut record_offset = offset;
        loop {
            let mut h = Header::default();
            let decoded = h.dec(&mut fd);
            if matches!(&decoded, Err(err) if err.is_io_eof()) {
                break;
            }
            // todo add truncate currenct
            decoded?;

            // Size the buffers to exactly the lengths announced by the header.
            // Reading into a cleared (zero-length) vector would read nothing,
            // leaving the file cursor in front of the key instead of after the
            // value.
            entry.key.clear();
            entry.key.resize(h.k_len as usize, 0);
            entry.value.clear();
            entry.value.resize(h.v_len as usize, 0);

            // `read_exact` either fills the whole buffer or fails; a short
            // record at the tail of the file surfaces as an EOF error.
            let got_key = fd.read_exact(&mut entry.key);
            if is_eof(&got_key) {
                break;
            }
            got_key?;

            let got_value = fd.read_exact(&mut entry.value);
            if is_eof(&got_value) {
                break;
            }
            got_value?;

            entry.offset = record_offset;
            entry.meta = h.meta;
            entry.user_meta = h.user_mata;
            entry.cas_counter = AtomicU64::new(h.cas_counter);
            entry.cas_counter_check = h.cas_counter_check;

            let checksum = fd.read_u32::<BigEndian>();
            if is_eof(&checksum) {
                break;
            }
            let _crc = checksum?;

            // Encoded record length: header + key + value + 4-byte checksum.
            let mut vp = ValuePointer::default();
            vp.len = Header::encoded_size() as u32 + h.k_len + h.v_len + 4;
            record_offset += vp.len;

            vp.offset = entry.offset;
            vp.fid = self.fid;

            let _continue = f(&entry, &vp).await?;
            if !_continue {
                break;
            }
        }

        // todo add truncate
        Ok(())
    }
}

impl LogFile {
    // new LogFile with special path.
235 | pub(crate) fn new(path: &str) -> Result { 236 | let mut lf = LogFile { 237 | _path: Box::new(path.to_string()), 238 | fd: None, 239 | fid: 0, 240 | _mmap: None, 241 | sz: 0, 242 | }; 243 | lf.open_read_only()?; 244 | Ok(lf) 245 | } 246 | 247 | // open only read permission 248 | pub(crate) fn open_read_only(&mut self) -> Result<()> { 249 | let fd = std::fs::OpenOptions::new() 250 | .read(true) 251 | .open(self._path.as_ref())?; 252 | let meta = fd.metadata()?; 253 | let file_sz = meta.len(); 254 | let mut _mmap = unsafe { Mmap::map(&fd)? }; 255 | self._mmap.replace(_mmap.into()); 256 | self.fd.replace(fd); 257 | self.sz = file_sz as u32; 258 | Ok(()) 259 | } 260 | 261 | // Acquire lock on mmap if you are calling this. 262 | pub(crate) fn read(&self, p: &ValuePointer) -> Result<&[u8]> { 263 | #[cfg(test)] 264 | debug!( 265 | "ready to read bytes from mmap, {}, {:?}", 266 | self._mmap.as_ref().unwrap().is_left(), 267 | p 268 | ); 269 | let offset = p.offset; 270 | let mmp = self._mmap.as_ref().unwrap(); 271 | event::get_metrics().num_reads.inc(); 272 | event::get_metrics().num_bytes_read.inc_by(p.len as u64); 273 | // todo add metrics 274 | match mmp.0 { 275 | Either::Left(ref m) => Ok(&m.as_ref()[offset as usize..(offset + p.len) as usize]), 276 | Either::Right(ref m) => Ok(&m.as_ref()[offset as usize..(offset + p.len) as usize]), 277 | } 278 | } 279 | 280 | // Done written, reopen with read only permisson for file and mmap. 
281 | pub(crate) fn done_writing(&mut self, offset: u32) -> Result<()> { 282 | self.sync()?; 283 | let mut_mmap = self.mut_mmap(); 284 | mut_mmap.flush_async()?; 285 | self.fd.as_mut().unwrap().set_len(offset as u64)?; 286 | self.fd.as_mut().unwrap().sync_all()?; 287 | { 288 | self._mmap.take(); 289 | self.fd.take(); 290 | } 291 | self.open_read_only() 292 | } 293 | 294 | pub(crate) fn set_write(&mut self, sz: u64) -> Result<()> { 295 | self.fd.as_mut().unwrap().set_len(sz as u64)?; 296 | info!("reset file size:{}", sz); 297 | let mut _mmap = unsafe { Mmap::map(&self.fd.as_ref().unwrap())?.make_mut()? }; 298 | self._mmap.replace(MmapType(Either::Right(_mmap))); 299 | self.sz = sz as u32; 300 | Ok(()) 301 | } 302 | 303 | // return mmap slice 304 | fn mmap_slice(&self) -> &[u8] { 305 | let mmap = self._mmap.as_ref().unwrap(); 306 | match mmap.0 { 307 | Either::Left(ref _mmap) => _mmap.as_ref(), 308 | Either::Right(ref _mmap) => _mmap.as_ref(), 309 | } 310 | } 311 | 312 | // return file reference 313 | fn file_ref(&self) -> &File { 314 | self.fd.as_ref().unwrap() 315 | } 316 | 317 | pub(crate) fn mut_mmap(&mut self) -> &mut MmapMut { 318 | let mp = self._mmap.as_mut().unwrap(); 319 | mp.get_mut_mmap_ref() 320 | } 321 | 322 | pub(crate) fn write_buffer(&mut self, buffer: &[u8], offset: usize) -> Result { 323 | let wt = self.mut_mmap(); 324 | let mut wt = &mut wt[offset..]; 325 | wt.write(buffer).map_err(|err| err.into()) 326 | } 327 | 328 | fn mmap_ref(&self) -> &Mmap { 329 | self._mmap.as_ref().unwrap().get_mmap() 330 | } 331 | 332 | // You must hold lf.lock to sync() 333 | fn sync(&mut self) -> Result<()> { 334 | self.fd.as_mut().unwrap().sync_all()?; 335 | Ok(()) 336 | } 337 | } 338 | 339 | #[test] 340 | fn concurrency() { 341 | let mut lf = LogFile::new("src/test_data/vlog_file.text"); 342 | assert!(lf.is_ok(), "{:?}", lf.unwrap_err().to_string()); 343 | } 344 | 345 | #[test] 346 | fn test_mmap() { 347 | let mut fd = std::fs::OpenOptions::new() 348 | .read(true) 
349 | .write(true) 350 | .open("src/test_data/vlog_file.text") 351 | .unwrap(); 352 | 353 | let _mmap = unsafe { Mmap::map(&fd).unwrap() }; 354 | println!("{}", _mmap.len()); 355 | println!("{}", _mmap.make_mut().is_err()); 356 | } 357 | 358 | #[test] 359 | fn test_write_file() { 360 | use crate::test_util; 361 | test_util::tracing_log(); 362 | use std::io::Write; 363 | 364 | let tmp_path = temp_dir().join("mmap_test.txt"); 365 | let tmp_path = tmp_path.to_str().unwrap(); 366 | std::fs::write(tmp_path, b"hellow, word").unwrap(); 367 | info!("path: {}", tmp_path); 368 | let mut vlog = LogFile::new(tmp_path).unwrap(); 369 | vlog.fd.take(); 370 | vlog.fd = Some(create_synced_file(tmp_path, true).unwrap()); 371 | info!( 372 | "{},{:?}", 373 | vlog.sz, 374 | String::from_utf8_lossy(vlog.mmap_slice()) 375 | ); 376 | vlog.set_write(1024).unwrap(); 377 | // vlog.fd.as_mut().unwrap().write_all(b"foobat").unwrap(); 378 | // vlog.fd.as_mut().unwrap().sync_all().unwrap(); 379 | // vlog.mut_mmap().flush_async().unwrap(); 380 | { 381 | let mut buffer = vlog._mmap.as_mut().unwrap(); 382 | let mut buffer = buffer.get_mut_mmap_ref(); 383 | let mut wt = buffer.as_mut(); 384 | wt.write_all(b"1234").unwrap(); 385 | } 386 | info!( 387 | "{},{:?}", 388 | vlog.sz, 389 | String::from_utf8_lossy(vlog.mmap_slice()) 390 | ); 391 | } 392 | -------------------------------------------------------------------------------- /src/options/mod.rs: -------------------------------------------------------------------------------- 1 | use crate::skl::PtrAlign; 2 | use crate::value_log::Entry; 3 | use crate::y::{CAS_SIZE, META_SIZE, USER_META_SIZE}; 4 | use crate::{cals_size_with_align, Node, ValueStruct}; 5 | use rand::random; 6 | use std::env::temp_dir; 7 | 8 | /// Specifies how data in LSM table files and value log files should 9 | /// be loaded. 
10 | #[derive(Debug, Clone, Copy, PartialEq)] 11 | pub enum FileLoadingMode { 12 | /// Indicates that files must be loaded using standard I/O 13 | FileIO, 14 | /// Indicates that files must be loaded into RAM 15 | LoadToRADM, 16 | /// Indicates that the file must be memory-mapped 17 | MemoryMap, 18 | } 19 | 20 | /// Params for creating DB object. 21 | #[derive(Debug, Clone)] 22 | pub struct Options { 23 | /// 1. Mandatory flags 24 | /// ------------------- 25 | /// Directory to store the data in. Should exist and be writable. 26 | pub dir: Box, 27 | /// Directory to store the value log in. Can be the same as Dir. Should 28 | /// exist and be writable. 29 | pub value_dir: Box, 30 | /// 2. Frequently modified flags 31 | /// ----------------------------- 32 | /// Sync all writes to disk. Setting this to true would slow down data 33 | /// loading significantly. 34 | pub sync_writes: bool, 35 | /// How should LSM tree be accessed. 36 | pub table_loading_mode: FileLoadingMode, 37 | /// 3. Flags that user might want to review 38 | /// ---------------------------------------- 39 | /// The following affect all levels of LSM tree. 40 | /// Each table (or file) is at most this size. 41 | pub max_table_size: u64, 42 | /// Equals SizeOf(Li+1)/SizeOf(Li). 43 | pub level_size_multiplier: u64, 44 | /// Maximum number of levels of compaction. 45 | pub max_levels: usize, 46 | /// If value size >= this threshold, only store value offsets in tree. 47 | pub value_threshold: usize, 48 | /// Maximum number of tables to keep in memory, before stalling. 49 | pub num_mem_tables: usize, 50 | /// The following affect how we handle LSM tree L0. 51 | /// Maximum number of Level 0 tables before we start compacting. 52 | pub num_level_zero_tables: usize, 53 | 54 | /// If we hit this number of Level 0 tables, we will stall until L0 is 55 | /// compacted away. 56 | pub num_level_zero_tables_stall: usize, 57 | 58 | /// Maximum total size for L1. 
59 | pub level_one_size: u64, 60 | 61 | /// Size of single value log file. 62 | pub value_log_file_size: u64, 63 | 64 | /// Number of compaction workers to run concurrently. 65 | pub num_compactors: u64, 66 | 67 | /// 4. Flags for testing purposes 68 | /// ------------------------------ 69 | /// Stops LSM tree from compactions. 70 | pub do_not_compact: bool, 71 | /// max entries in batch 72 | pub max_batch_count: u64, 73 | // max batch size in bytes 74 | pub max_batch_size: u64, 75 | } 76 | 77 | impl Options { 78 | // TODO FIXME 79 | pub fn estimate_size(&self, entry: &Entry) -> usize { 80 | let key_size = entry.key.len(); 81 | if entry.value.len() < self.value_threshold { 82 | key_size + entry.value.len() 83 | } else { 84 | let value_size = ValueStruct::header_size(); 85 | key_size + value_size 86 | } 87 | } 88 | 89 | /// Return the size of allocator arena 90 | pub fn arena_size(&self) -> u64 { 91 | self.max_table_size 92 | + self.max_batch_size 93 | + self.max_batch_count * (Node::size() as u64) 94 | } 95 | } 96 | 97 | impl Default for Options { 98 | fn default() -> Self { 99 | let id = random::(); 100 | Options { 101 | dir: Box::new(id.to_string()), 102 | value_dir: Box::new(id.to_string()), 103 | sync_writes: false, 104 | table_loading_mode: FileLoadingMode::LoadToRADM, 105 | max_table_size: 64 << 20, 106 | level_size_multiplier: 10, 107 | max_levels: 7, 108 | value_threshold: 20, 109 | num_mem_tables: 5, 110 | num_level_zero_tables: 5, 111 | num_level_zero_tables_stall: 10, 112 | level_one_size: 256 << 20, 113 | value_log_file_size: 1 << 30, 114 | num_compactors: 3, 115 | do_not_compact: false, 116 | max_batch_count: 200, 117 | max_batch_size: 1 << 13, 118 | } 119 | } 120 | } 121 | -------------------------------------------------------------------------------- /src/pb/backup.proto: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2017 Dgraph Labs, Inc. 
and Contributors 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | // Use protos/gen.sh to generate .pb.go files. 18 | syntax = "proto3"; 19 | 20 | package protos; 21 | 22 | message KVPair { 23 | bytes key = 1; 24 | bytes value = 2; 25 | bytes userMeta = 3; 26 | } -------------------------------------------------------------------------------- /src/pb/backup.rs: -------------------------------------------------------------------------------- 1 | // This file is generated by rust-protobuf 3.0.0-alpha.2. Do not edit 2 | // .proto file is parsed by protoc 23.3 3 | // @generated 4 | 5 | // https://github.com/rust-lang/rust-clippy/issues/702 6 | #![allow(unknown_lints)] 7 | #![allow(clippy::all)] 8 | 9 | #![allow(unused_attributes)] 10 | #![cfg_attr(rustfmt, rustfmt::skip)] 11 | 12 | #![allow(box_pointers)] 13 | #![allow(dead_code)] 14 | #![allow(missing_docs)] 15 | #![allow(non_camel_case_types)] 16 | #![allow(non_snake_case)] 17 | #![allow(non_upper_case_globals)] 18 | #![allow(trivial_casts)] 19 | #![allow(unused_results)] 20 | #![allow(unused_mut)] 21 | 22 | //! Generated file from `src/pb/backup.proto` 23 | 24 | /// Generated files are compatible only with the same version 25 | /// of protobuf runtime. 
26 | const _PROTOBUF_VERSION_CHECK: () = ::protobuf::VERSION_3_0_0_ALPHA_2; 27 | 28 | #[derive(PartialEq,Clone,Default)] 29 | pub struct KVPair { 30 | // message fields 31 | pub key: ::std::vec::Vec, 32 | pub value: ::std::vec::Vec, 33 | pub userMeta: ::std::vec::Vec, 34 | // special fields 35 | pub unknown_fields: ::protobuf::UnknownFields, 36 | pub cached_size: ::protobuf::rt::CachedSize, 37 | } 38 | 39 | impl<'a> ::std::default::Default for &'a KVPair { 40 | fn default() -> &'a KVPair { 41 | ::default_instance() 42 | } 43 | } 44 | 45 | impl KVPair { 46 | pub fn new() -> KVPair { 47 | ::std::default::Default::default() 48 | } 49 | 50 | fn generated_message_descriptor_data() -> ::protobuf::reflect::GeneratedMessageDescriptorData { 51 | let mut fields = ::std::vec::Vec::new(); 52 | fields.push(::protobuf::reflect::rt::v2::make_simpler_field_accessor::<_, _>( 53 | "key", 54 | |m: &KVPair| { &m.key }, 55 | |m: &mut KVPair| { &mut m.key }, 56 | )); 57 | fields.push(::protobuf::reflect::rt::v2::make_simpler_field_accessor::<_, _>( 58 | "value", 59 | |m: &KVPair| { &m.value }, 60 | |m: &mut KVPair| { &mut m.value }, 61 | )); 62 | fields.push(::protobuf::reflect::rt::v2::make_simpler_field_accessor::<_, _>( 63 | "userMeta", 64 | |m: &KVPair| { &m.userMeta }, 65 | |m: &mut KVPair| { &mut m.userMeta }, 66 | )); 67 | ::protobuf::reflect::GeneratedMessageDescriptorData::new_2::( 68 | "KVPair", 69 | 0, 70 | fields, 71 | ) 72 | } 73 | } 74 | 75 | impl ::protobuf::Message for KVPair { 76 | fn is_initialized(&self) -> bool { 77 | true 78 | } 79 | 80 | fn merge_from(&mut self, is: &mut ::protobuf::CodedInputStream<'_>) -> ::protobuf::ProtobufResult<()> { 81 | while !is.eof()? 
{ 82 | let (field_number, wire_type) = is.read_tag_unpack()?; 83 | match field_number { 84 | 1 => { 85 | if wire_type != ::protobuf::wire_format::WireTypeLengthDelimited { 86 | return ::std::result::Result::Err(::protobuf::rt::unexpected_wire_type(wire_type)); 87 | } 88 | self.key = is.read_bytes()?; 89 | }, 90 | 2 => { 91 | if wire_type != ::protobuf::wire_format::WireTypeLengthDelimited { 92 | return ::std::result::Result::Err(::protobuf::rt::unexpected_wire_type(wire_type)); 93 | } 94 | self.value = is.read_bytes()?; 95 | }, 96 | 3 => { 97 | if wire_type != ::protobuf::wire_format::WireTypeLengthDelimited { 98 | return ::std::result::Result::Err(::protobuf::rt::unexpected_wire_type(wire_type)); 99 | } 100 | self.userMeta = is.read_bytes()?; 101 | }, 102 | _ => { 103 | ::protobuf::rt::read_unknown_or_skip_group(field_number, wire_type, is, self.mut_unknown_fields())?; 104 | }, 105 | }; 106 | } 107 | ::std::result::Result::Ok(()) 108 | } 109 | 110 | // Compute sizes of nested messages 111 | #[allow(unused_variables)] 112 | fn compute_size(&self) -> u32 { 113 | let mut my_size = 0; 114 | if !self.key.is_empty() { 115 | my_size += ::protobuf::rt::bytes_size(1, &self.key); 116 | } 117 | if !self.value.is_empty() { 118 | my_size += ::protobuf::rt::bytes_size(2, &self.value); 119 | } 120 | if !self.userMeta.is_empty() { 121 | my_size += ::protobuf::rt::bytes_size(3, &self.userMeta); 122 | } 123 | my_size += ::protobuf::rt::unknown_fields_size(self.get_unknown_fields()); 124 | self.cached_size.set(my_size); 125 | my_size 126 | } 127 | 128 | fn write_to_with_cached_sizes(&self, os: &mut ::protobuf::CodedOutputStream<'_>) -> ::protobuf::ProtobufResult<()> { 129 | if !self.key.is_empty() { 130 | os.write_bytes(1, &self.key)?; 131 | } 132 | if !self.value.is_empty() { 133 | os.write_bytes(2, &self.value)?; 134 | } 135 | if !self.userMeta.is_empty() { 136 | os.write_bytes(3, &self.userMeta)?; 137 | } 138 | os.write_unknown_fields(self.get_unknown_fields())?; 139 | 
::std::result::Result::Ok(()) 140 | } 141 | 142 | fn get_cached_size(&self) -> u32 { 143 | self.cached_size.get() 144 | } 145 | 146 | fn get_unknown_fields(&self) -> &::protobuf::UnknownFields { 147 | &self.unknown_fields 148 | } 149 | 150 | fn mut_unknown_fields(&mut self) -> &mut ::protobuf::UnknownFields { 151 | &mut self.unknown_fields 152 | } 153 | 154 | fn new() -> KVPair { 155 | KVPair::new() 156 | } 157 | 158 | fn descriptor_static() -> ::protobuf::reflect::MessageDescriptor { 159 | ::protobuf::reflect::MessageDescriptor::new_generated_2(file_descriptor(), 0) 160 | } 161 | 162 | fn default_instance() -> &'static KVPair { 163 | static instance: KVPair = KVPair { 164 | key: ::std::vec::Vec::new(), 165 | value: ::std::vec::Vec::new(), 166 | userMeta: ::std::vec::Vec::new(), 167 | unknown_fields: ::protobuf::UnknownFields::new(), 168 | cached_size: ::protobuf::rt::CachedSize::new(), 169 | }; 170 | &instance 171 | } 172 | } 173 | 174 | impl ::protobuf::Clear for KVPair { 175 | fn clear(&mut self) { 176 | self.key.clear(); 177 | self.value.clear(); 178 | self.userMeta.clear(); 179 | self.unknown_fields.clear(); 180 | } 181 | } 182 | 183 | impl ::std::fmt::Debug for KVPair { 184 | fn fmt(&self, f: &mut ::std::fmt::Formatter<'_>) -> ::std::fmt::Result { 185 | ::protobuf::text_format::fmt(self, f) 186 | } 187 | } 188 | 189 | impl ::protobuf::reflect::ProtobufValue for KVPair { 190 | type RuntimeType = ::protobuf::reflect::runtime_types::RuntimeTypeMessage; 191 | } 192 | 193 | static file_descriptor_proto_data: &'static [u8] = b"\ 194 | \n\x13src/pb/backup.proto\x12\x06protos\"L\n\x06KVPair\x12\x10\n\x03key\ 195 | \x18\x01\x20\x01(\x0cR\x03key\x12\x14\n\x05value\x18\x02\x20\x01(\x0cR\ 196 | \x05value\x12\x1a\n\x08userMeta\x18\x03\x20\x01(\x0cR\x08userMetab\x06pr\ 197 | oto3\ 198 | "; 199 | 200 | /// `FileDescriptorProto` object which was a source for this generated file 201 | pub fn file_descriptor_proto() -> &'static ::protobuf::descriptor::FileDescriptorProto { 202 
| static file_descriptor_proto_lazy: ::protobuf::rt::LazyV2<::protobuf::descriptor::FileDescriptorProto> = ::protobuf::rt::LazyV2::INIT; 203 | file_descriptor_proto_lazy.get(|| { 204 | ::protobuf::Message::parse_from_bytes(file_descriptor_proto_data).unwrap() 205 | }) 206 | } 207 | 208 | /// `FileDescriptor` object which allows dynamic access to files 209 | pub fn file_descriptor() -> ::protobuf::reflect::FileDescriptor { 210 | static file_descriptor_lazy: ::protobuf::rt::LazyV2<::protobuf::reflect::GeneratedFileDescriptor> = ::protobuf::rt::LazyV2::INIT; 211 | let file_descriptor = file_descriptor_lazy.get(|| { 212 | let mut deps = ::std::vec::Vec::new(); 213 | let mut messages = ::std::vec::Vec::new(); 214 | messages.push(KVPair::generated_message_descriptor_data()); 215 | let mut enums = ::std::vec::Vec::new(); 216 | ::protobuf::reflect::GeneratedFileDescriptor::new_generated( 217 | file_descriptor_proto(), 218 | deps, 219 | messages, 220 | enums, 221 | ) 222 | }); 223 | ::protobuf::reflect::FileDescriptor::new_generated_2(file_descriptor) 224 | } 225 | -------------------------------------------------------------------------------- /src/pb/badgerpb3.proto: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2017 Dgraph Labs, Inc. and Contributors 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | // Use protos/gen.sh to generate .pb.go files. 
18 | syntax = "proto3"; 19 | 20 | package badgerpb3; 21 | 22 | message ManifestChangeSet { 23 | // A set of changes that are applied atomically. 24 | repeated ManifestChange changes = 1; 25 | } 26 | 27 | message ManifestChange { 28 | uint64 id = 1; 29 | enum Operation { 30 | CREATE = 0; 31 | DELETE = 1; 32 | } 33 | 34 | Operation op = 2; 35 | uint32 level = 3; // Only used for CREATE 36 | } -------------------------------------------------------------------------------- /src/pb/badgerpb3.rs: -------------------------------------------------------------------------------- 1 | // This file is generated by rust-protobuf 3.0.0-alpha.2. Do not edit 2 | // .proto file is parsed by protoc 23.3 3 | // @generated 4 | 5 | // https://github.com/rust-lang/rust-clippy/issues/702 6 | #![allow(unknown_lints)] 7 | #![allow(clippy::all)] 8 | 9 | #![allow(unused_attributes)] 10 | #![cfg_attr(rustfmt, rustfmt::skip)] 11 | 12 | #![allow(box_pointers)] 13 | #![allow(dead_code)] 14 | #![allow(missing_docs)] 15 | #![allow(non_camel_case_types)] 16 | #![allow(non_snake_case)] 17 | #![allow(non_upper_case_globals)] 18 | #![allow(trivial_casts)] 19 | #![allow(unused_results)] 20 | #![allow(unused_mut)] 21 | 22 | //! Generated file from `src/pb/badgerpb3.proto` 23 | 24 | /// Generated files are compatible only with the same version 25 | /// of protobuf runtime. 
26 | const _PROTOBUF_VERSION_CHECK: () = ::protobuf::VERSION_3_0_0_ALPHA_2; 27 | 28 | #[derive(PartialEq,Clone,Default)] 29 | pub struct ManifestChangeSet { 30 | // message fields 31 | pub changes: ::std::vec::Vec, 32 | // special fields 33 | pub unknown_fields: ::protobuf::UnknownFields, 34 | pub cached_size: ::protobuf::rt::CachedSize, 35 | } 36 | 37 | impl<'a> ::std::default::Default for &'a ManifestChangeSet { 38 | fn default() -> &'a ManifestChangeSet { 39 | ::default_instance() 40 | } 41 | } 42 | 43 | impl ManifestChangeSet { 44 | pub fn new() -> ManifestChangeSet { 45 | ::std::default::Default::default() 46 | } 47 | 48 | fn generated_message_descriptor_data() -> ::protobuf::reflect::GeneratedMessageDescriptorData { 49 | let mut fields = ::std::vec::Vec::new(); 50 | fields.push(::protobuf::reflect::rt::v2::make_vec_simpler_accessor::<_, _>( 51 | "changes", 52 | |m: &ManifestChangeSet| { &m.changes }, 53 | |m: &mut ManifestChangeSet| { &mut m.changes }, 54 | )); 55 | ::protobuf::reflect::GeneratedMessageDescriptorData::new_2::( 56 | "ManifestChangeSet", 57 | 0, 58 | fields, 59 | ) 60 | } 61 | } 62 | 63 | impl ::protobuf::Message for ManifestChangeSet { 64 | fn is_initialized(&self) -> bool { 65 | for v in &self.changes { 66 | if !v.is_initialized() { 67 | return false; 68 | } 69 | }; 70 | true 71 | } 72 | 73 | fn merge_from(&mut self, is: &mut ::protobuf::CodedInputStream<'_>) -> ::protobuf::ProtobufResult<()> { 74 | while !is.eof()? 
{ 75 | let (field_number, wire_type) = is.read_tag_unpack()?; 76 | match field_number { 77 | 1 => { 78 | ::protobuf::rt::read_repeated_message_into_vec(wire_type, is, &mut self.changes)?; 79 | }, 80 | _ => { 81 | ::protobuf::rt::read_unknown_or_skip_group(field_number, wire_type, is, self.mut_unknown_fields())?; 82 | }, 83 | }; 84 | } 85 | ::std::result::Result::Ok(()) 86 | } 87 | 88 | // Compute sizes of nested messages 89 | #[allow(unused_variables)] 90 | fn compute_size(&self) -> u32 { 91 | let mut my_size = 0; 92 | for value in &self.changes { 93 | let len = value.compute_size(); 94 | my_size += 1 + ::protobuf::rt::compute_raw_varint32_size(len) + len; 95 | }; 96 | my_size += ::protobuf::rt::unknown_fields_size(self.get_unknown_fields()); 97 | self.cached_size.set(my_size); 98 | my_size 99 | } 100 | 101 | fn write_to_with_cached_sizes(&self, os: &mut ::protobuf::CodedOutputStream<'_>) -> ::protobuf::ProtobufResult<()> { 102 | for v in &self.changes { 103 | ::protobuf::rt::write_message_field_with_cached_size(1, v, os)?; 104 | }; 105 | os.write_unknown_fields(self.get_unknown_fields())?; 106 | ::std::result::Result::Ok(()) 107 | } 108 | 109 | fn get_cached_size(&self) -> u32 { 110 | self.cached_size.get() 111 | } 112 | 113 | fn get_unknown_fields(&self) -> &::protobuf::UnknownFields { 114 | &self.unknown_fields 115 | } 116 | 117 | fn mut_unknown_fields(&mut self) -> &mut ::protobuf::UnknownFields { 118 | &mut self.unknown_fields 119 | } 120 | 121 | fn new() -> ManifestChangeSet { 122 | ManifestChangeSet::new() 123 | } 124 | 125 | fn descriptor_static() -> ::protobuf::reflect::MessageDescriptor { 126 | ::protobuf::reflect::MessageDescriptor::new_generated_2(file_descriptor(), 0) 127 | } 128 | 129 | fn default_instance() -> &'static ManifestChangeSet { 130 | static instance: ManifestChangeSet = ManifestChangeSet { 131 | changes: ::std::vec::Vec::new(), 132 | unknown_fields: ::protobuf::UnknownFields::new(), 133 | cached_size: ::protobuf::rt::CachedSize::new(), 134 
| }; 135 | &instance 136 | } 137 | } 138 | 139 | impl ::protobuf::Clear for ManifestChangeSet { 140 | fn clear(&mut self) { 141 | self.changes.clear(); 142 | self.unknown_fields.clear(); 143 | } 144 | } 145 | 146 | impl ::std::fmt::Debug for ManifestChangeSet { 147 | fn fmt(&self, f: &mut ::std::fmt::Formatter<'_>) -> ::std::fmt::Result { 148 | ::protobuf::text_format::fmt(self, f) 149 | } 150 | } 151 | 152 | impl ::protobuf::reflect::ProtobufValue for ManifestChangeSet { 153 | type RuntimeType = ::protobuf::reflect::runtime_types::RuntimeTypeMessage; 154 | } 155 | 156 | #[derive(PartialEq,Clone,Default)] 157 | pub struct ManifestChange { 158 | // message fields 159 | pub id: u64, 160 | pub op: ::protobuf::ProtobufEnumOrUnknown, 161 | pub level: u32, 162 | // special fields 163 | pub unknown_fields: ::protobuf::UnknownFields, 164 | pub cached_size: ::protobuf::rt::CachedSize, 165 | } 166 | 167 | impl<'a> ::std::default::Default for &'a ManifestChange { 168 | fn default() -> &'a ManifestChange { 169 | ::default_instance() 170 | } 171 | } 172 | 173 | impl ManifestChange { 174 | pub fn new() -> ManifestChange { 175 | ::std::default::Default::default() 176 | } 177 | 178 | fn generated_message_descriptor_data() -> ::protobuf::reflect::GeneratedMessageDescriptorData { 179 | let mut fields = ::std::vec::Vec::new(); 180 | fields.push(::protobuf::reflect::rt::v2::make_simpler_field_accessor::<_, _>( 181 | "id", 182 | |m: &ManifestChange| { &m.id }, 183 | |m: &mut ManifestChange| { &mut m.id }, 184 | )); 185 | fields.push(::protobuf::reflect::rt::v2::make_simpler_field_accessor::<_, _>( 186 | "op", 187 | |m: &ManifestChange| { &m.op }, 188 | |m: &mut ManifestChange| { &mut m.op }, 189 | )); 190 | fields.push(::protobuf::reflect::rt::v2::make_simpler_field_accessor::<_, _>( 191 | "level", 192 | |m: &ManifestChange| { &m.level }, 193 | |m: &mut ManifestChange| { &mut m.level }, 194 | )); 195 | ::protobuf::reflect::GeneratedMessageDescriptorData::new_2::( 196 | 
"ManifestChange", 197 | 1, 198 | fields, 199 | ) 200 | } 201 | } 202 | 203 | impl ::protobuf::Message for ManifestChange { 204 | fn is_initialized(&self) -> bool { 205 | true 206 | } 207 | 208 | fn merge_from(&mut self, is: &mut ::protobuf::CodedInputStream<'_>) -> ::protobuf::ProtobufResult<()> { 209 | while !is.eof()? { 210 | let (field_number, wire_type) = is.read_tag_unpack()?; 211 | match field_number { 212 | 1 => { 213 | if wire_type != ::protobuf::wire_format::WireTypeVarint { 214 | return ::std::result::Result::Err(::protobuf::rt::unexpected_wire_type(wire_type)); 215 | } 216 | self.id = is.read_uint64()?; 217 | }, 218 | 2 => { 219 | if wire_type != ::protobuf::wire_format::WireTypeVarint { 220 | return ::std::result::Result::Err(::protobuf::rt::unexpected_wire_type(wire_type)); 221 | } 222 | self.op = is.read_enum_or_unknown()?; 223 | }, 224 | 3 => { 225 | if wire_type != ::protobuf::wire_format::WireTypeVarint { 226 | return ::std::result::Result::Err(::protobuf::rt::unexpected_wire_type(wire_type)); 227 | } 228 | self.level = is.read_uint32()?; 229 | }, 230 | _ => { 231 | ::protobuf::rt::read_unknown_or_skip_group(field_number, wire_type, is, self.mut_unknown_fields())?; 232 | }, 233 | }; 234 | } 235 | ::std::result::Result::Ok(()) 236 | } 237 | 238 | // Compute sizes of nested messages 239 | #[allow(unused_variables)] 240 | fn compute_size(&self) -> u32 { 241 | let mut my_size = 0; 242 | if self.id != 0 { 243 | my_size += ::protobuf::rt::value_size(1, self.id, ::protobuf::wire_format::WireTypeVarint); 244 | } 245 | if self.op != ::protobuf::ProtobufEnumOrUnknown::new(manifest_change::Operation::CREATE) { 246 | my_size += ::protobuf::rt::enum_or_unknown_size(2, self.op); 247 | } 248 | if self.level != 0 { 249 | my_size += ::protobuf::rt::value_size(3, self.level, ::protobuf::wire_format::WireTypeVarint); 250 | } 251 | my_size += ::protobuf::rt::unknown_fields_size(self.get_unknown_fields()); 252 | self.cached_size.set(my_size); 253 | my_size 254 | } 255 
| 256 | fn write_to_with_cached_sizes(&self, os: &mut ::protobuf::CodedOutputStream<'_>) -> ::protobuf::ProtobufResult<()> { 257 | if self.id != 0 { 258 | os.write_uint64(1, self.id)?; 259 | } 260 | if self.op != ::protobuf::ProtobufEnumOrUnknown::new(manifest_change::Operation::CREATE) { 261 | os.write_enum(2, ::protobuf::ProtobufEnumOrUnknown::value(&self.op))?; 262 | } 263 | if self.level != 0 { 264 | os.write_uint32(3, self.level)?; 265 | } 266 | os.write_unknown_fields(self.get_unknown_fields())?; 267 | ::std::result::Result::Ok(()) 268 | } 269 | 270 | fn get_cached_size(&self) -> u32 { 271 | self.cached_size.get() 272 | } 273 | 274 | fn get_unknown_fields(&self) -> &::protobuf::UnknownFields { 275 | &self.unknown_fields 276 | } 277 | 278 | fn mut_unknown_fields(&mut self) -> &mut ::protobuf::UnknownFields { 279 | &mut self.unknown_fields 280 | } 281 | 282 | fn new() -> ManifestChange { 283 | ManifestChange::new() 284 | } 285 | 286 | fn descriptor_static() -> ::protobuf::reflect::MessageDescriptor { 287 | ::protobuf::reflect::MessageDescriptor::new_generated_2(file_descriptor(), 1) 288 | } 289 | 290 | fn default_instance() -> &'static ManifestChange { 291 | static instance: ManifestChange = ManifestChange { 292 | id: 0, 293 | op: ::protobuf::ProtobufEnumOrUnknown::from_i32(0), 294 | level: 0, 295 | unknown_fields: ::protobuf::UnknownFields::new(), 296 | cached_size: ::protobuf::rt::CachedSize::new(), 297 | }; 298 | &instance 299 | } 300 | } 301 | 302 | impl ::protobuf::Clear for ManifestChange { 303 | fn clear(&mut self) { 304 | self.id = 0; 305 | self.op = ::protobuf::ProtobufEnumOrUnknown::new(manifest_change::Operation::CREATE); 306 | self.level = 0; 307 | self.unknown_fields.clear(); 308 | } 309 | } 310 | 311 | impl ::std::fmt::Debug for ManifestChange { 312 | fn fmt(&self, f: &mut ::std::fmt::Formatter<'_>) -> ::std::fmt::Result { 313 | ::protobuf::text_format::fmt(self, f) 314 | } 315 | } 316 | 317 | impl ::protobuf::reflect::ProtobufValue for 
ManifestChange { 318 | type RuntimeType = ::protobuf::reflect::runtime_types::RuntimeTypeMessage; 319 | } 320 | 321 | /// Nested message and enums of message `ManifestChange` 322 | pub mod manifest_change { 323 | #[derive(Clone,Copy,PartialEq,Eq,Debug,Hash)] 324 | pub enum Operation { 325 | CREATE = 0, 326 | DELETE = 1, 327 | } 328 | 329 | impl ::protobuf::ProtobufEnum for Operation { 330 | fn value(&self) -> i32 { 331 | *self as i32 332 | } 333 | 334 | fn from_i32(value: i32) -> ::std::option::Option { 335 | match value { 336 | 0 => ::std::option::Option::Some(Operation::CREATE), 337 | 1 => ::std::option::Option::Some(Operation::DELETE), 338 | _ => ::std::option::Option::None 339 | } 340 | } 341 | 342 | fn values() -> &'static [Self] { 343 | static values: &'static [Operation] = &[ 344 | Operation::CREATE, 345 | Operation::DELETE, 346 | ]; 347 | values 348 | } 349 | 350 | fn enum_descriptor_static() -> ::protobuf::reflect::EnumDescriptor { 351 | ::protobuf::reflect::EnumDescriptor::new_generated_2(super::file_descriptor(), 0) 352 | } 353 | } 354 | 355 | impl ::std::default::Default for Operation { 356 | fn default() -> Self { 357 | Operation::CREATE 358 | } 359 | } 360 | 361 | impl ::protobuf::reflect::ProtobufValue for Operation { 362 | type RuntimeType = ::protobuf::reflect::runtime_types::RuntimeTypeEnum; 363 | } 364 | 365 | impl Operation { 366 | pub(in super) fn generated_enum_descriptor_data() -> ::protobuf::reflect::GeneratedEnumDescriptorData { 367 | ::protobuf::reflect::GeneratedEnumDescriptorData::new_2::("ManifestChange.Operation", 0) 368 | } 369 | } 370 | } 371 | 372 | static file_descriptor_proto_data: &'static [u8] = b"\ 373 | \n\x16src/pb/badgerpb3.proto\x12\tbadgerpb3\"H\n\x11ManifestChangeSet\ 374 | \x123\n\x07changes\x18\x01\x20\x03(\x0b2\x19.badgerpb3.ManifestChangeR\ 375 | \x07changes\"\x90\x01\n\x0eManifestChange\x12\x0e\n\x02id\x18\x01\x20\ 376 | \x01(\x04R\x02id\x123\n\x02op\x18\x02\x20\x01(\x0e2#.badgerpb3.ManifestC\ 377 | 
hange.OperationR\x02op\x12\x14\n\x05level\x18\x03\x20\x01(\rR\x05level\"\ 378 | #\n\tOperation\x12\n\n\x06CREATE\x10\0\x12\n\n\x06DELETE\x10\x01b\x06pro\ 379 | to3\ 380 | "; 381 | 382 | /// `FileDescriptorProto` object which was a source for this generated file 383 | pub fn file_descriptor_proto() -> &'static ::protobuf::descriptor::FileDescriptorProto { 384 | static file_descriptor_proto_lazy: ::protobuf::rt::LazyV2<::protobuf::descriptor::FileDescriptorProto> = ::protobuf::rt::LazyV2::INIT; 385 | file_descriptor_proto_lazy.get(|| { 386 | ::protobuf::Message::parse_from_bytes(file_descriptor_proto_data).unwrap() 387 | }) 388 | } 389 | 390 | /// `FileDescriptor` object which allows dynamic access to files 391 | pub fn file_descriptor() -> ::protobuf::reflect::FileDescriptor { 392 | static file_descriptor_lazy: ::protobuf::rt::LazyV2<::protobuf::reflect::GeneratedFileDescriptor> = ::protobuf::rt::LazyV2::INIT; 393 | let file_descriptor = file_descriptor_lazy.get(|| { 394 | let mut deps = ::std::vec::Vec::new(); 395 | let mut messages = ::std::vec::Vec::new(); 396 | messages.push(ManifestChangeSet::generated_message_descriptor_data()); 397 | messages.push(ManifestChange::generated_message_descriptor_data()); 398 | let mut enums = ::std::vec::Vec::new(); 399 | enums.push(manifest_change::Operation::generated_enum_descriptor_data()); 400 | ::protobuf::reflect::GeneratedFileDescriptor::new_generated( 401 | file_descriptor_proto(), 402 | deps, 403 | messages, 404 | enums, 405 | ) 406 | }); 407 | ::protobuf::reflect::FileDescriptor::new_generated_2(file_descriptor) 408 | } 409 | -------------------------------------------------------------------------------- /src/pb/mod.rs: -------------------------------------------------------------------------------- 1 | // @generated 2 | 3 | use protobuf::Message; 4 | use crate::manifest::ManifestChangeBuilder; 5 | use crate::pb::badgerpb3::ManifestChangeSet; 6 | use crate::Result; 7 | // use quick_protobuf::MessageWrite; 8 | 9 | pub 
mod badgerpb3;
pub mod backup;

/// Serializes `mf_set` into a freshly allocated protobuf byte buffer.
///
/// # Panics
/// Panics if `write_to_vec` reports an error.
pub(crate) fn convert_manifest_set_to_vec(mf_set: &ManifestChangeSet) -> Vec<u8> {
    let mut buffer = Vec::new();
    mf_set
        .write_to_vec(&mut buffer)
        .expect("failed to serialize ManifestChangeSet");
    buffer
}

/// Decodes a `ManifestChangeSet` from protobuf bytes.
///
/// # Errors
/// A protobuf decoding failure is converted into `crate::Error` from its
/// `Display` text.
pub(crate) fn parse_manifest_set_from_vec(buffer: &[u8]) -> Result<ManifestChangeSet> {
    <ManifestChangeSet as protobuf::Message>::parse_from_bytes(buffer)
        .map_err(|err| crate::Error::from(err.to_string()))
}

#[test]
fn enc_dec() {
    let mut mf = ManifestChangeSet::default();
    mf.changes.push(ManifestChangeBuilder::new(1).build());
    let buffer = convert_manifest_set_to_vec(&mf);
    let got = parse_manifest_set_from_vec(&buffer).unwrap();
    assert_eq!(got, mf);
}

-------------------------------------------------------------------------------- /src/skl/alloc.rs: --------------------------------------------------------------------------------
use crate::{cals_size_with_align};
use std::fmt::Debug;
use std::mem::{ManuallyDrop};
use std::sync::atomic::{AtomicUsize, Ordering};

/// Alignment mask (`alignment - 1`) for 8-byte alignment; callers round a size
/// up with `(size + PtrAlign) & !PtrAlign`.
pub(crate) const PtrAlign: usize = 7;

/// A bump allocator over one contiguous byte buffer, addressed by offsets.
pub trait Allocate: Send + Sync {
    /// Reserves `size` bytes and returns the offset of the first reserved byte.
    fn alloc(&self, size: usize) -> usize;
    /// Reserves `size` bytes from the opposite end of the buffer.
    /// The default implementation is a stub and panics.
    #[inline]
    fn alloc_rev(&self, size: usize) -> usize {
        todo!()
    }
    /// NOTE(review): intended meaning is not visible here; `DoubleAlloc` leaves
    /// it unimplemented.
    fn size(&self) -> usize;
    /// Returns a raw pointer to the byte at `offset`, typed as `T`.
    ///
    /// # Safety
    /// The caller must ensure `offset` lies inside the buffer, that the
    /// resulting pointer is suitably aligned for `T`, and that any access made
    /// through it does not conflict with other accesses to the same bytes.
    unsafe fn get_mut<T>(&self, offset: usize) -> *mut T;
    /// Returns the offset of `ptr` relative to the start of the buffer.
    fn offset<T>(&self, ptr: *const T) -> usize;
    /// Number of bytes currently in use.
    fn len(&self) -> usize;
    /// Total capacity of the buffer in bytes.
    fn cap(&self) -> usize;
}

/// Double-ended bump allocator: `head` grows upward from the front of the
/// buffer (`alloc`) and `tail` grows downward from the back (`alloc_rev`).
/// The free region is always `[head, tail)`.
#[derive(Debug)]
pub struct DoubleAlloc {
    pub(crate) head: AtomicUsize,
    pub(crate) tail: AtomicUsize,
    ptr: ManuallyDrop<Vec<u8>>,
    _cap: usize,
}

unsafe impl Send for DoubleAlloc {}

impl Drop for DoubleAlloc {
    fn drop(&mut self) {
        // SAFETY: `ptr` is dropped exactly once, here, and is never touched
        // again afterwards because `self` is being destroyed.
        unsafe {
            ManuallyDrop::drop(&mut self.ptr);
        }
    }
}

impl Allocate for DoubleAlloc {
    /// Reserves `size` bytes at the front of the free region.
    ///
    /// # Panics
    /// Panics with "less memory" when the free region cannot hold `size` bytes.
    fn alloc(&self, size: usize) -> usize {
        // info!("{}", free_count);
        assert!(self.free_count() > size, "less memory");
        let offset = self.head.fetch_add(size, Ordering::SeqCst);
        // The check above and the reservation are two separate atomic steps, so
        // concurrent callers may all pass the check and together overshoot.
        // Re-validate against the tail *after* reserving: with SeqCst on both
        // counters, whenever two reservations overlap at least one of the two
        // callers observes it here and panics instead of using the bytes.
        let tail = self.tail.load(Ordering::SeqCst);
        assert!(
            offset.checked_add(size).map_or(false, |end| end <= tail),
            "less memory"
        );
        offset
    }

    /// Reserves `size` bytes at the back of the free region and returns the
    /// offset of the first reserved byte.
    ///
    /// # Panics
    /// Panics with "less memory" when the free region cannot hold `size` bytes.
    fn alloc_rev(&self, size: usize) -> usize {
        assert!(self.free_count() > size, "less memory");
        let end = self.tail.fetch_sub(size, Ordering::SeqCst);
        // `checked_sub` guards against wrap-around if racing callers pushed the
        // tail below `size`.
        let start = end.checked_sub(size).expect("less memory");
        // Same post-reservation validation as in `alloc`, mirrored.
        let head = self.head.load(Ordering::SeqCst);
        assert!(head <= start, "less memory");
        start
    }

    fn size(&self) -> usize {
        todo!()
    }

    unsafe fn get_mut<T>(&self, offset: usize) -> *mut T {
        // NOTE(review): the pointer is derived from a shared borrow of the
        // buffer; writing through it relies on callers never touching
        // overlapping regions. Confirm this is acceptable (or wrap the buffer
        // in `UnsafeCell`).
        let ptr = self.ptr.as_ptr() as *mut u8;
        ptr.add(offset).cast::<T>()
    }

    fn offset<T>(&self, ptr: *const T) -> usize {
        let base_ptr = self.ptr.as_ptr() as usize;
        let offset_ptr = ptr as usize;
        offset_ptr - base_ptr
    }

    /// Bytes in use: capacity minus the free region between `head` and `tail`.
    fn len(&self) -> usize {
        self.cap() - (self.tail.load(Ordering::SeqCst) - self.head.load(Ordering::SeqCst))
    }

    fn cap(&self) -> usize {
        self._cap
    }
}

impl DoubleAlloc {
    /// Creates an allocator whose capacity is `n` rounded by
    /// `cals_size_with_align(n, PtrAlign)`. `head` starts at `PtrAlign + 1`,
    /// so offset 0 is never handed out by `alloc`.
    // NOTE(review): a `Vec<u8>` only guarantees 1-byte alignment of its buffer;
    // typed access through `get_mut::<T>` assumes the base address is aligned
    // well enough for `T` — confirm.
    pub(crate) fn new(n: usize) -> DoubleAlloc {
        let n = cals_size_with_align(n, PtrAlign);
        assert_eq!(n % (PtrAlign + 1), 0);
        DoubleAlloc {
            head: AtomicUsize::new(PtrAlign + 1),
            tail: AtomicUsize::new(n),
            ptr: ManuallyDrop::new(vec![0u8; n]),
            _cap: n,
        }
    }

    /// Size of the free region `[head, tail)`.
    ///
    /// # Panics
    /// Panics if `head` is not strictly below `tail`.
    fn free_count(&self) -> usize {
        let head = self.head.load(Ordering::SeqCst);
        let tail = self.tail.load(Ordering::SeqCst);
        assert!(head < tail, "head({}) should be lt tail({})", head, tail);
        tail - head
    }
}

#[test]
fn t() {
    let size = (1 + 0) & !0;
    println!("{}, {}, {}", (1 + 0) & !0, (0 + 0) & !0, (3 + 0) & !0);
}

-------------------------------------------------------------------------------- /src/skl/arena.rs: --------------------------------------------------------------------------------
use crate::skl::node::Node;
use crate::skl::PtrAlign;
use crate::y::ValueStruct;
use crate::{Allocate, DoubleAlloc};
use std::mem::size_of;
use std::ptr::{slice_from_raw_parts, slice_from_raw_parts_mut, NonNull};
use tracing::info;

/// How the SkipList arena size is calculated:
/// 8 (reserved zero bytes) + key + value + node * N
///
/// `Arena` should be lock-free.
#[derive(Debug)]
pub struct Arena {
    alloc: DoubleAlloc,
}

impl Arena {
    /// Creates an arena able to hold roughly `n` bytes.
    ///
    /// # Panics
    /// Panics if `n` is zero.
    pub(crate) fn new(n: usize) -> Self {
        assert!(n > 0);
        // Don't store data at position 0, in order to reserve offset = 0 as a
        // kind of nil pointer.
        Self {
            alloc: DoubleAlloc::new(n + 1),
        }
    }

    /// Number of bytes in use (truncated to `u32`).
    pub(crate) fn size(&self) -> u32 {
        self.alloc.len() as u32
    }

    /// Total capacity in bytes (truncated to `u32`).
    pub(crate) fn cap(&self) -> u32 {
        self.alloc.cap() as u32
    }

    /// Bytes still available: `cap() - size()`.
    pub(crate) fn free_size(&self) -> u32 {
        self.cap() - self.size()
    }

    /// Not implemented yet; calling it panics.
    pub(crate) fn valid(&self) -> bool {
        // !self.slice.ptr.is_empty()
        todo!()
    }

    // Returns a pointer to the node located at offset. If the offset is
    // zero, then the null node pointer is returned.
    pub(crate) fn get_node(&self, offset: usize) -> Option<&Node> {
        if offset == 0 {
            return None;
        }
        // SAFETY: relies on the caller passing an offset previously returned by
        // `put_node`, i.e. one that addresses a `Node` inside the buffer.
        unsafe { self.alloc.get_mut::<Node>(offset).as_ref() }
    }

    // Mutable counterpart of `get_node`.
    // NOTE(review): this hands out `&mut Node` from `&self`; callers must not
    // hold two references to the same node at once.
    pub(crate) fn get_mut_node(&self, offset: usize) -> Option<&mut Node> {
        if offset == 0 {
            return None;
        }
        // SAFETY: same requirement as `get_node`.
        unsafe { self.alloc.get_mut::<Node>(offset).as_mut() }
    }

    // Copies `key` into the arena and returns its start location.
    pub(crate) fn put_key(&self, key: &[u8]) -> u32 {
        let offset = self.alloc.alloc_rev(key.len());
        // SAFETY: `alloc_rev` just reserved `key.len()` bytes starting at
        // `offset`, so the destination slice lies inside the buffer.
        let buffer = unsafe { self.alloc.get_mut::<u8>(offset) };
        let buffer = unsafe { &mut *slice_from_raw_parts_mut(buffer, key.len()) };
        buffer.copy_from_slice(key);
        offset as u32
    }

    // Put will *copy* val into arena.
To make better use of this, reuse your input 71 | // val buffer. Returns an offset into buf. User is responsible for remembering 72 | // size of val. We could also store this size inside arena but the encoding and 73 | // decoding will incur some overhead. 74 | pub(crate) fn put_val(&self, v: &ValueStruct) -> (u32, u16) { 75 | let buf: Vec = v.into(); 76 | let offset = self.put_key(buf.as_slice()); 77 | (offset, buf.len() as u16) 78 | } 79 | 80 | // Returns byte slice at offset. 81 | pub(crate) fn get_key(&self, offset: u32, size: u16) -> &[u8] { 82 | let buffer = unsafe { self.alloc.get_mut::(offset as usize) }; 83 | unsafe { &*slice_from_raw_parts(buffer, size as usize) } 84 | } 85 | 86 | // Returns byte slice at offset. The given size should be just the value 87 | // size and should NOT include the meta bytes. 88 | pub(crate) fn get_val(&self, offset: u32, size: u16) -> ValueStruct { 89 | let buffer = self.get_key(offset, size); 90 | ValueStruct::from(buffer) 91 | } 92 | 93 | // Return byte slice at offset. 94 | // FIXME: 95 | pub(crate) fn put_node(&self, _height: isize) -> u32 { 96 | let offset = self.alloc.alloc(Node::align_size()); 97 | offset as u32 98 | } 99 | 100 | // Returns the offset of `node` in the arena. If the `node` pointer is 101 | // nil, then the zero offset is returned. 
pub(crate) fn get_node_offset(&self, node: *const Node) -> usize {
        if node.is_null() {
            return 0;
        }
        self.alloc.offset(node)
    }

    // Returns a non-null raw pointer to this arena. No data is copied.
    pub(crate) fn copy(&self) -> NonNull<Self> {
        // A reference is never null, so the conversion cannot fail.
        NonNull::from(self)
    }
}

#[cfg(test)]
mod tests {
    use crate::skl::{PtrAlign, MAX_HEIGHT};
    use crate::test_util::tracing_log;
    use crate::{cals_size_with_align, Arena, Node, SkipList, ValueStruct};
    use log::info;
    use log::kv::{Key, value};
    use prometheus::core::AtomicU64;
    use rand::{random, thread_rng, Rng};
    use std::ptr;
    use std::sync::atomic::Ordering;
    use std::sync::Arc;
    use std::thread::spawn;
    use std::time::Duration;

    #[test]
    fn t_arena_key() {
        let arena = Arena::new(1 << 20);
        let keys = vec![vec![1, 2, 3], vec![4, 5, 6, 7, 90]];
        let got: Vec<u32> = keys.iter().map(|key| arena.put_key(key)).collect();
        for (offset, expected) in got.iter().zip(keys.iter()) {
            let key = arena.get_key(*offset, expected.len() as u16);
            assert_eq!(key, expected.as_slice());
        }
    }

    #[test]
    fn t_arena_value() {
        let arena = Arena::new(1 << 20);
        let v = vec![1, 2, 3, 4, 5, 6, 7, 8, 9, 10];
        let value = ValueStruct {
            meta: 1,
            user_meta: 1,
            cas_counter: 2,
            value: v,
        };
        let (start, n) = arena.put_val(&value);
        let load_value = arena.get_val(start, n);
        assert_eq!(value, load_value);
    }

    #[test]
    fn t_arena_store_node() {
        let arena = Arena::new(1 << 20);
        let mut starts = vec![];
        for i in 0..5 {
            let start = arena.put_node(i);
            let node = arena.get_mut_node(start as usize).unwrap();
            node.height = i as u16;
            // node.value.fetch_add(i as u64, atomic::Ordering::Relaxed);
            starts.push((i, start));
        }

        for (i, start) in starts {
            let node = arena.get_mut_node(start as usize).unwrap();
            // let value = node.value.load(atomic::Ordering::Relaxed);
            assert_eq!(node.height, i as u16);
            // assert_eq!(value, i as u64);
        }

        let second_node = arena.get_node(Node::size()).unwrap();
        let offset = arena.get_node_offset(second_node);
        assert_eq!(offset, Node::size());
    }

    #[test]
    fn t_arena_currency() {
        let arena = Arc::new(Arena::new(1 << 20));
        let mut waits = vec![];
        for _i in 0..100 {
            let arena = Arc::clone(&arena);
            waits.push(spawn(move || arena.put_key(b"abc")));
        }

        let mut offsets = waits
            .into_iter()
            .map(|join| join.join().unwrap())
            .collect::<Vec<_>>();
        offsets.sort();
        println!("offsets: {:?}", offsets);
        // Every concurrent reservation must have received its own region.
        assert_eq!(offsets.len(), 100);
        assert!(offsets.windows(2).all(|pair| pair[0] != pair[1]));
    }

    #[test]
    fn t_arena_memory1() {
        let arena = Arena::new(1 << 20);
        struct Item<'a> {
            key: Vec<u8>,
            key_offset: usize,
            value: ValueStruct,
            value_offset: usize,
            node: &'a Node,
            node_offset: usize,
        }
        let mut kv = vec![];
        for i in 0..1119000 {
            let key = vec![1u8; random::<usize>() % 18];
            let value = vec![1u8; random::<usize>() % 10];
            let value = ValueStruct::new(value, 9, 0, 0);
            // Stop before the arena runs out: each step below allocates.
            if arena.cap() - 200 < arena.size() {
                break;
            }
            let key_offset = arena.put_key(&key);
            if arena.cap() - 200 < arena.size() {
                break;
            }
            let (value_offset, _) = arena.put_val(&value);
            if arena.cap() - 200 < arena.size() {
                break;
            }
            let offset = arena.put_node(0);
            let node = arena.get_mut_node(offset as usize).unwrap();
            node.height = 12;
            node.key_offset = key_offset;
            node.key_size = key.len() as u16;
            // node.value.store(10, Ordering::SeqCst);
            for slot in node.tower.iter() {
                slot.store(20, Ordering::SeqCst);
            }
            //println!("{}, {}, {}, {:?}", key_offset, value_offset, offset, node.tower);
            kv.push(Item {
                key: b"".to_vec(),
                key_offset: 0,
                value: ValueStruct::default(),
                value_offset: 0,
                node,
                node_offset: offset as usize,
            })
        }
        //
        for el in kv.into_iter().enumerate() {
            let node = arena.get_node(el.1.node_offset).unwrap();
            //println!("{}, {:?}", el.0, node);
        }
    }

    #[test]
    fn t_arena_memory_cals() {
        tracing_log();
        let st = SkipList::new(1 << 9);
        let mut rng = thread_rng();
        for i in 0..1000000 {
            let mut key = vec![1u8; random::<usize>() % 100];
            rng.fill(&mut key[..]);
            let value = vec![1u8; random::<usize>() % 10];
            let value = ValueStruct::new(value, 9, 0, 0);
            if st.arena.free_size() <= 2 * (key.len() + value.size() + Node::size()) as u32 {
                info!("skip it");
                return;
            }
            st.put(&key, value.clone());
            // info!(
            //     " key_size: {}, value_size: {}, node_size: {}, cap:{}, len:{}, free:{}, head:{}, tail:{}",
            //     key.len(),
            //     value.size(),
            //     Node::size(),
            //     st.arena.cap(),
            //     st.arena.size(),
            //     st.arena.free_size(),
            //     st.arena.alloc.head.load(Ordering::SeqCst),
            //     st.arena.alloc.tail.load(Ordering::SeqCst),
            // );
            // tokio::time::sleep(Duration::from_millis(200)).await;
        }
    }
}

-------------------------------------------------------------------------------- /src/skl/cursor.rs: --------------------------------------------------------------------------------
use crate::skl::{node::Node, skip::SkipList};
use crate::y::iterator::{KeyValue, Xiterator};
use crate::y::ValueStruct;
use std::cell::RefCell;

/// An iterator over `SkipList` object. For new objects, you just
/// need to initialize Iterator.List.
8 | pub struct Cursor<'a> { 9 | pub(crate) list: &'a SkipList, 10 | item: RefCell>, 11 | id: String, 12 | } 13 | 14 | impl<'a> Cursor<'a> { 15 | pub fn new(list: &'a SkipList) -> Cursor<'a> { 16 | Cursor { 17 | list, 18 | item: RefCell::new(Some(list.get_head())), 19 | id: format!("cursor"), 20 | } 21 | } 22 | 23 | /// Returns true if the iterator is positioned at a valid node. 24 | pub fn valid(&self) -> bool { 25 | self.item 26 | .borrow() 27 | .map_or(false, |node| !std::ptr::eq(node, self.list.get_head())) 28 | } 29 | 30 | /// Returns the key at the current position. 31 | pub fn key(&self) -> &[u8] { 32 | let node = self.item.borrow().unwrap(); 33 | self.list 34 | .arena_ref() 35 | .get_key(node.key_offset, node.key_size) 36 | } 37 | 38 | /// Return value. 39 | pub fn value(&self) -> ValueStruct { 40 | let node = self.item.borrow().unwrap(); 41 | let (value_offset, val_size) = node.get_value_offset(); 42 | self.list.arena_ref().get_val(value_offset, val_size) 43 | } 44 | 45 | /// Advances to the next position. 46 | pub fn next(&'a self) -> Option<&Node> { 47 | //assert!(self.valid()); 48 | let next = self.list.get_next(self.item.borrow().unwrap(), 0); 49 | *self.item.borrow_mut() = next; 50 | next 51 | } 52 | 53 | /// Advances to the previous position. 54 | pub fn prev(&'a self) -> Option<&Node> { 55 | //assert!(self.valid()); 56 | let (node, _) = self.list.find_near(self.key(), true, false); 57 | *self.item.borrow_mut() = node; 58 | node 59 | } 60 | 61 | /// Advance to the first entry with a key >= target. 62 | pub fn seek(&'a self, target: &[u8]) -> Option<&Node> { 63 | let (node, _) = self.list.find_near(target, false, true); // find >=. 64 | *self.item.borrow_mut() = node; 65 | node 66 | } 67 | 68 | /// Finds an entry with key <= target. 69 | pub fn seek_for_prev(&'a self, target: &[u8]) -> Option<&Node> { 70 | let (node, _) = self.list.find_near(target, true, true); // find <=. 
71 | *self.item.borrow_mut() = node; 72 | node 73 | } 74 | 75 | /// Seeks position at the first entry in list. 76 | /// Final state of iterator is Valid() if list is not empty. 77 | pub fn seek_for_first(&'a self) -> Option<&'a Node> { 78 | let node = self.list.get_next(self.list.get_head(), 0); 79 | *self.item.borrow_mut() = node; 80 | node 81 | } 82 | 83 | /// Seeks position at the last entry in list. 84 | /// Final state of iterator is Valid() iff list is not empty. 85 | pub fn seek_for_last(&'a self) -> Option<&Node> { 86 | let node = unsafe { self.list.find_last() }; 87 | *self.item.borrow_mut() = node; 88 | node 89 | } 90 | 91 | // Must be call for every `Cursor` 92 | pub fn close(&self) { 93 | self.list.decr_ref(); 94 | } 95 | 96 | fn _peek(&self) -> Option<&'a Node> { 97 | let node = self.item.borrow(); 98 | if node.is_none() { 99 | return None; 100 | } 101 | let node = node.unwrap(); 102 | if std::ptr::eq(node, self.list.get_head()) { 103 | return None; 104 | } 105 | Some(node) 106 | } 107 | } 108 | 109 | pub struct CursorReverse<'a> { 110 | iter: &'a Cursor<'a>, 111 | reversed: RefCell, 112 | } 113 | 114 | impl<'a> Xiterator for CursorReverse<'a> { 115 | type Output = &'a Node; 116 | fn next(&self) -> Option { 117 | if !*(self.reversed.borrow()) { 118 | self.iter.next() 119 | } else { 120 | self.iter.prev() 121 | } 122 | } 123 | 124 | fn rewind(&self) -> Option { 125 | if !*(self.reversed.borrow()) { 126 | self.iter.seek_for_first() 127 | } else { 128 | self.iter.seek_for_last() 129 | } 130 | } 131 | 132 | fn seek(&self, key: &[u8]) -> Option { 133 | if !*(self.reversed.borrow()) { 134 | self.iter.seek(key) 135 | } else { 136 | self.iter.seek_for_prev(key) 137 | } 138 | } 139 | 140 | fn peek(&self) -> Option { 141 | self.iter._peek() 142 | } 143 | 144 | fn id(&self) -> String { 145 | self.iter.id.clone() 146 | } 147 | } 148 | 149 | impl KeyValue for CursorReverse<'_> { 150 | fn key(&self) -> &[u8] { 151 | self.iter.key() 152 | } 153 | 154 | fn 
value(&self) -> ValueStruct { 155 | self.iter.value() 156 | } 157 | } 158 | 159 | #[test] 160 | fn t_cursor() {} 161 | -------------------------------------------------------------------------------- /src/skl/mod.rs: -------------------------------------------------------------------------------- 1 | mod alloc; 2 | mod arena; 3 | mod cursor; 4 | mod node; 5 | mod skip; 6 | 7 | pub use alloc::*; 8 | pub use arena::Arena; 9 | pub use cursor::Cursor; 10 | pub use node::Node; 11 | pub use skip::*; 12 | 13 | const MAX_HEIGHT: usize = 20; 14 | const HEIGHT_INCREASE: u32 = u32::MAX / 3; 15 | -------------------------------------------------------------------------------- /src/skl/node.rs: -------------------------------------------------------------------------------- 1 | use crate::skl::arena::Arena; 2 | use crate::skl::{MAX_HEIGHT, PtrAlign}; 3 | use crate::y::ValueStruct; 4 | use std::mem::{align_of, size_of, size_of_val}; 5 | use std::sync::atomic::{AtomicU32, AtomicU64, Ordering}; 6 | 7 | #[derive(Debug)] 8 | #[repr(C)] 9 | pub struct Node { 10 | // A byte slice is 24 bytes. We are trying to save space here. 11 | // immutable. No need to lock to access key. 12 | pub(crate) key_offset: u32, 13 | // immutable. No need to lock to access key. 14 | pub(crate) key_size: u16, 15 | 16 | // Height of the tower. 17 | pub(crate) height: u16, 18 | 19 | // parts of the value are encoded as a single uint64 so that it 20 | // can be atomically loaded and stored: 21 | // value offset: uint32 (bits 0-31) 22 | // value size : uint16 (bits 32-47) 23 | pub(crate) value: AtomicU64, 24 | 25 | // Most nodes do not need to use the full height of the tower, since the 26 | // probability of each successive level decreases exponentially, Because 27 | // these elements are never accessed, the do not need to be allocated. 28 | // is deliberately truncated to not include unneeded tower elements. 29 | // 30 | // All accesses to elements should use CAS operations, with no need to lock. 
31 | pub(crate) tower: [AtomicU32; MAX_HEIGHT], 32 | } 33 | 34 | impl Default for Node { 35 | fn default() -> Self { 36 | const TOWER: AtomicU32 = AtomicU32::new(0); 37 | let mut node = Node { 38 | key_offset: 0, 39 | key_size: 0, 40 | height: 0, 41 | value: AtomicU64::new(0), 42 | tower: [TOWER; MAX_HEIGHT], 43 | }; 44 | for i in 0..MAX_HEIGHT { 45 | node.tower[i] = AtomicU32::new(0); 46 | } 47 | node 48 | } 49 | } 50 | 51 | impl Node { 52 | pub(crate) fn new<'a>( 53 | arena: &'a mut Arena, 54 | key: &'a [u8], 55 | v: &'a ValueStruct, 56 | height: isize, 57 | ) -> &'a mut Node { 58 | let key_offset = arena.put_key(key); 59 | let (value_offset, value_size) = arena.put_val(v); 60 | // The base level is already allocated in the node struct. 61 | let offset = arena.put_node(height); 62 | let node = arena.get_mut_node(offset as usize).unwrap(); 63 | // 1: storage key 64 | node.key_offset = key_offset; 65 | node.key_size = key.len() as u16; 66 | // 2: storage value 67 | node.value.store( 68 | Self::encode_value(value_offset, value_size), 69 | Ordering::Relaxed, 70 | ); 71 | node.height = height as u16; 72 | node 73 | } 74 | 75 | pub(crate) const fn size() -> usize { 76 | size_of::() 77 | } 78 | 79 | pub(crate) const fn align_size() -> usize { 80 | (size_of::() + PtrAlign) & !PtrAlign 81 | } 82 | 83 | pub(crate) fn set_value(&self, arena: &Arena, v: &ValueStruct) { 84 | let (value_offset, value_size) = arena.put_val(v); 85 | let value = Self::encode_value(value_offset, value_size as u16); 86 | self.value.store(value, Ordering::Relaxed); 87 | } 88 | 89 | pub(crate) fn get_value_offset(&self) -> (u32, u16) { 90 | let value = self.value.load(Ordering::Acquire); 91 | Self::decode_value(value) 92 | } 93 | 94 | pub(crate) fn key<'a>(&'a self, arena: &'a Arena) -> &'a [u8] { 95 | arena.get_key(self.key_offset, self.key_size) 96 | } 97 | 98 | pub(crate) fn get_next_offset(&self, h: usize) -> u32 { 99 | self.tower[h].load(Ordering::Acquire) 100 | } 101 | 102 | pub(crate) fn 
cas_next_offset(&self, h: usize, old: u32, val: u32) -> bool {
        // Compare-and-swap of the level-`h` tower entry from `old` to `val`; returns
        // whether the swap happened. Success uses AcqRel so that whatever was written
        // before the swap is visible to readers that Acquire-load the entry in
        // `get_next_offset`; a plain Acquire success ordering has no release effect.
        self.tower[h]
            .compare_exchange(old, val, Ordering::AcqRel, Ordering::Acquire)
            .is_ok()
    }

    /// Splits an encoded value into `(offset, size)`: offset in bits 0-31, size in bits 32-47.
    #[inline]
    fn decode_value(value: u64) -> (u32, u16) {
        let value_offset = value as u32;
        let value_size = (value >> 32) as u16;
        (value_offset, value_size)
    }

    /// Packs `value_offset` (low 32 bits) and `value_size` (bits 32-47) into one `u64`.
    #[inline]
    fn encode_value(value_offset: u32, value_size: u16) -> u64 {
        (u64::from(value_size) << 32) | u64::from(value_offset)
    }
}

--------------------------------------------------------------------------------
/src/st_manager.rs:
--------------------------------------------------------------------------------
use crate::options::Options;
use crate::SkipList;

use crossbeam_epoch::Shared;
use drop_cell::defer;
use log::info;
use parking_lot::lock_api::RwLockWriteGuard;
use parking_lot::RawRwLock;

use std::sync::atomic::{AtomicUsize, Ordering};
use std::sync::Arc;

/// An epoch-managed atomic pointer to a skip list.
type SkipListItem = crossbeam_epoch::Atomic<SkipList>;

/// Holds the current mutable skip list (`mt`) and the queue of immutable ones (`imm`).
pub struct SkipListManager {
    // TODO use it lock skip_list_manager
    share_lock: parking_lot::RwLock<()>,
    mt: Option<SkipListItem>,
    imm: Arc<parking_lot::RwLock<Vec<SkipListItem>>>,
    sz: Arc<AtomicUsize>,
    mt_seq: Arc<AtomicUsize>,
}

impl Default for SkipListManager {
    /// Builds a manager with no mutable skip list, an empty `imm` queue and zeroed counters.
    fn default() -> Self {
        SkipListManager {
            share_lock: parking_lot::RwLock::new(()),
            mt: None,
            imm: Arc::new(parking_lot::RwLock::new(vec![])),
            sz: Arc::new(AtomicUsize::new(0)),
            mt_seq: Arc::new(AtomicUsize::default()),
        }
    }
}

impl SkipListManager {
    /// Builds a manager whose mutable skip list is created with `SkipList::new(sz)`.
    pub fn new(sz: usize) -> SkipListManager {
        SkipListManager {
            share_lock: parking_lot::RwLock::new(()),
            mt: Some(SkipListItem::new(SkipList::new(sz))),
            imm: Arc::new(parking_lot::RwLock::new(vec![])),
            sz: Arc::new(AtomicUsize::new(sz)),
            mt_seq: Arc::new(AtomicUsize::new(0)),
        }
    }

    pub fn
take<'a>(&'a self, p: &'a crossbeam_epoch::Guard) -> Shared<'a, SkipList> {
        // Loads the current mutable skip list pointer with consume ordering.
        // Panics if `mt` is `None`. Does not take `share_lock`.
        self.mt.as_ref().unwrap().load_consume(p)
    }

    /// Returns the current mutable skip list pointer, valid while `p` stays pinned.
    ///
    /// # Panics
    /// Panics if `mt` is `None`.
    pub fn mt_ref<'a>(&'a self, p: &'a crossbeam_epoch::Guard) -> Shared<'a, SkipList> {
        // Acquire pairs with the Release store in `swap_st`: callers dereference the
        // pointer, so the pointee's initialisation must be visible to them.
        self.mt.as_ref().unwrap().load(Ordering::Acquire, p)
    }

    /// Returns a clone of the current mutable skip list.
    ///
    /// # Panics
    /// Panics if `mt` is `None` or the loaded pointer is null.
    pub fn mt_clone(&self) -> SkipList {
        let p = crossbeam_epoch::pin();
        let mt = self.mt_ref(&p);
        // SAFETY: the pointer was loaded under the pinned guard `p`, which is still alive.
        // NOTE(review): this also assumes the pointee is a valid, initialised `SkipList`
        // that is not freed outside epoch reclamation — confirm.
        unsafe { mt.as_ref().unwrap().clone() }
    }

    /// Takes the write lock on the immutable skip list queue and returns its guard.
    pub fn imm(&self) -> RwLockWriteGuard<'_, RawRwLock, Vec<SkipListItem>> {
        self.imm.write()
    }

    // TODO
    /// Pushes the current mutable skip list onto `imm`, installs a fresh one sized by
    /// `opt.arena_size()` and bumps `mt_seq`, all while holding `share_lock` exclusively.
    pub fn swap_st(&self, opt: Options) {
        self.lock_exclusive();
        defer! {self.unlock_exclusive()}
        let p = crossbeam_epoch::pin();
        let st = self.take(&p).into();
        self.imm.write().push(st);
        let st = SkipList::new(opt.arena_size() as usize);
        // Release publishes the fully constructed skip list together with its pointer.
        self.mt
            .as_ref()
            .unwrap()
            .store(crossbeam_epoch::Owned::new(st), Ordering::Release);
        self.mt_seq.fetch_add(1, Ordering::Relaxed);
    }

    pub fn advance_imm(&self, _mt: &SkipList) {
        self.lock_exclusive();
        defer!
{self.unlock_exclusive()}; 92 | info!( 93 | "advance im, mt_seq: {}", 94 | self.mt_seq.load(Ordering::Relaxed) 95 | ); 96 | let mut imm = self.imm(); 97 | // let first_imm = imm 98 | // .first() 99 | // .unwrap() 100 | // .load(Ordering::Relaxed, &crossbeam_epoch::pin()) 101 | // .as_raw(); 102 | // assert!(ptr::eq(first_imm, mt)); 103 | imm.remove(0); 104 | } 105 | 106 | pub fn lock_exclusive(&self) { 107 | use parking_lot::lock_api::RawRwLock; 108 | unsafe { self.share_lock.raw().lock_exclusive() } 109 | } 110 | 111 | pub fn unlock_exclusive(&self) { 112 | use parking_lot::lock_api::RawRwLock; 113 | unsafe { self.share_lock.raw().unlock_exclusive() } 114 | } 115 | } 116 | 117 | #[test] 118 | fn ti() {} 119 | -------------------------------------------------------------------------------- /src/table/builder.rs: -------------------------------------------------------------------------------- 1 | use crate::y::{hash, is_eof, Decode, Encode, ValueStruct}; 2 | use byteorder::{BigEndian, ReadBytesExt, WriteBytesExt}; 3 | use drop_cell::defer; 4 | use growable_bloom_filter::GrowableBloom; 5 | use log::{debug, info}; 6 | use serde_json; 7 | use std::hash::Hasher; 8 | use std::io::{Cursor, Read, Write}; 9 | use std::time::SystemTime; 10 | 11 | // TODO use simd 12 | #[derive(Clone, Default, Debug)] 13 | pub(crate) struct Header { 14 | pub(crate) p_len: u16, // Overlap with base key(Prefix length) 15 | pub(crate) k_len: u16, // Length of the diff. Eg: "d" = "abcd" - "abc" 16 | pub(crate) v_len: u16, // Length of the value. 17 | pub(crate) prev: u32, // Offset for the previous key-value pair. The offset is relative to `block` base offset. 
}

impl Header {
    /// Encoded size in bytes: three `u16` fields plus one `u32` field (2 + 2 + 2 + 4).
    pub(crate) const fn size() -> usize {
        10
    }

    /// True when both the prefix length and the diff length are zero.
    pub(crate) fn is_dummy(&self) -> bool {
        self.k_len == 0 && self.p_len == 0
    }
}

impl Decode for Header {
    /// Reads `p_len`, `k_len`, `v_len` and `prev` from `rd`, in that order, big-endian.
    fn dec(&mut self, rd: &mut dyn Read) -> crate::Result<()> {
        self.p_len = rd.read_u16::<BigEndian>()?;
        self.k_len = rd.read_u16::<BigEndian>()?;
        self.v_len = rd.read_u16::<BigEndian>()?;
        self.prev = rd.read_u32::<BigEndian>()?;
        Ok(())
    }
}

impl Encode for Header {
    /// Writes the four fields to `wt` big-endian and returns the number of bytes written.
    fn enc(&self, wt: &mut dyn Write) -> crate::Result<usize> {
        wt.write_u16::<BigEndian>(self.p_len)?;
        wt.write_u16::<BigEndian>(self.k_len)?;
        wt.write_u16::<BigEndian>(self.v_len)?;
        wt.write_u32::<BigEndian>(self.prev)?;
        Ok(Header::size())
    }
}

impl From<&[u8]> for Header {
    /// Decodes a header from the start of `buffer`.
    ///
    /// # Panics
    /// Panics if decoding fails, e.g. when `buffer` is shorter than `Header::size()`.
    fn from(buffer: &[u8]) -> Self {
        let mut header = Header::default();
        Decode::dec(&mut header, &mut Cursor::new(buffer)).unwrap();
        header
    }
}

// Implemented as `From` so that the standard blanket impl still provides
// `Into<Vec<u8>> for Header`; existing `header.into()` call sites are unaffected.
impl From<Header> for Vec<u8> {
    /// Encodes `header` into a new `Header::size()`-byte vector.
    fn from(header: Header) -> Self {
        let mut wt = Cursor::new(vec![0u8; Header::size()]);
        // Writing into an in-memory buffer of exactly the encoded size.
        Encode::enc(&header, &mut wt).unwrap();
        wt.into_inner()
    }
}

// Used in building a table.
pub struct Builder {
    counter: usize,       // Number of keys written for the current block.
    buf: Cursor<Vec<u8>>, // bytes buffer
    base_key: Vec<u8>,    // Base key for the current block.
    base_offset: u32,     // Offset for the current block.
    restarts: Vec<u32>,   // Base offsets of every block.
    prev_offset: u32, // Tracks offset for the previous key-value-pair. Offset is relative to block base offset.
    key_buf: Cursor<Vec<u8>>,
    key_count: u32,
}

impl Builder {
    // the max keys number of every block.
    pub(crate) const RESTART_INTERVAL: usize = 100;

    /// True when the write position of `buf` is still 0.
    pub(crate) fn is_zero_bytes(&self) -> bool {
        self.buf.position() == 0
    }

    /// Returns a suffix of new_key that is different from b.base_key.
fn key_diff<'a>(new_key: &'a [u8], key: &'a [u8]) -> &'a [u8] {
        // Count the leading bytes the two keys share; `zip` stops at the shorter key.
        let shared = new_key
            .iter()
            .zip(key)
            .take_while(|(a, b)| a == b)
            .count();
        &new_key[shared..]
    }

    fn add_helper(&mut self, key: &[u8], v: &ValueStruct) {
        // Add key to bloom filter.
        self.key_buf
            .write_u16::<BigEndian>(key.len() as u16)
            .unwrap();
        self.key_buf.write_all(key).unwrap();
        self.key_count += 1;

        // diff_key stores the difference of key with base_key.
        let diff_key;
        if self.base_key.is_empty() {
            // Make a copy. Builder should not keep references. Otherwise, caller has to be very careful
            // and will have to make copies of keys every time they add to builder. which is even worse.
            self.base_key.clear();
            self.base_key.extend_from_slice(key);
            diff_key = key;
        } else {
            diff_key = Self::key_diff(key, self.base_key.as_slice());
        }
        let h = Header {
            p_len: (key.len() - diff_key.len()) as u16,
            k_len: diff_key.len() as u16,
            v_len: (v.value.len() + ValueStruct::header_size()) as u16,
            prev: self.prev_offset, // prevOffset is the location of the last key-value added.
        };
        // Remember current offset for the next Add call.
        self.prev_offset = self.buf.get_ref().len() as u32 - self.base_offset;

        // Layout: header, diff_key, value.
        self.buf
            .write_all(