├── .github
└── workflows
│ └── rust.yml
├── .gitignore
├── .idea
├── .gitignore
├── badger-rs.iml
├── modules.xml
└── vcs.xml
├── Cargo.toml
├── LICENSE
├── README.md
├── benches
└── my_benchmark.rs
├── build.rs
├── examples
└── badger.rs
├── lock.txt
└── src
├── backup.rs
├── compaction.rs
├── doc
└── write.md
├── event
└── mod.rs
├── iterator.rs
├── kv.rs
├── kv_test.rs
├── level_handler.rs
├── levels.rs
├── lib.rs
├── lock.txt
├── log_file.rs
├── manifest.rs
├── options
└── mod.rs
├── pb
├── backup.proto
├── backup.rs
├── badgerpb3.proto
├── badgerpb3.rs
└── mod.rs
├── skl
├── alloc.rs
├── arena.rs
├── cursor.rs
├── mod.rs
├── node.rs
└── skip.rs
├── st_manager.rs
├── table
├── builder.rs
├── iterator.rs
├── mod.rs
├── table.rs
└── tests.rs
├── test_data
└── vlog_file.text
├── test_util.rs
├── types.rs
├── value_log.rs
├── value_log_tests.rs
└── y
├── codec.rs
├── iterator.rs
├── merge_iterator.rs
├── metrics.rs
└── mod.rs
/.github/workflows/rust.yml:
--------------------------------------------------------------------------------
1 | name: Rust
2 |
3 | on:
4 | push:
5 | branches: [ "main" ]
6 | pull_request:
7 | branches: [ "main" ]
8 |
9 | env:
10 | CARGO_TERM_COLOR: always
11 |
12 | jobs:
13 | build:
14 |
15 | runs-on: ubuntu-latest
16 |
17 | steps:
18 | - uses: actions/checkout@v3
19 | - name: Install latest nightly
20 | uses: actions-rs/toolchain@v1
21 | with:
22 | toolchain: nightly
23 | override: true
24 | - name: Build
25 | run: cargo build --verbose
26 | - name: Run tests
27 | run: cargo test --verbose
28 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Generated by Cargo
2 | # will have compiled files and executables
3 | /target/
4 |
5 | # Remove Cargo.lock from gitignore if creating an executable, leave it for libraries
6 | # More information here https://doc.rust-lang.org/cargo/guide/cargo-toml-vs-cargo-lock.html
7 | Cargo.lock
8 |
9 | # These are backup files generated by rustfmt
10 | **/*.rs.bk
11 |
--------------------------------------------------------------------------------
/.idea/.gitignore:
--------------------------------------------------------------------------------
1 | # Default ignored files
2 | /shelf/
3 | /workspace.xml
4 | # Editor-based HTTP Client requests
5 | /httpRequests/
6 | # Datasource local storage ignored files
7 | /dataSources/
8 | /dataSources.local.xml
9 |
--------------------------------------------------------------------------------
/.idea/badger-rs.iml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
--------------------------------------------------------------------------------
/.idea/modules.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
--------------------------------------------------------------------------------
/.idea/vcs.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
--------------------------------------------------------------------------------
/Cargo.toml:
--------------------------------------------------------------------------------
1 | [package]
2 | name = "badger-rs"
3 | version = "0.1.0"
4 | edition = "2021"
5 |
6 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
7 |
8 | [dependencies]
9 | serde = { version = "1.0.171", features = ["derive"] }
10 | serde_json = { version = "1.0.103", default-features = true, features = ["alloc"] }
11 | anyhow = "1.0.72"
12 | thiserror = "1.0.43"
13 | tokio = { version = "1.29.1", features = ["full", "tracing"] }
14 | byteorder = "1.4.3"
15 | rand = "0.8.5"
16 | maligned = "0.2.1"
17 | atomic = "0.5.3"
18 | tabled = { version = "0.12.2", features = ["ansi-str", "color"] }
19 | memmap = "0.7.0"
20 | bytes = "1.4.0"
21 | bloom = "0.3.2"
22 | growable-bloom-filter = { version = "2.0.1", features = ["nightly"] }
23 | filename = "0.1.1"
24 | num_cpus = "1.16.0"
25 | threads_pool = "0.2.6"
26 | crc32fast = "1.3.2"
27 | async-trait = "0.1.71"
28 | fmmap = { version = "0.3.2", features = ["tokio-async"] }
29 | parking_lot = "0.12.1"
30 | bitflags = "2.3.3"
31 | libc = "0.2.147"
32 | log = { version = "0.4.19", features = ["kv_unstable", "kv_unstable_serde", "kv_unstable_sval"] }
33 | async-channel = "1.9.0"
34 | file-guard = "0.1.0"
35 | fs2 = "0.4.3"
36 | awaitgroup = "0.7.0"
37 | range-lock = "0.2.3"
38 | tracing = "0.1.37"
39 | drop_cell = "0.0.0"
40 | walkdir = "2.3.3"
41 | crossbeam-epoch = "0.9.15"
42 | tokio-context = "0.1.3"
43 | dyn-clone = "1.0.12"
44 | eieio = "1.0.0"
45 | either = "1.8.1"
46 | enum-unitary = "0.5.0"
47 | atom_box = "0.1.2"
48 | console-subscriber = "0.1.10"
49 | uuid = { version = "1.4.1", features = ["v5", "v4"] }
50 | winapi = "0.3.9"
51 | itertools = "0.11.0"
52 | tokio-metrics = "0.2.2"
53 | metrics = "0.21.1"
54 | metrics-prometheus = "0.4.1"
55 | prometheus = "0.13.3"
56 | lazy_static = "1.4.0"
57 | getset = "0.1.2"
58 | tokio-stream = "0.1.14"
59 | async-stream = "0.3.5"
60 | futures-core = "0.3.28"
61 | backtrace-on-stack-overflow = "0.3.0"
62 | protobuf = { version = "3.0.0-alpha.2", features = ["with-bytes"] }
63 | [dev-dependencies]
64 | tracing-subscriber = "0.3.17"
65 | tracing-log = "0.1.3"
66 | chrono = "0.4.26"
67 | env_logger = "0.10.0"
68 | console_log = { version = "1.0.0", features = ["color"] }
69 | itertools = "0.11.0"
70 | tokio-metrics = { version = "0.2.2", default-features = false }
71 | tokio = { version = "1.29.1", features = ["full", "rt", "time", "macros", "test-util"] }
72 | criterion = { version = "0.5.1", features = ["tokio"] }
73 |
74 | [build]
75 | rustflags = ["--cfg", "tokio_unstable"]
76 |
77 | [build-dependencies]
78 | protoc-rust = "3.0.0-alpha.2"
79 |
80 | [[bench]]
81 | name = "my_benchmark"
82 | harness = false
83 |
84 | [profile.dev]
85 | debug-assertions = false
86 |
87 | [profile.release]
88 | codegen-units=1
89 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Apache License
2 | Version 2.0, January 2004
3 | http://www.apache.org/licenses/
4 |
5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 |
7 | 1. Definitions.
8 |
9 | "License" shall mean the terms and conditions for use, reproduction,
10 | and distribution as defined by Sections 1 through 9 of this document.
11 |
12 | "Licensor" shall mean the copyright owner or entity authorized by
13 | the copyright owner that is granting the License.
14 |
15 | "Legal Entity" shall mean the union of the acting entity and all
16 | other entities that control, are controlled by, or are under common
17 | control with that entity. For the purposes of this definition,
18 | "control" means (i) the power, direct or indirect, to cause the
19 | direction or management of such entity, whether by contract or
20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
21 | outstanding shares, or (iii) beneficial ownership of such entity.
22 |
23 | "You" (or "Your") shall mean an individual or Legal Entity
24 | exercising permissions granted by this License.
25 |
26 | "Source" form shall mean the preferred form for making modifications,
27 | including but not limited to software source code, documentation
28 | source, and configuration files.
29 |
30 | "Object" form shall mean any form resulting from mechanical
31 | transformation or translation of a Source form, including but
32 | not limited to compiled object code, generated documentation,
33 | and conversions to other media types.
34 |
35 | "Work" shall mean the work of authorship, whether in Source or
36 | Object form, made available under the License, as indicated by a
37 | copyright notice that is included in or attached to the work
38 | (an example is provided in the Appendix below).
39 |
40 | "Derivative Works" shall mean any work, whether in Source or Object
41 | form, that is based on (or derived from) the Work and for which the
42 | editorial revisions, annotations, elaborations, or other modifications
43 | represent, as a whole, an original work of authorship. For the purposes
44 | of this License, Derivative Works shall not include works that remain
45 | separable from, or merely link (or bind by name) to the interfaces of,
46 | the Work and Derivative Works thereof.
47 |
48 | "Contribution" shall mean any work of authorship, including
49 | the original version of the Work and any modifications or additions
50 | to that Work or Derivative Works thereof, that is intentionally
51 | submitted to Licensor for inclusion in the Work by the copyright owner
52 | or by an individual or Legal Entity authorized to submit on behalf of
53 | the copyright owner. For the purposes of this definition, "submitted"
54 | means any form of electronic, verbal, or written communication sent
55 | to the Licensor or its representatives, including but not limited to
56 | communication on electronic mailing lists, source code control systems,
57 | and issue tracking systems that are managed by, or on behalf of, the
58 | Licensor for the purpose of discussing and improving the Work, but
59 | excluding communication that is conspicuously marked or otherwise
60 | designated in writing by the copyright owner as "Not a Contribution."
61 |
62 | "Contributor" shall mean Licensor and any individual or Legal Entity
63 | on behalf of whom a Contribution has been received by Licensor and
64 | subsequently incorporated within the Work.
65 |
66 | 2. Grant of Copyright License. Subject to the terms and conditions of
67 | this License, each Contributor hereby grants to You a perpetual,
68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69 | copyright license to reproduce, prepare Derivative Works of,
70 | publicly display, publicly perform, sublicense, and distribute the
71 | Work and such Derivative Works in Source or Object form.
72 |
73 | 3. Grant of Patent License. Subject to the terms and conditions of
74 | this License, each Contributor hereby grants to You a perpetual,
75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76 | (except as stated in this section) patent license to make, have made,
77 | use, offer to sell, sell, import, and otherwise transfer the Work,
78 | where such license applies only to those patent claims licensable
79 | by such Contributor that are necessarily infringed by their
80 | Contribution(s) alone or by combination of their Contribution(s)
81 | with the Work to which such Contribution(s) was submitted. If You
82 | institute patent litigation against any entity (including a
83 | cross-claim or counterclaim in a lawsuit) alleging that the Work
84 | or a Contribution incorporated within the Work constitutes direct
85 | or contributory patent infringement, then any patent licenses
86 | granted to You under this License for that Work shall terminate
87 | as of the date such litigation is filed.
88 |
89 | 4. Redistribution. You may reproduce and distribute copies of the
90 | Work or Derivative Works thereof in any medium, with or without
91 | modifications, and in Source or Object form, provided that You
92 | meet the following conditions:
93 |
94 | (a) You must give any other recipients of the Work or
95 | Derivative Works a copy of this License; and
96 |
97 | (b) You must cause any modified files to carry prominent notices
98 | stating that You changed the files; and
99 |
100 | (c) You must retain, in the Source form of any Derivative Works
101 | that You distribute, all copyright, patent, trademark, and
102 | attribution notices from the Source form of the Work,
103 | excluding those notices that do not pertain to any part of
104 | the Derivative Works; and
105 |
106 | (d) If the Work includes a "NOTICE" text file as part of its
107 | distribution, then any Derivative Works that You distribute must
108 | include a readable copy of the attribution notices contained
109 | within such NOTICE file, excluding those notices that do not
110 | pertain to any part of the Derivative Works, in at least one
111 | of the following places: within a NOTICE text file distributed
112 | as part of the Derivative Works; within the Source form or
113 | documentation, if provided along with the Derivative Works; or,
114 | within a display generated by the Derivative Works, if and
115 | wherever such third-party notices normally appear. The contents
116 | of the NOTICE file are for informational purposes only and
117 | do not modify the License. You may add Your own attribution
118 | notices within Derivative Works that You distribute, alongside
119 | or as an addendum to the NOTICE text from the Work, provided
120 | that such additional attribution notices cannot be construed
121 | as modifying the License.
122 |
123 | You may add Your own copyright statement to Your modifications and
124 | may provide additional or different license terms and conditions
125 | for use, reproduction, or distribution of Your modifications, or
126 | for any such Derivative Works as a whole, provided Your use,
127 | reproduction, and distribution of the Work otherwise complies with
128 | the conditions stated in this License.
129 |
130 | 5. Submission of Contributions. Unless You explicitly state otherwise,
131 | any Contribution intentionally submitted for inclusion in the Work
132 | by You to the Licensor shall be under the terms and conditions of
133 | this License, without any additional terms or conditions.
134 | Notwithstanding the above, nothing herein shall supersede or modify
135 | the terms of any separate license agreement you may have executed
136 | with Licensor regarding such Contributions.
137 |
138 | 6. Trademarks. This License does not grant permission to use the trade
139 | names, trademarks, service marks, or product names of the Licensor,
140 | except as required for reasonable and customary use in describing the
141 | origin of the Work and reproducing the content of the NOTICE file.
142 |
143 | 7. Disclaimer of Warranty. Unless required by applicable law or
144 | agreed to in writing, Licensor provides the Work (and each
145 | Contributor provides its Contributions) on an "AS IS" BASIS,
146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 | implied, including, without limitation, any warranties or conditions
148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 | PARTICULAR PURPOSE. You are solely responsible for determining the
150 | appropriateness of using or redistributing the Work and assume any
151 | risks associated with Your exercise of permissions under this License.
152 |
153 | 8. Limitation of Liability. In no event and under no legal theory,
154 | whether in tort (including negligence), contract, or otherwise,
155 | unless required by applicable law (such as deliberate and grossly
156 | negligent acts) or agreed to in writing, shall any Contributor be
157 | liable to You for damages, including any direct, indirect, special,
158 | incidental, or consequential damages of any character arising as a
159 | result of this License or out of the use or inability to use the
160 | Work (including but not limited to damages for loss of goodwill,
161 | work stoppage, computer failure or malfunction, or any and all
162 | other commercial damages or losses), even if such Contributor
163 | has been advised of the possibility of such damages.
164 |
165 | 9. Accepting Warranty or Additional Liability. While redistributing
166 | the Work or Derivative Works thereof, You may choose to offer,
167 | and charge a fee for, acceptance of support, warranty, indemnity,
168 | or other liability obligations and/or rights consistent with this
169 | License. However, in accepting such obligations, You may act only
170 | on Your own behalf and on Your sole responsibility, not on behalf
171 | of any other Contributor, and only if You agree to indemnify,
172 | defend, and hold each Contributor harmless for any liability
173 | incurred by, or claims asserted against, such Contributor by reason
174 | of your accepting any such warranty or additional liability.
175 |
176 | END OF TERMS AND CONDITIONS
177 |
178 | APPENDIX: How to apply the Apache License to your work.
179 |
180 | To apply the Apache License to your work, attach the following
181 | boilerplate notice, with the fields enclosed by brackets "[]"
182 | replaced with your own identifying information. (Don't include
183 | the brackets!) The text should be enclosed in the appropriate
184 | comment syntax for the file format. We also recommend that a
185 | file or class name and description of purpose be included on the
186 | same "printed page" as the copyright notice for easier
187 | identification within third-party archives.
188 |
189 | Copyright [yyyy] [name of copyright owner]
190 |
191 | Licensed under the Apache License, Version 2.0 (the "License");
192 | you may not use this file except in compliance with the License.
193 | You may obtain a copy of the License at
194 |
195 | http://www.apache.org/licenses/LICENSE-2.0
196 |
197 | Unless required by applicable law or agreed to in writing, software
198 | distributed under the License is distributed on an "AS IS" BASIS,
199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 | See the License for the specific language governing permissions and
201 | limitations under the License.
202 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # badger-rs
2 | 
3 |
4 | Badger is based on [WiscKey paper by University of Wisconsin, Madison](https://www.usenix.org/system/files/conference/fast16/fast16-papers-lu.pdf).
5 |
6 | Thanks to [dgraph-io/badger](https://github.com/dgraph-io/badger).
--------------------------------------------------------------------------------
/benches/my_benchmark.rs:
--------------------------------------------------------------------------------
1 | #![allow(unused)]
2 | fn main() {
3 | use criterion::BenchmarkId;
4 | use criterion::Criterion;
5 | use criterion::{criterion_group, criterion_main};
6 |
7 | // This is a struct that tells Criterion.rs to use the "futures" crate's current-thread executor
8 | use criterion::async_executor::AsyncExecutor;
9 |
10 | // Here we have an async function to benchmark
11 | async fn do_something(size: usize) {
12 | // Do something async with the size
13 | }
14 |
15 | fn from_elem(c: &mut Criterion) {
16 | let size: usize = 1024;
17 |
18 | c.bench_with_input(BenchmarkId::new("input_example", size), &size, |b, &s| {
19 | // Insert a call to `to_async` to convert the bencher to async mode.
20 | // The timing loops are the same as with the normal bencher.
21 | // b.to_async(FuturesExecutor).iter(|| do_something(s));
22 | });
23 | }
24 |
25 | criterion_group!(benches, from_elem);
26 | criterion_main!(benches);
27 | }
28 |
--------------------------------------------------------------------------------
/build.rs:
--------------------------------------------------------------------------------
1 | extern crate protoc_rust;
2 |
// Build-script entry point. It currently does nothing: the protobuf code generation
// below is commented out.
fn main() {
    // Equivalent command line: protoc --rust_out=src/pb src/pb/badgerpb3.proto
    //
    // protoc_rust::Codegen::new()
    //     .out_dir("src/pb")
    //     .inputs(&["src/pb/badgerpb3.proto", "src/pb/backup.proto"])
    //     .run()
    //     .expect("Running protoc failed");
}
11 |
--------------------------------------------------------------------------------
/examples/badger.rs:
--------------------------------------------------------------------------------
1 | #[tokio::main]
2 | async fn main() {
3 | let env = tracing_subscriber::EnvFilter::from_default_env();
4 | tracing_subscriber::FmtSubscriber::builder()
5 | .with_env_filter(env)
6 | .try_init()
7 | .unwrap();
8 | let opt = badger_rs::Options::default();
9 | let kv = badger_rs::KV::open(opt).await.unwrap();
10 | kv.set(
11 | b"hello word".to_vec(),
12 | b">>>>>I LOVE YOU!<<<<<".to_vec(),
13 | 0x0,
14 | )
15 | .await
16 | .unwrap();
17 |
18 | let got = kv.get(b"hello word").await.unwrap();
19 | println!("{}", String::from_utf8_lossy(&got));
20 | }
21 |
--------------------------------------------------------------------------------
/lock.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/laohanlinux/badger-rs/00bbe70da1f4b0fe7d52ffbdf8da91867a147834/lock.txt
--------------------------------------------------------------------------------
/src/backup.rs:
--------------------------------------------------------------------------------
1 | use std::io::Write;
2 | use byteorder::{LittleEndian, WriteBytesExt};
3 | use protobuf::Message;
4 | use crate::pb::backup::KVPair;
5 |
6 | pub fn write_to(entry: &KVPair, wt: &mut W) -> crate::Result<()> where W: Write {
7 | let buf = entry.write_to_bytes().unwrap();
8 | wt.write_u64::(buf.len() as u64)?;
9 | wt.write_all(&buf)?;
10 | Ok(())
11 | }
12 |
--------------------------------------------------------------------------------
/src/compaction.rs:
--------------------------------------------------------------------------------
1 | use crate::hex_str;
2 | use crate::levels::CompactDef;
3 | use crate::table::table::Table;
4 |
5 | use log::{error, info, warn};
6 | use parking_lot::lock_api::{RwLockReadGuard, RwLockWriteGuard};
7 | use parking_lot::{RawRwLock, RwLock};
8 | use std::fmt::{Display, Formatter};
9 | use std::sync::atomic::{AtomicU64, Ordering};
10 | use std::sync::Arc;
11 |
12 | #[derive(Debug)]
13 | pub(crate) struct CompactStatus {
14 | // every level has a *CompactionStatus* that includes multipart *KeyRange*
15 | pub(crate) levels: RwLock>,
16 | }
17 |
18 | impl Default for CompactStatus {
19 | fn default() -> Self {
20 | CompactStatus {
21 | levels: RwLock::new(vec![]),
22 | }
23 | }
24 | }
25 |
26 | impl CompactStatus {
27 | // Check whether we can run this *CompactDef*. That it doesn't overlap with any
28 | // other running Compaction. If it can be run, it would store this run in the compactStatus state.
29 | pub(crate) fn compare_and_add(&self, cd: &CompactDef) -> bool {
30 | let level = cd.this_level.level();
31 | assert!(
32 | level + 1 < self.rl().len(),
33 | "Got level {}, max level {}",
34 | level,
35 | self.rl().len()
36 | );
37 | let lc = self.rl();
38 | let this_level = lc.get(level).unwrap();
39 | let next_level = lc.get(level + 1).unwrap();
40 | if this_level.overlaps_with(&cd.this_range) {
41 | return false;
42 | }
43 | if next_level.overlaps_with(&cd.next_range) {
44 | return false;
45 | }
46 |
47 | // Check whether this level really needs compaction or not. Otherwise, we'll end up
48 | // running parallel compactions for the same level.
49 | // *NOTE*: We can directly call this_level.total_size, because we already have acquired a read lock
50 | // over this and the next level.
51 | if cd.this_level.get_total_size() - this_level.get_del_size()
52 | < cd.this_level.get_max_total_size()
53 | {
54 | log::info!(
55 | "skip the compaction, top_size:{}, bot_size:{}, max_size:{}",
56 | cd.this_level.get_total_size(),
57 | cd.next_level.get_total_size(),
58 | cd.this_level.get_max_total_size()
59 | );
60 | return false;
61 | }
62 | this_level.add(cd.this_range.clone());
63 | next_level.add(cd.next_range.clone());
64 | this_level.incr_del_size(cd.this_size.load(Ordering::Relaxed));
65 | true
66 | }
67 |
68 | // Delete CompactDef.
69 | pub(crate) fn delete(&self, cd: &CompactDef) {
70 | let levels = self.wl();
71 | let level = cd.this_level.level();
72 | assert!(
73 | level < levels.len() - 1,
74 | "Got level {}, Max levels {}",
75 | level,
76 | levels.len()
77 | );
78 |
79 | let this_level = levels.get(level).unwrap();
80 | let next_level = levels.get(level + 1).unwrap();
81 | // Decr delete size after compacted.
82 | this_level.decr_del_size(cd.this_size.load(Ordering::Relaxed));
83 | let mut found = this_level.remove(&cd.this_range);
84 | // top level must have KeyRange because it is compact's base condition
85 | assert!(found, "{}", this_level);
86 | found = next_level.remove(&cd.next_range) && found;
87 | if !found {
88 | let this_kr = &cd.this_range;
89 | let next_kr = &cd.next_range;
90 | warn!("Looking for: [{}] in this level.", this_kr,);
91 | warn!("This Level: {}", level);
92 | warn!("Looking for: [{}] in next level.", next_kr);
93 | warn!("Next Level: {}", level + 1);
94 | warn!("KeyRange not found");
95 | warn!("Looking for seek k range");
96 | warn!("{}, {}", cd.this_range, cd.next_range);
97 | }
98 | }
99 |
100 | // Return trur if the level overlap with this, otherwise false
101 | pub(crate) fn overlaps_with(&self, level: usize, this: &KeyRange) -> bool {
102 | let cstatus = &self.rl()[level];
103 | let overlaps = cstatus.overlaps_with(this);
104 | #[cfg(test)]
105 | log::info!(
106 | "level{} compact status compare, {:?}, dst: {:?}, overlaps: {}",
107 | level,
108 | cstatus.rl(),
109 | this,
110 | overlaps
111 | );
112 | overlaps
113 | }
114 |
115 | // Return level's deleted data count
116 | pub(crate) fn del_size(&self, level: usize) -> u64 {
117 | self.rl()[level].get_del_size()
118 | }
119 |
120 | // Return Level's compaction status with *WriteLockGuard*
121 | pub(crate) fn wl(&self) -> RwLockWriteGuard<'_, RawRwLock, Vec> {
122 | self.levels.write()
123 | }
124 |
125 | // Return Level's compaction status with *ReadLockGuard*
126 | pub(crate) fn rl(&self) -> RwLockReadGuard<'_, RawRwLock, Vec> {
127 | self.levels.read()
128 | }
129 |
130 | pub(crate) fn to_log(&self) {
131 | let status = self.rl();
132 | info!("Compact levels, count:{}", status.len());
133 | for level in status.iter().enumerate() {
134 | info!("[{}] {}", level.0, level.1.to_string())
135 | }
136 | }
137 | }
138 |
139 | // Every level compacted status(ranges).
140 | // del_size: all KeyRange size at the level (NOTE: equal LevelCompactStatus.ranges delete size, so after compacting,
141 | // KeyRange and del_size all be decr)
142 | #[derive(Clone, Debug)]
143 | pub(crate) struct LevelCompactStatus {
144 | ranges: Arc>>,
145 | del_size: Arc,
146 | }
147 |
148 | impl Default for LevelCompactStatus {
149 | fn default() -> Self {
150 | LevelCompactStatus {
151 | ranges: Arc::new(RwLock::new(Vec::new())),
152 | del_size: Arc::new(AtomicU64::new(0)),
153 | }
154 | }
155 | }
156 |
157 | impl Display for LevelCompactStatus {
158 | fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
159 | let ranges = self
160 | .rl()
161 | .iter()
162 | .map(|kr| kr.to_string())
163 | .collect::>()
164 | .join(",");
165 | let del_size = self.get_del_size();
166 | f.debug_struct("LevelCompactStatus")
167 | .field("ranges", &ranges)
168 | .field("del_size", &del_size)
169 | .finish()
170 | }
171 | }
172 |
173 | impl LevelCompactStatus {
174 | // returns true if self.ranges and dst has overlap, otherwise returns false
175 | fn overlaps_with(&self, dst: &KeyRange) -> bool {
176 | self.rl().iter().any(|kr| kr.overlaps_with(dst))
177 | }
178 |
179 | // remove dst from self.ranges
180 | pub(crate) fn remove(&self, dst: &KeyRange) -> bool {
181 | let mut rlock = self.wl();
182 | let len = rlock.len();
183 | rlock.retain(|r| r != dst);
184 | len > rlock.len()
185 | }
186 |
187 | // add dst range
188 | fn add(&self, dst: KeyRange) {
189 | self.wl().push(dst);
190 | }
191 |
192 | pub(crate) fn get_del_size(&self) -> u64 {
193 | self.del_size.load(Ordering::Acquire)
194 | }
195 |
196 | fn incr_del_size(&self, n: u64) {
197 | self.del_size.fetch_add(n, Ordering::Release);
198 | }
199 |
200 | fn decr_del_size(&self, n: u64) {
201 | self.del_size.fetch_sub(n, Ordering::Release);
202 | }
203 |
204 | fn wl(&self) -> RwLockWriteGuard<'_, RawRwLock, Vec> {
205 | self.ranges.write()
206 | }
207 |
208 | fn rl(&self) -> RwLockReadGuard<'_, RawRwLock, Vec> {
209 | self.ranges.read()
210 | }
211 | }
212 |
// Closed key interval [left, right]. When `inf` is `true` the range is treated as
// covering all keys.
// NOTE(review): the `u8` element type is reconstructed from usage (`hex_str`, `to_vec`) — confirm.
#[derive(Clone, Default, Debug)]
pub(crate) struct KeyRange {
    pub(crate) left: Vec<u8>,
    // TODO zero Copy
    pub(crate) right: Vec<u8>,
    pub(crate) inf: bool,
}
221 |
222 | impl PartialEq for KeyRange {
223 | fn eq(&self, other: &Self) -> bool {
224 | self.equals(other)
225 | }
226 | }
227 |
228 | impl Display for KeyRange {
229 | fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
230 | write!(
231 | f,
232 | "",
233 | hex_str(&self.left),
234 | hex_str(&self.right),
235 | self.inf
236 | )
237 | }
238 | }
239 |
240 | // Including all keys
241 | pub(crate) const INFO_RANGE: KeyRange = KeyRange {
242 | left: vec![],
243 | right: vec![],
244 | inf: true,
245 | };
246 |
247 | impl KeyRange {
248 | // Get the KeyRange of tables
249 | pub(crate) fn get_range(tables: &Vec
) -> KeyRange {
250 | assert!(!tables.is_empty());
251 | let mut smallest = tables[0].smallest();
252 | let mut biggest = tables[0].biggest();
253 | for i in 1..tables.len() {
254 | if tables[i].smallest() < smallest {
255 | smallest = tables[i].smallest();
256 | }
257 | if tables[i].biggest() > biggest {
258 | biggest = tables[i].biggest();
259 | }
260 | }
261 | KeyRange {
262 | left: smallest.to_vec(),
263 | right: biggest.to_vec(),
264 | inf: false,
265 | }
266 | }
267 |
268 | // Left, right, inf all same, indicate equal
269 | pub(crate) fn equals(&self, other: &KeyRange) -> bool {
270 | self.left == other.left && self.right == self.right && self.inf == self.inf
271 | }
272 |
273 | // Check for overlap, *Notice*, if a and b are all inf, indicate has overlap.
274 | pub(crate) fn overlaps_with(&self, other: &KeyRange) -> bool {
275 | if self.inf || other.inf {
276 | return true;
277 | }
278 |
279 | // ---[other_left, other_right]--[]
280 | if self.left > other.right {
281 | return false;
282 | }
283 | // ---[]--[other-left, other-right]
284 | if self.right < other.left {
285 | return false;
286 | }
287 | true
288 | }
289 | }
290 |
// Compiled only for test builds, so the imports below are not flagged as unused otherwise.
#[cfg(test)]
mod tests {
    use crate::compaction::{KeyRange, INFO_RANGE};

    #[test]
    fn key_range() {
        // A range built by hand with `inf` set must compare equal to INFO_RANGE,
        // so `retain` removes it.
        let mut v = vec![KeyRange {
            left: vec![],
            right: vec![],
            inf: true,
        }];
        let cd = INFO_RANGE;
        v.retain(|kr| kr != &cd);
        assert!(v.is_empty());

        // Sorted, non-overlapping [start, end] intervals.
        let tests = vec![vec![2, 20], vec![30, 50], vec![70, 80]];

        let inputs = vec![
            vec![0, 1],
            vec![81, 100],
            vec![21, 25],
            vec![29, 40],
            vec![40, 60],
            vec![21, 51],
            vec![21, 100],
            vec![0, 200],
            vec![0, 70],
            vec![70, 80],
        ];

        // Prints the binary-search positions for each input; nothing is asserted here.
        for (i, arg) in inputs.iter().enumerate() {
            let left = tests.binary_search_by(|probe| probe[1].cmp(&arg[0]));
            let left = left.unwrap_or_else(|n| n);
            let right = tests.binary_search_by(|probe| probe[0].cmp(&arg[1]));
            let right = right.map(|n| n + 1).unwrap_or_else(|n| n);
            println!("{}, {:?}, {:?}", i, left, right);
        }
    }
}
328 |
--------------------------------------------------------------------------------
/src/doc/write.md:
--------------------------------------------------------------------------------
1 | Put Key
2 |
3 | ```mermaid
4 | %% Example of sequence diagram
5 | sequenceDiagram
6 | actor KV
7 | participant WriteCh
8 | actor FlushCh
9 | KV-->>WriteCh: Async Send Req
10 | activate WriteCh
11 | alt Inner Data Transfer
12 | WriteCh-->>WriteCh: 1. Call writeRequests[Mult Reqs]
13 | WriteCh -->>WriteCh: 2. Write Into Vlog, Fill Ptrs
14 | WriteCh -)WriteCh: 3. Check ensureRoomForWrite
15 | WriteCh -->>FlushCh: 4. Send flushTask{s.mt, s.vptr} to FlushCh
16 | Note right of WriteCh: 1) vlog.sync(): Ensure value log is synced to disk so this memtable's contents wouldn't be lost. 2) s.imm = append(s.imm, s.mt): We manage to push this task. Let's modify imm. 3) s.mt = skl.NewSkiplist(arenaSize(&s.opt)): New memtable is empty. We certainly have room.
17 | WriteCh -->>WriteCh: 5. If not pass 3, writeToLSM
18 | WriteCh-->>WriteCh: 6. updateOffset [update latest Ptr]
19 | end
20 | WriteCh-->> KV: Async Return Req
21 | deactivate WriteCh
22 | activate FlushCh
23 | FlushCh -->> FlushCh: Receive FlushTask From 4
24 | FlushCh -->> FlushCh: ft.mt is nil ? and ft.vptr.IsZero()? Put Offset for replay
25 | FlushCh -->> FlushCh: Create a new table, writeLevel0Table and addLevel0Table
26 | deactivate FlushCh
27 | ```
28 |
29 |
--------------------------------------------------------------------------------
/src/event/mod.rs:
--------------------------------------------------------------------------------
1 | use crate::table::table::Table;
2 | use lazy_static::lazy_static;
3 | use prometheus::{Gauge, IntCounter, IntCounterVec, IntGauge, IntGaugeVec, Opts, Registry};
4 | use std::fmt;
5 | use std::fmt::Formatter;
6 | use std::time::{Duration, Instant};
7 |
8 | lazy_static! {
9 | static ref EV: EvMetrics = EvMetrics {
10 | lsm_size: IntGaugeVec::new(
11 | prometheus::Opts::new("badger_lsm_size_bytes", "lsm size bytes by direct"),
12 | &["direct"]
13 | )
14 | .unwrap(),
15 | vlog_size: IntGauge::new("vlog_size", "vlog size bytes").unwrap(),
16 | pending_writes: IntGauge::new("pending_writes_total", "pending writes total").unwrap(),
17 | num_reads: IntCounter::new("num_reads", "number of reads").unwrap(),
18 | num_writes: IntCounter::new("num_writes", "number of writes").unwrap(),
19 | num_bytes_read: IntCounter::new("num_bytes_read", "bytes of read").unwrap(),
20 | num_bytes_written: IntCounter::new("num_bytes_written", "bytes of written").unwrap(),
21 | num_lsm_gets: IntCounter::new("num_lsm_gets", "number of lsm gets").unwrap(),
22 | num_lsm_bloom_hits: IntCounter::new("num_bloom_hits", "number of bloom hits").unwrap(),
23 | num_blocked_puts: IntCounter::new("num_blocked_hits", "number of blocked hits").unwrap(),
24 | num_mem_tables_gets: IntCounter::new("num_mem_tables", "number of the memtable gets")
25 | .unwrap(),
26 | num_gets: IntCounter::new("num_gets", "number of gets").unwrap(),
27 | num_puts: IntCounter::new("num_puts", "number of puts").unwrap(),
28 | block_hash_calc_cost: IntCounter::new(
29 | "block_hash_calc_cost",
30 | "block hash calc cost for bloom"
31 | )
32 | .unwrap(),
33 | };
34 | }
35 |
36 | #[derive(Debug)]
37 | pub struct EvMetrics {
38 | pub lsm_size: IntGaugeVec,
39 | pub vlog_size: IntGauge,
40 | pub pending_writes: IntGauge,
41 |
42 | /// These are cumulative
43 | pub num_reads: IntCounter,
44 | pub num_writes: IntCounter,
45 | pub num_bytes_read: IntCounter,
46 | pub num_bytes_written: IntCounter,
47 | pub num_lsm_gets: IntCounter,
48 | pub num_lsm_bloom_hits: IntCounter,
49 | pub num_gets: IntCounter,
50 | pub num_puts: IntCounter,
51 | pub num_blocked_puts: IntCounter,
52 | /// number of the memtable gets
53 | pub num_mem_tables_gets: IntCounter,
54 | pub block_hash_calc_cost: IntCounter,
55 | }
56 |
57 | impl fmt::Display for EvMetrics {
58 | fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
59 | use tabled::{Table, Tabled};
60 |
61 | #[derive(Tabled)]
62 | struct KeyPair {
63 | label: String,
64 | value: String,
65 | }
66 | let mut kv = vec![];
67 | kv.push(KeyPair {
68 | label: "num_reads".to_owned(),
69 | value: self.num_reads.get().to_string(),
70 | });
71 | kv.push(KeyPair {
72 | label: "num_writes".to_owned(),
73 | value: self.num_writes.get().to_string(),
74 | });
75 | kv.push(KeyPair {
76 | label: "num_bytes_read".to_owned(),
77 | value: self.num_bytes_read.get().to_string(),
78 | });
79 | kv.push(KeyPair {
80 | label: "num_bytes_written".to_owned(),
81 | value: self.num_bytes_written.get().to_string(),
82 | });
83 | kv.push(KeyPair {
84 | label: "num_lsm_gets".to_owned(),
85 | value: self.num_lsm_gets.get().to_string(),
86 | });
87 | kv.push(KeyPair {
88 | label: "num_lsm_bloom_hits".to_owned(),
89 | value: self.num_lsm_bloom_hits.get().to_string(),
90 | });
91 | kv.push(KeyPair {
92 | label: "num_gets".to_owned(),
93 | value: self.num_gets.get().to_string(),
94 | });
95 | kv.push(KeyPair {
96 | label: "num_puts".to_owned(),
97 | value: self.num_puts.get().to_string(),
98 | });
99 | kv.push(KeyPair {
100 | label: "num_blocked_puts".to_owned(),
101 | value: self.num_blocked_puts.get().to_string(),
102 | });
103 | kv.push(KeyPair {
104 | label: "num_mem_tables_gets".to_owned(),
105 | value: self.num_mem_tables_gets.get().to_string(),
106 | });
107 | kv.push(KeyPair {
108 | label: "block_hash_calc_cost".to_owned(),
109 | value: self.block_hash_calc_cost.get().to_string(),
110 | });
111 | let table_str = Table::new(kv).to_string();
112 | f.write_str(&table_str)
113 | }
114 | }
115 |
116 | pub fn get_metrics() -> &'static EvMetrics {
117 | &EV
118 | }
119 |
--------------------------------------------------------------------------------
/src/iterator.rs:
--------------------------------------------------------------------------------
1 | use crate::iterator::PreFetchStatus::Prefetched;
2 | use crate::kv::_BADGER_PREFIX;
3 | use crate::types::{ArcRW, Channel, Closer, TArcMx, TArcRW};
4 | use crate::{hex_str, ValueStruct, KV};
5 | use crate::{
6 | value_log::{MetaBit, ValuePointer},
7 | Decode, MergeIterator, Result, Xiterator, EMPTY_SLICE,
8 | };
9 |
10 | use atomic::Atomic;
11 |
12 | use std::fmt::{Debug, Display, Formatter, Pointer};
13 | use std::future::Future;
14 |
15 | use std::pin::{pin, Pin};
16 |
17 | use std::sync::atomic::Ordering;
18 | use std::sync::Arc;
19 | use std::{io::Cursor, sync::atomic::AtomicU64};
20 | use tokio::io::AsyncWriteExt;
21 | use tokio::sync::{RwLockReadGuard, RwLockWriteGuard};
22 |
/// Records whether an item's value has already been loaded into its
/// in-memory buffer.
#[derive(Clone, Copy, Debug, PartialEq)]
pub(crate) enum PreFetchStatus {
    /// Initial state: no value has been fetched for the item.
    Empty,
    /// A prefetch has completed for the item.
    Prefetched,
}
28 |
29 | #[derive(Clone, Debug)]
30 | pub struct KVItem {
31 | inner: TArcRW,
32 | }
33 |
34 | impl From for KVItem {
35 | fn from(value: KVItemInner) -> Self {
36 | Self {
37 | inner: TArcRW::new(tokio::sync::RwLock::new(value)),
38 | }
39 | }
40 | }
41 | // impl Deref for KVItem {
42 | // type Target = tokio::sync::RwLock<KVItemInner>;
43 | //
44 | // fn deref(&self) -> &Self::Target {
45 | // self.inner.as_ref()
46 | // }
47 | // }
48 |
49 | impl KVItem {
50 | pub async fn key(&self) -> Vec {
51 | let inner = self.rl().await;
52 | inner.key().to_vec()
53 | }
54 |
55 | pub async fn value(&self) -> Result> {
56 | let inner = self.rl().await;
57 | inner.get_value().await
58 | }
59 |
60 | pub async fn has_value(&self) -> bool {
61 | let inner = self.rl().await;
62 | inner.has_value()
63 | }
64 |
65 | pub async fn counter(&self) -> u64 {
66 | let inner = self.rl().await;
67 | inner.counter()
68 | }
69 |
70 | pub async fn user_meta(&self) -> u8 {
71 | let inner = self.rl().await;
72 | inner.user_meta()
73 | }
74 |
75 | pub(crate) async fn rl(&self) -> RwLockReadGuard<'_, KVItemInner> {
76 | self.inner.read().await
77 | }
78 |
79 | pub(crate) async fn wl(&self) -> RwLockWriteGuard<'_, KVItemInner> {
80 | self.inner.write().await
81 | }
82 | }
83 |
84 | // Returned during iteration. Both the key() and value() output is only valid until
85 | // iterator.next() is called.
86 | #[derive(Clone)]
87 | pub(crate) struct KVItemInner {
88 | status: Arc>,
89 | kv: KV,
90 | key: Vec,
91 | // TODO, Opz memory
92 | vptr: Vec,
93 | value: TArcMx>,
94 | meta: u8,
95 | user_meta: u8,
96 | cas_counter: Arc,
97 | wg: Closer,
98 | err: Result<()>,
99 | }
100 |
101 | impl Display for KVItemInner {
102 | fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
103 | f.debug_struct("kv")
104 | .field("key", &hex_str(&self.key))
105 | .field("meta", &self.meta)
106 | .field("user_meta", &self.user_meta)
107 | .field("cas", &self.counter())
108 | .finish()
109 | }
110 | }
111 |
112 | impl Debug for KVItemInner {
113 | fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
114 | f.debug_struct("kv")
115 | .field("key", &hex_str(&self.key))
116 | .field("meta", &self.meta)
117 | .field("user_meta", &self.user_meta)
118 | .field("cas", &self.counter())
119 | .finish()
120 | }
121 | }
122 |
123 | impl KVItemInner {
124 | pub(crate) fn new(key: Vec, value: ValueStruct, kv: KV) -> KVItemInner {
125 | Self {
126 | status: Arc::new(Atomic::new(PreFetchStatus::Empty)),
127 | kv,
128 | key,
129 | value: TArcMx::new(Default::default()),
130 | vptr: value.value,
131 | meta: value.meta,
132 | user_meta: value.user_meta,
133 | cas_counter: Arc::new(AtomicU64::new(value.cas_counter)),
134 | wg: Closer::new("kv".to_owned()),
135 | err: Ok(()),
136 | }
137 | }
138 |
139 | // Returns the key. Remember to copy if you need to access it outside the iteration loop.
140 | pub(crate) fn key(&self) -> &[u8] {
141 | &self.key
142 | }
143 |
144 | // Return value
145 | pub(crate) async fn get_value(&self) -> Result> {
146 | let ch = Channel::new(1);
147 | self.value(|value| {
148 | let tx = ch.tx();
149 | let value = value.to_vec();
150 | Box::pin(async move {
151 | tx.send(value).await.unwrap();
152 | Ok(())
153 | })
154 | })
155 | .await?;
156 | Ok(ch.recv().await.unwrap())
157 | }
158 |
159 | // Value retrieves the value of the item from the value log. It calls the
160 | // consumer function with a slice argument representing the value. In case
161 | // of error, the consumer function is not called.
162 | //
163 | // Note that the call to the consumer func happens synchronously.
164 | pub(crate) async fn value(
165 | &self,
166 | mut consumer: impl FnMut(&[u8]) -> Pin> + Send>>,
167 | ) -> Result<()> {
168 | // Wait result
169 | self.wg.wait().await;
170 | if self.status.load(Ordering::Acquire) == Prefetched {
171 | if self.err.is_err() {
172 | return self.err.clone();
173 | }
174 | let value = self.value.lock().await;
175 | return if value.is_empty() {
176 | consumer(&EMPTY_SLICE).await
177 | } else {
178 | consumer(&value).await
179 | };
180 | }
181 | return self.kv.yield_item_value(self.clone(), consumer).await;
182 | }
183 |
184 | pub(crate) fn has_value(&self) -> bool {
185 | if self.meta == 0 && self.vptr.is_empty() {
186 | return false;
187 | }
188 | if (self.meta & MetaBit::BIT_DELETE.bits()) > 0 {
189 | return false;
190 | }
191 | true
192 | }
193 |
194 | // async fetch value from value_log.
195 | pub(crate) async fn pre_fetch_value(&self) -> Result<()> {
196 | let kv = self.kv.clone();
197 | kv.yield_item_value(self.clone(), |value| {
198 | let status_wl = self.status.clone();
199 | let value = value.to_vec();
200 | let value_wl = self.value.clone();
201 | Box::pin(async move {
202 | status_wl.store(Prefetched, Ordering::Release);
203 | if value.is_empty() {
204 | return Ok(());
205 | }
206 | let mut value_wl = value_wl.lock().await;
207 | *value_wl = value;
208 | Ok(())
209 | })
210 | })
211 | .await
212 | }
213 |
214 | // Returns approximate size of the key-value pair.
215 | //
216 | // This can be called while iterating through a store to quickly estimate the
217 | // size of a range of key-value pairs (without fetching the corresponding)
218 | // values).
219 | pub(crate) fn estimated_size(&self) -> u64 {
220 | if !self.has_value() {
221 | return 0;
222 | }
223 | if self.meta & MetaBit::BIT_VALUE_POINTER.bits() == 0 {
224 | return (self.key.len() + self.vptr.len()) as u64;
225 | }
226 | let mut vpt = ValuePointer::default();
227 | vpt.dec(&mut Cursor::new(&self.vptr)).unwrap();
228 | vpt.len as u64 // includes key length
229 | }
230 |
231 | // Returns the CAS counter associated with the value.
232 | pub(crate) fn counter(&self) -> u64 {
233 | self.cas_counter.load(atomic::Ordering::Acquire)
234 | }
235 |
236 | // Returns the user_meta set by the user. Typically, this byte, optionally set by the user
237 | // is used to interpret the value.
238 | pub(crate) fn user_meta(&self) -> u8 {
239 | self.user_meta
240 | }
241 |
242 | pub(crate) fn meta(&self) -> u8 {
243 | self.meta
244 | }
245 |
246 | pub(crate) fn vptr(&self) -> &[u8] {
247 | &self.vptr
248 | }
249 | }
250 |
/// Options controlling iteration over a Badger key-value store.
#[derive(Clone, Copy, Debug)]
pub struct IteratorOptions {
    /// Whether values should be prefetched and stored during iteration.
    pub(crate) pre_fetch_values: bool,
    /// How many KV pairs to prefetch while iterating. Only meaningful when
    /// `pre_fetch_values` is true.
    pub(crate) pre_fetch_size: isize,
    /// Direction of iteration: `false` is forward, `true` is backward.
    pub(crate) reverse: bool,
}
261 |
262 | impl Default for IteratorOptions {
263 | fn default() -> Self {
264 | DEF_ITERATOR_OPTIONS
265 | }
266 | }
267 |
268 | impl IteratorOptions {
269 | pub fn new(pre_fetch_values: bool, pre_fetch_size: isize, reverse: bool) -> Self {
270 | IteratorOptions {
271 | pre_fetch_values,
272 | pre_fetch_size,
273 | reverse,
274 | }
275 | }
276 | }
277 |
278 | pub(crate) const DEF_ITERATOR_OPTIONS: IteratorOptions = IteratorOptions {
279 | pre_fetch_size: 100,
280 | pre_fetch_values: true,
281 | reverse: false,
282 | };
283 |
284 | /// Helps iterating over the KV pairs in a lexicographically sorted order.
285 | /// skiplist, sst vlog
286 | /// | | |
287 | /// | | |
288 | /// IteratorExt reference
289 | pub struct IteratorExt {
290 | kv: KV,
291 | itr: MergeIterator,
292 | opt: IteratorOptions,
293 | item: ArcRW