├── .gitignore ├── Cargo.toml ├── LICENSE ├── README.md ├── benchmarks ├── Cargo.toml ├── microbenches │ └── entry_codec.rs ├── src │ ├── load.rs │ ├── main.rs │ ├── panel.rs │ ├── read.rs │ └── scan.rs └── ssd-test.fio ├── docs └── image │ └── helix_logo.jpg ├── protos ├── Cargo.toml ├── build.rs ├── flatbuffer │ └── helix.fbs └── src │ └── lib.rs ├── rust-toolchain.toml ├── rustfmt.toml └── src ├── blocks ├── block.rs └── mod.rs ├── cache.rs ├── compact_sched.rs ├── context.rs ├── db.rs ├── error.rs ├── file ├── file_manager.rs ├── mod.rs ├── rick.rs └── sstable.rs ├── fn_registry.rs ├── index.rs ├── io.rs ├── io_worker.rs ├── iterator.rs ├── level.rs ├── lib.rs ├── option.rs ├── table.rs ├── types ├── entry.rs ├── level_info.rs ├── mod.rs ├── rick.rs └── sstable.rs └── util.rs /.gitignore: -------------------------------------------------------------------------------- 1 | **/target 2 | Cargo.lock 3 | 4 | .vscode 5 | 6 | # flatbuffer generated 7 | *_generated.rs -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "helixdb" 3 | version = "0.1.0" 4 | authors = ["Ruihang Xia "] 5 | edition = "2018" 6 | 7 | [dependencies] 8 | thiserror = "1.0" 9 | protos = { path = "protos" } 10 | flatbuffers = "24" 11 | io-uring = "0.5.0" 12 | tokio = { version = "1.4", features = ["full"] } 13 | glommio = "0.9.0" 14 | async-trait = "0.1.51" 15 | lru = "0.6" 16 | futures-util = "0.3" 17 | crossbeam-channel = "0.5" 18 | tracing = "0.1.26" 19 | jemallocator = "0.3.2" 20 | num_cpus = "1.13" 21 | 22 | [dev-dependencies] 23 | tempfile = "3.2" 24 | tracing-subscriber = "0.2.18" 25 | 26 | [workspace] 27 | members = ["benchmarks"] 28 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, 
January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 
39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. 
Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. 
In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. 
We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | # HelixDB 4 | 5 | HelixDB is a Key-Value store written in Rust. Checkout our [wiki](https://github.com/waynexia/helixdb/wiki) to get more! 6 | 7 | # Features 8 | ## Time Series 9 | HelixDB is designed to serve time-series data. "Key-Value" definition here is (`User Key`, `Logical Timestamp`) => `Data` 10 | 11 | ## Time aware 12 | HelixDB organizes data in a time-aware way. This gives HelixDB the ability to efficiently processing time related requests like "Hierarchy" or "Outdate". 13 | 14 | ## Custom Compression 15 | HelixDB gives users an interface to customize their compression method that best suits their data. 16 | 17 | ## Async I/O & Thread-Per-Core 18 | HelixDB use io-uring provided by glommio as IO library. The thread-per-core architecture is also built on top of glommio. 19 | 20 | HelixDB provides async interface, which is `Send` and can be spawned into other async runtime like tokio. 
21 | 22 | # Status 23 | *This project is still in the early stages.* Laking of test coverage, robust functionality, documentation and other things. So 24 | 25 | Any discussion / suggestions / pull requests / issues / ... are welcome :heart: 26 | -------------------------------------------------------------------------------- /benchmarks/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "db_bench" 3 | version = "0.1.0" 4 | authors = ["Ruihang Xia "] 5 | edition = "2018" 6 | 7 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html 8 | 9 | [dependencies] 10 | helixdb = { path = "../" } 11 | tokio = { version = "1.4", features = ["full"] } 12 | indicatif = "0.15.0" 13 | clap = "2.33" 14 | procfs = "0.9.1" 15 | pprof = { version = "0.4.3", features = ["flamegraph"] } 16 | tracing-subscriber = "0.2.18" 17 | tracing = "0.1.26" 18 | rand = { version = "0.8" } 19 | 20 | [dev-dependencies] 21 | criterion = "0.3" 22 | 23 | [[bench]] 24 | name = "entry_codec" 25 | path = "microbenches/entry_codec.rs" 26 | harness = false 27 | -------------------------------------------------------------------------------- /benchmarks/microbenches/entry_codec.rs: -------------------------------------------------------------------------------- 1 | use criterion::{black_box, criterion_group, criterion_main, Criterion}; 2 | use helixdb::Entry; 3 | use rand::{thread_rng, Rng}; 4 | 5 | fn do_benchmark(key_size: usize, value_size: usize, c: &mut Criterion) { 6 | let mut rng = thread_rng(); 7 | let key = (0..key_size).map(|_| rng.gen()).collect(); 8 | let value = (0..value_size).map(|_| rng.gen()).collect(); 9 | let entry = Entry { 10 | timestamp: 1234423, 11 | key, 12 | value, 13 | }; 14 | 15 | c.bench_function( 16 | &format!("encode {}B / {}B entry", key_size, value_size), 17 | |b| b.iter(|| entry.encode()), 18 | ); 19 | let bytes = entry.encode(); 20 | c.bench_function( 21 | 
&format!("decode {}B / {}B entry", key_size, value_size), 22 | |b| b.iter(|| Entry::decode(&bytes)), 23 | ); 24 | } 25 | 26 | fn entry_codec_benchmark(c: &mut Criterion) { 27 | do_benchmark(64, 8, c); 28 | do_benchmark(64, 32, c); 29 | do_benchmark(64, 1024, c); 30 | do_benchmark(64, 4096, c); 31 | } 32 | 33 | fn fibonacci(n: u64) -> u64 { 34 | let mut a = 0; 35 | let mut b = 1; 36 | 37 | match n { 38 | 0 => b, 39 | _ => { 40 | for _ in 0..n { 41 | let c = a + b; 42 | a = b; 43 | b = c; 44 | } 45 | b 46 | } 47 | } 48 | } 49 | 50 | fn some_bench(c: &mut Criterion) { 51 | c.bench_function("fib 20", |b| b.iter(|| fibonacci(black_box(20)))); 52 | } 53 | 54 | criterion_group!(benches, entry_codec_benchmark, some_bench); 55 | criterion_main!(benches); 56 | -------------------------------------------------------------------------------- /benchmarks/src/load.rs: -------------------------------------------------------------------------------- 1 | use std::sync::atomic::{AtomicU64, Ordering}; 2 | use std::sync::Arc; 3 | 4 | use helixdb::{Entry, HelixDB}; 5 | use tokio::runtime::Builder; 6 | 7 | use crate::panel::Panel; 8 | 9 | fn generate_entry(timestamp: i64, key: u64, value_size: usize) -> Entry { 10 | let key = key.to_le_bytes().to_vec(); 11 | let mut value = Vec::with_capacity(value_size); 12 | value.resize_with(value_size, Default::default); 13 | 14 | Entry { 15 | timestamp, 16 | key, 17 | value, 18 | } 19 | } 20 | 21 | pub fn load( 22 | helixdb: HelixDB, 23 | num_thread: usize, 24 | batch_size: usize, 25 | num_key: usize, 26 | num_timestamp: usize, 27 | value_size: usize, 28 | ) { 29 | let total_entry = num_key * num_timestamp; 30 | let mut panel = Panel::with_amount(total_entry as u64); 31 | 32 | let rt = Builder::new_multi_thread() 33 | .worker_threads(num_thread) 34 | .build() 35 | .unwrap(); 36 | let progress = Arc::new(AtomicU64::new(0)); 37 | panel.start(); 38 | 39 | for ts in 0..num_timestamp as i64 { 40 | let keys = (0..num_key as u64).collect::>(); 41 | // 
todo: shuffle keys for "random write". 42 | for keys in keys.chunks(batch_size) { 43 | let keys_len = keys.len() as u64; 44 | let helixdb = helixdb.clone(); 45 | let progress = progress.clone(); 46 | let write_batch = keys 47 | .iter() 48 | .map(|key| generate_entry(ts, *key, value_size)) 49 | .collect(); 50 | rt.spawn(async move { 51 | helixdb.put(write_batch).await.unwrap(); 52 | progress.fetch_add(keys_len, Ordering::Relaxed); 53 | }); 54 | } 55 | } 56 | 57 | loop { 58 | let progress = progress.load(Ordering::Relaxed); 59 | panel.observe(progress); 60 | if progress >= total_entry as u64 { 61 | break; 62 | } 63 | std::thread::sleep(std::time::Duration::from_millis(100)); 64 | } 65 | 66 | rt.block_on(helixdb.close()); 67 | } 68 | -------------------------------------------------------------------------------- /benchmarks/src/main.rs: -------------------------------------------------------------------------------- 1 | use std::convert::TryInto; 2 | use std::io::Write; 3 | use std::path::Path; 4 | use std::sync::Arc; 5 | 6 | use clap::{App, Arg, SubCommand}; 7 | 8 | mod load; 9 | mod panel; 10 | mod read; 11 | mod scan; 12 | 13 | use helixdb::option::Options; 14 | use helixdb::{FnRegistry, HelixDB, SimpleTimestampReviewer}; 15 | use load::load; 16 | use read::read; 17 | use scan::scan; 18 | use tracing::Level; 19 | 20 | fn main() { 21 | tracing_subscriber::fmt() 22 | .with_max_level(Level::TRACE) 23 | .init(); 24 | 25 | let matches = App::new("db_bench") 26 | .about("HelixDB benchmark tool") 27 | .arg( 28 | Arg::with_name("dir") 29 | .long("dir") 30 | .help("Database directory") 31 | .required(true) 32 | .takes_value(true), 33 | ) 34 | .arg( 35 | Arg::with_name("thread") 36 | .long("thread") 37 | .help("Working threads number") 38 | .default_value("8") 39 | .takes_value(true), 40 | ) 41 | .arg( 42 | Arg::with_name("shard") 43 | .long("shard") 44 | .help("Shards number") 45 | .default_value("8") 46 | .takes_value(true), 47 | ) 48 | .arg( 49 | 
Arg::with_name("compact_interval") 50 | .long("compact_interval") 51 | .help("Timestamp range (interval) of each compacted level") 52 | .default_value("1024"), 53 | ) 54 | .subcommand( 55 | SubCommand::with_name("fill") 56 | .about("Write data") 57 | .arg( 58 | Arg::with_name("batch_size") 59 | .long("batch_size") 60 | .help("batch size of each put request") 61 | .default_value("1024"), 62 | ) 63 | .arg( 64 | Arg::with_name("num_key") 65 | .long("num_key") 66 | .help("Number of different keys to fill") 67 | .default_value("1024"), 68 | ) 69 | .arg( 70 | Arg::with_name("num_timestamp") 71 | .long("num_timestamp") 72 | .help("Number of timestamp per key to fill") 73 | .default_value("1024"), 74 | ) 75 | .arg( 76 | Arg::with_name("value_size") 77 | .long("value_size") 78 | .help("Size of each value in Bytes") 79 | .default_value("1024"), 80 | ), 81 | ) 82 | .subcommand( 83 | SubCommand::with_name("read") 84 | .about("Read data") 85 | .arg( 86 | Arg::with_name("max_key") 87 | .long("max_key") 88 | .help("The max user key in database. This is used to specify key range.") 89 | .takes_value(true), 90 | ) 91 | .arg( 92 | Arg::with_name("max_timestamp") 93 | .long("max_timestamp") 94 | .help( 95 | "The max timestamp in database. 
This is used to specify timestamp \ 96 | range.", 97 | ) 98 | .takes_value(true), 99 | ) 100 | .arg( 101 | Arg::with_name("repeat_time") 102 | .long("repeat_time") 103 | .default_value("1024") 104 | .help("Repeat times"), 105 | ), 106 | ) 107 | .subcommand( 108 | SubCommand::with_name("scan") 109 | .about("Scan data") 110 | .arg( 111 | Arg::with_name("key_start") 112 | .long("key_start") 113 | .help("Start key of this scan (inclusive)"), 114 | ) 115 | .arg( 116 | Arg::with_name("key_end") 117 | .long("key_end") 118 | .help("End key of this scan (inclusive)"), 119 | ) 120 | .arg( 121 | Arg::with_name("timestamp_start") 122 | .long("timestamp_start") 123 | .help("Start timestamp of this scan (inclusive)"), 124 | ) 125 | .arg( 126 | Arg::with_name("timestamp_end") 127 | .long("timestamp_end") 128 | .help("End timestamp of this scan (inclusive)"), 129 | ) 130 | .arg( 131 | Arg::with_name("prefetch_size") 132 | .long("prefetch_size") 133 | .help("Prefetch buffer size") 134 | .default_value("8"), 135 | ) 136 | .arg( 137 | Arg::with_name("repeat_time") 138 | .long("repeat_time") 139 | .help("Repeat times") 140 | .default_value("1024"), 141 | ), 142 | ) 143 | .get_matches(); 144 | 145 | let dir = matches.value_of("dir").unwrap(); 146 | let num_thread = matches.value_of("thread").unwrap().parse().unwrap(); 147 | let num_shard = matches.value_of("shard").unwrap().parse().unwrap(); 148 | let compact_interval = matches 149 | .value_of("compact_interval") 150 | .unwrap() 151 | .parse() 152 | .unwrap(); 153 | let db = open_helix(dir, num_shard, compact_interval); 154 | // let guard = pprof::ProfilerGuard::new(100).unwrap(); 155 | 156 | match matches.subcommand() { 157 | ("fill", Some(sub_matches)) => { 158 | let batch_size = sub_matches.value_of("batch_size").unwrap().parse().unwrap(); 159 | let num_key = sub_matches.value_of("num_key").unwrap().parse().unwrap(); 160 | let num_timestamp = sub_matches 161 | .value_of("num_timestamp") 162 | .unwrap() 163 | .parse() 164 | 
.unwrap(); 165 | let value_size = sub_matches.value_of("value_size").unwrap().parse().unwrap(); 166 | 167 | load( 168 | db, 169 | num_thread, 170 | batch_size, 171 | num_key, 172 | num_timestamp, 173 | value_size, 174 | ); 175 | } 176 | 177 | ("read", Some(sub_matches)) => { 178 | let max_key = sub_matches.value_of("max_key").unwrap().parse().unwrap(); 179 | let max_ts = sub_matches 180 | .value_of("max_timestamp") 181 | .unwrap() 182 | .parse() 183 | .unwrap(); 184 | let repeat_time = sub_matches 185 | .value_of("repeat_time") 186 | .unwrap() 187 | .parse() 188 | .unwrap(); 189 | 190 | read(db, num_thread, max_key, max_ts, repeat_time); 191 | } 192 | 193 | ("scan", Some(sub_matches)) => { 194 | let prefetch_size = sub_matches 195 | .value_of("prefetch_size") 196 | .unwrap() 197 | .parse() 198 | .unwrap(); 199 | let repeat_time = sub_matches 200 | .value_of("repeat_time") 201 | .unwrap() 202 | .parse() 203 | .unwrap(); 204 | 205 | scan(db, num_thread, repeat_time, prefetch_size) 206 | } 207 | 208 | _ => unreachable!(), 209 | } 210 | 211 | // post process 212 | // todo: make flamegraph a option 213 | // if let Ok(report) = guard.report().build() { 214 | // let file = File::create("flamegraph.svg").unwrap(); 215 | // report.flamegraph(file).unwrap(); 216 | // }; 217 | std::io::stdout().flush().unwrap(); 218 | } 219 | 220 | fn open_helix>(path: P, num_shard: usize, compact_interval: i64) -> HelixDB { 221 | let simple_tsr = SimpleTimestampReviewer::new(compact_interval, 8192); 222 | let mut fn_registry = FnRegistry::new_noop(); 223 | fn_registry.register_sharding_key_fn(Arc::new(move |key| { 224 | u64::from_le_bytes(key.to_owned().try_into().unwrap()) as usize % num_shard 225 | })); 226 | 227 | let opts = Options::default() 228 | .shards(num_shard) 229 | .set_timestamp_reviewer(Box::new(simple_tsr)) 230 | .set_fn_registry(fn_registry) 231 | .set_task_buffer_size(1024); 232 | 233 | HelixDB::open(path, opts) 234 | } 235 | 
-------------------------------------------------------------------------------- /benchmarks/src/panel.rs: -------------------------------------------------------------------------------- 1 | use std::time::Instant; 2 | 3 | use indicatif::{ProgressBar, ProgressStyle}; 4 | use procfs::{diskstats, DiskStat}; 5 | 6 | pub struct Panel { 7 | amount: u64, 8 | processed: u64, 9 | 10 | bar: ProgressBar, 11 | timer: Instant, 12 | disk_monitor: DiskMonitor, 13 | } 14 | 15 | impl Panel { 16 | pub fn with_amount(amount: u64) -> Self { 17 | let bar = ProgressBar::new(amount); 18 | bar.set_style( 19 | ProgressStyle::default_bar() 20 | .template("{prefix:.bold.dim} [{bar:60}] ({pos}/{len}) {msg}") 21 | .progress_chars("=> "), 22 | ); 23 | 24 | Self { 25 | amount, 26 | processed: 0, 27 | bar, 28 | timer: Instant::now(), 29 | disk_monitor: DiskMonitor::new(), 30 | } 31 | } 32 | 33 | pub fn start(&mut self) { 34 | self.timer = Instant::now(); 35 | } 36 | 37 | pub fn observe(&mut self, progress: u64) { 38 | self.bar.set_position(progress); 39 | self.processed = progress; 40 | 41 | if progress >= self.amount { 42 | self.finish(); 43 | } 44 | } 45 | 46 | pub fn increase(&mut self, delta: u64) { 47 | self.bar.inc(delta); 48 | self.processed += delta; 49 | 50 | if self.processed >= self.amount { 51 | self.finish(); 52 | } 53 | } 54 | 55 | #[allow(dead_code)] 56 | pub fn reset(&mut self, _amount: usize) { 57 | todo!() 58 | } 59 | 60 | fn finish(&mut self) { 61 | let elapsed_ms = self.timer.elapsed().as_millis(); 62 | self.bar.finish_with_message("done"); 63 | 64 | println!("elapsed: {:?} ms", elapsed_ms); 65 | println!( 66 | "average: {:.2} op/sec", 67 | self.amount as f64 / (elapsed_ms as f64 / 1_000.0) 68 | ); 69 | 70 | self.disk_monitor.finish(); 71 | } 72 | 73 | fn conclude(&mut self) { 74 | // todo!() 75 | } 76 | } 77 | 78 | impl Drop for Panel { 79 | fn drop(&mut self) { 80 | self.conclude() 81 | } 82 | } 83 | 84 | struct DiskMonitor { 85 | records: Vec, 86 | } 87 | 88 | impl 
DiskMonitor { 89 | pub fn new() -> Self { 90 | let records = DiskMonitor::stats_iter() 91 | .map(DiskRecord::from_stat) 92 | .collect(); 93 | 94 | Self { records } 95 | } 96 | 97 | pub fn finish(&self) { 98 | let delta = self 99 | .records 100 | .iter() 101 | .zip(DiskMonitor::stats_iter()) 102 | .map(|(record, stat)| record.delta(stat)) 103 | .collect::>(); 104 | 105 | println!("{:?}", delta); 106 | } 107 | 108 | /// Return a iterator of disk stat. Only stats that minor number equals to 0 109 | /// will be preserve. This means to only read the root devices' stat. 110 | fn stats_iter() -> impl Iterator { 111 | diskstats() 112 | .unwrap() 113 | .into_iter() 114 | .filter(|stat| stat.minor == 0) 115 | } 116 | } 117 | 118 | #[allow(dead_code)] 119 | #[derive(Debug)] 120 | struct DiskRecord { 121 | read_req: usize, 122 | read_sec: usize, 123 | time_reading_ms: usize, 124 | write_req: usize, 125 | write_sec: usize, 126 | time_writing_ms: usize, 127 | flush_req: Option, 128 | time_flushing_ms: Option, 129 | } 130 | 131 | impl DiskRecord { 132 | pub fn from_stat(stat: DiskStat) -> Self { 133 | Self { 134 | read_req: stat.reads, 135 | read_sec: stat.sectors_read, 136 | time_reading_ms: stat.time_reading, 137 | write_req: stat.writes, 138 | write_sec: stat.sectors_written, 139 | time_writing_ms: stat.time_writing, 140 | flush_req: stat.flushes, 141 | time_flushing_ms: stat.time_flushing, 142 | } 143 | } 144 | 145 | pub fn delta(&self, stat: DiskStat) -> Self { 146 | Self { 147 | read_req: stat.reads.wrapping_sub(self.read_req), 148 | read_sec: stat.sectors_read.wrapping_sub(self.read_sec), 149 | time_reading_ms: stat.time_reading.wrapping_sub(self.time_reading_ms), 150 | write_sec: stat.sectors_written.wrapping_sub(self.write_sec), 151 | write_req: stat.writes.wrapping_sub(self.write_req), 152 | time_writing_ms: stat.time_writing.wrapping_sub(self.time_writing_ms), 153 | // todo: option sub 154 | flush_req: stat.flushes, 155 | time_flushing_ms: stat.time_flushing, 156 | } 
157 | } 158 | } 159 | -------------------------------------------------------------------------------- /benchmarks/src/read.rs: -------------------------------------------------------------------------------- 1 | use std::sync::atomic::{AtomicU64, Ordering}; 2 | use std::sync::Arc; 3 | 4 | use helixdb::option::ReadOption; 5 | use helixdb::HelixDB; 6 | use rand::{thread_rng, Rng}; 7 | use tokio::runtime::Builder; 8 | 9 | use crate::panel::Panel; 10 | 11 | fn generate_key(key: u64) -> Vec { 12 | key.to_le_bytes().to_vec() 13 | } 14 | 15 | pub fn read(helixdb: HelixDB, num_thread: usize, max_key: u64, max_ts: i64, repeat_time: usize) { 16 | let mut panel = Panel::with_amount(repeat_time as u64); 17 | let found = Arc::new(AtomicU64::new(0)); 18 | 19 | let rt = Builder::new_multi_thread() 20 | .worker_threads(num_thread) 21 | .build() 22 | .unwrap(); 23 | let progress = Arc::new(AtomicU64::new(0)); 24 | panel.start(); 25 | 26 | // todo: shuffle keys for "random write". 27 | for _ in 0..repeat_time { 28 | let helixdb = helixdb.clone(); 29 | let progress = progress.clone(); 30 | let mut rng = thread_rng(); 31 | let key = generate_key(rng.gen_range(0..max_key)); 32 | let ts = rng.gen_range(0..max_ts); 33 | let found = found.clone(); 34 | rt.spawn(async move { 35 | if helixdb 36 | .get(ts, key, ReadOption::default()) 37 | .await 38 | .unwrap() 39 | .is_some() 40 | { 41 | found.fetch_add(1, Ordering::Relaxed); 42 | } 43 | progress.fetch_add(1, Ordering::Relaxed); 44 | }); 45 | } 46 | 47 | loop { 48 | let progress = progress.load(Ordering::Relaxed); 49 | panel.observe(progress); 50 | if progress >= repeat_time as u64 { 51 | break; 52 | } 53 | } 54 | 55 | println!("found {} / {}", found.load(Ordering::Relaxed), repeat_time); 56 | } 57 | -------------------------------------------------------------------------------- /benchmarks/src/scan.rs: -------------------------------------------------------------------------------- 1 | use std::sync::mpsc::channel; 2 | 3 | use 
helixdb::iterator::Iterator; 4 | use helixdb::option::ScanOption; 5 | use helixdb::{HelixDB, NoOrderComparator}; 6 | use tokio::runtime::Builder; 7 | 8 | use crate::panel::Panel; 9 | 10 | pub fn scan(helixdb: HelixDB, num_thread: usize, repeat_time: usize, prefetch_buf_size: usize) { 11 | let (tx, rx) = channel(); 12 | 13 | let mut panel = Panel::with_amount(repeat_time as u64); 14 | let rt = Builder::new_multi_thread() 15 | .worker_threads(num_thread) 16 | .build() 17 | .unwrap(); 18 | 19 | for _ in 0..repeat_time as u64 { 20 | let helixdb = helixdb.clone(); 21 | let tx = tx.clone(); 22 | rt.spawn(async move { 23 | let mut iter = helixdb 24 | .scan::( 25 | (0, 4).into(), 26 | ( 27 | 0usize.to_le_bytes().to_vec(), 28 | 1024usize.to_le_bytes().to_vec(), 29 | ), 30 | ScanOption { prefetch_buf_size }, 31 | ) 32 | .await 33 | .unwrap(); 34 | let mut scan_cnt = 0; 35 | while iter.next().await.unwrap().is_some() { 36 | scan_cnt += 1 37 | } 38 | println!("scanned {} item", scan_cnt); 39 | tx.send(()).unwrap(); 40 | }); 41 | } 42 | 43 | for _ in rx.iter().take(repeat_time) { 44 | panel.increase(1); 45 | } 46 | } 47 | -------------------------------------------------------------------------------- /benchmarks/ssd-test.fio: -------------------------------------------------------------------------------- 1 | # Do some important numbers on SSD drives, to gauge what kind of 2 | # performance you might get out of them. 3 | # 4 | # Sequential read and write speeds are tested, these are expected to be 5 | # high. Random reads should also be fast, random writes are where crap 6 | # drives are usually separated from the good drives. 7 | # 8 | # This uses a queue depth of 4. New SATA SSD's will support up to 32 9 | # in flight commands, so it may also be interesting to increase the queue 10 | # depth and compare. Note that most real-life usage will not see that 11 | # large of a queue depth, so 4 is more representative of normal use. 
12 | # 13 | [global] 14 | bs=4k 15 | ioengine=io_uring 16 | iodepth=64 17 | size=10g 18 | direct=1 19 | runtime=60 20 | directory=/home/wayne/repo/helixdb/benchmarks/target/fio 21 | filename=ssd.test.file 22 | 23 | [seq-read] 24 | rw=read 25 | stonewall 26 | 27 | [rand-read] 28 | rw=randread 29 | stonewall 30 | 31 | [seq-write] 32 | rw=write 33 | stonewall 34 | 35 | [rand-write] 36 | rw=randwrite 37 | stonewall -------------------------------------------------------------------------------- /docs/image/helix_logo.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/waynexia/helixdb/5efb4a4b42f15561428bbabbe89ec70eb8248871/docs/image/helix_logo.jpg -------------------------------------------------------------------------------- /protos/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "protos" 3 | version = "0.1.0" 4 | authors = ["Ruihang Xia "] 5 | edition = "2018" 6 | description = "Generated rust flatbuffer code." 
use std::process::Command;

type Error = Box<dyn std::error::Error>;
type Result<T> = std::result::Result<T, Error>;

/// Schema input and generated-code output locations, relative to the
/// `protos` package root.
const IN_DIR: &str = "flatbuffer/helix.fbs";
const OUT_DIR: &str = "src/";

/// Compile the flatbuffer schema to Rust using the external `flatc` binary.
///
/// # Panics
/// Panics when `flatc` is missing from `PATH` or exits with failure, so the
/// build fails loudly instead of silently using stale generated code.
fn main() -> Result<()> {
    // Scope rebuilds: without these directives cargo reruns the (external,
    // comparatively slow) codegen step on every build of the package.
    println!("cargo:rerun-if-changed={}", IN_DIR);
    println!("cargo:rerun-if-changed=build.rs");

    let status = Command::new("flatc")
        .arg("--rust")
        .arg("-o")
        .arg(OUT_DIR)
        .arg(IN_DIR)
        .status();

    match status {
        Ok(status) if !status.success() => panic!("`flatc` failed to compile the .fbs to Rust"),
        Ok(_status) => Ok(()), // Successfully compiled
        Err(err) => panic!("Could not execute `flatc`: {}", err),
    }
}
10 | struct Timestamp { 11 | timestamp: long; 12 | } 13 | 14 | struct TimeRange { 15 | start: Timestamp; 16 | end: Timestamp; 17 | } 18 | 19 | struct ThreadId{ 20 | id: uint64; 21 | } 22 | 23 | struct LevelId { 24 | id: uint64; 25 | } 26 | 27 | struct LevelDesc { 28 | time_range: TimeRange; 29 | id: LevelId; 30 | } 31 | 32 | table LevelInfo { 33 | infos: [LevelDesc]; 34 | } 35 | 36 | struct Offset { 37 | offset: uint64; 38 | } 39 | 40 | enum ValueFormat : uint32 { 41 | RawValue, 42 | CompressedValue, 43 | } 44 | 45 | table RickSuperBlock { 46 | is_ordered: bool; 47 | legal_offset_start: Offset; 48 | legal_offset_end: Offset; 49 | value_format: ValueFormat; 50 | align_timestamp: Timestamp; 51 | } 52 | 53 | enum BlockType: uint64 { 54 | SuperBlock, 55 | IndexBlock, 56 | FilterBlock, 57 | } 58 | 59 | struct BlockInfo { 60 | block_type: BlockType; 61 | offset: Offset; 62 | length: uint64; 63 | } 64 | 65 | table SSTableSuperBlock { 66 | thread_id: ThreadId; 67 | level_id: LevelId; 68 | blocks: [BlockInfo]; 69 | } 70 | 71 | table IndexBlockEntry { 72 | value_offset: Offset; 73 | timestamp: Timestamp; 74 | key: [ubyte]; 75 | } 76 | -------------------------------------------------------------------------------- /protos/src/lib.rs: -------------------------------------------------------------------------------- 1 | #![allow(unused_imports)] 2 | #![allow(clippy::all)] 3 | 4 | pub mod helix_generated; 5 | 6 | pub use helix_generated::helix::*; 7 | -------------------------------------------------------------------------------- /rust-toolchain.toml: -------------------------------------------------------------------------------- 1 | [toolchain] 2 | channel = "nightly-2024-10-19" 3 | -------------------------------------------------------------------------------- /rustfmt.toml: -------------------------------------------------------------------------------- 1 | imports_layout = "HorizontalVertical" 2 | imports_granularity = "Module" 3 | group_imports = "StdExternalCrate" 4 | 
edition = "2018" 5 | format_code_in_doc_comments = true 6 | format_macro_matchers = true 7 | format_strings = true 8 | wrap_comments = true 9 | -------------------------------------------------------------------------------- /src/blocks/block.rs: -------------------------------------------------------------------------------- 1 | pub struct Block { 2 | data: Vec, 3 | } 4 | 5 | impl Block { 6 | pub fn data(&self) -> &[u8] { 7 | &self.data 8 | } 9 | } 10 | 11 | pub struct BlockBuilder { 12 | buf: Vec, 13 | } 14 | 15 | impl BlockBuilder { 16 | pub fn new(block_size: usize) -> Self { 17 | todo!(); 18 | } 19 | 20 | /// Return whether block_size exceeded. 21 | /// This is a no-op when returning true. 22 | pub fn add(&mut self, data: &[u8]) -> bool { 23 | todo!() 24 | } 25 | 26 | pub fn curr_size(&self) -> usize { 27 | todo!() 28 | } 29 | 30 | pub fn finish(&mut self) -> Block { 31 | todo!() 32 | } 33 | 34 | pub fn reset(&mut self) {} 35 | } 36 | 37 | pub trait BlockIter { 38 | type Key; 39 | type Value; 40 | 41 | fn new(block: Block) -> Self; 42 | 43 | fn seek(&mut self, key: &Self::Key) -> Option<()>; 44 | 45 | fn next(&mut self); 46 | 47 | fn value(&self) -> &Self::Value; 48 | } 49 | -------------------------------------------------------------------------------- /src/blocks/mod.rs: -------------------------------------------------------------------------------- 1 | mod block; 2 | -------------------------------------------------------------------------------- /src/cache.rs: -------------------------------------------------------------------------------- 1 | use std::cell::RefCell; 2 | use std::fmt::Debug; 3 | use std::rc::Rc; 4 | 5 | use lru::LruCache; 6 | 7 | use crate::error::Result; 8 | use crate::table::{TableIdentifier, TableReadHandle}; 9 | use crate::types::{Bytes, LevelId, ThreadId, Timestamp}; 10 | 11 | #[derive(Debug, Clone, Copy)] 12 | pub struct CacheConfig { 13 | /// Number of `SSTableHandle` cache entries. 
14 | pub table_handle_size: usize, 15 | /// Number of "Key - Value" cache entries. 16 | pub kv_cache_size: usize, 17 | /// Number of "Key - Compressed values" cache entries. 18 | pub kc_cache_size: usize, 19 | /// Number of "Key - Position in value log" cache entries. 20 | pub kp_cache_size: usize, 21 | 22 | /// The largest entry size will be held by kv_cache. 23 | pub kv_cache_threshold: usize, 24 | /// The largest entry size will be held by kc_cache. 25 | pub kc_cache_threshold: usize, 26 | } 27 | 28 | impl Default for CacheConfig { 29 | fn default() -> Self { 30 | Self { 31 | table_handle_size: 32, 32 | kp_cache_size: 512, 33 | kv_cache_size: 256, 34 | kc_cache_size: 64, 35 | kv_cache_threshold: 1024, 36 | kc_cache_threshold: 4096, 37 | } 38 | } 39 | } 40 | 41 | /// # Entry Cache 42 | /// There are three types of entry cache: kv (for Key to Value), kc (for Key to 43 | /// Compressed value bytes) and kp (for Key to corresponding value's Position in 44 | /// value log). 45 | /// 46 | /// As the total space for caching is limited, cache small and frequent (or hot) 47 | /// is better. 48 | pub struct Cache { 49 | config: CacheConfig, 50 | handle_cache: RefCell>>, 51 | 52 | kv_cache: RefCell>, 53 | kc_cache: RefCell>, 54 | // todo: make it a `VLogIdentifier`. 
55 | #[allow(clippy::type_complexity)] 56 | kp_cache: RefCell>, 57 | } 58 | 59 | impl Cache { 60 | pub fn with_config(config: CacheConfig) -> Self { 61 | Self { 62 | handle_cache: RefCell::new(LruCache::new(config.table_handle_size)), 63 | kv_cache: RefCell::new(LruCache::new(config.kv_cache_size)), 64 | kc_cache: RefCell::new(LruCache::new(config.kc_cache_size)), 65 | kp_cache: RefCell::new(LruCache::new(config.kp_cache_size)), 66 | 67 | config, 68 | } 69 | } 70 | 71 | pub fn default() -> Self { 72 | Self::with_config(CacheConfig::default()) 73 | } 74 | 75 | pub fn get_table_handle(&self, table_id: &TableIdentifier) -> Option> { 76 | self.handle_cache.borrow_mut().get(table_id).cloned() 77 | } 78 | 79 | pub async fn put_table_handle( 80 | &self, 81 | table_id: TableIdentifier, 82 | handle: Rc, 83 | ) -> Result<()> { 84 | self.handle_cache.borrow_mut().put(table_id, handle); 85 | 86 | Ok(()) 87 | } 88 | 89 | // todo: use `TimeKey` struct instead. 90 | pub fn get_key(&self, time_key: &(Timestamp, Bytes)) -> KeyCacheResult { 91 | if let Some(value) = self.kv_cache.borrow_mut().get(time_key) { 92 | return KeyCacheResult::Value(value.to_owned()); 93 | } else if let Some(compressed) = self.kc_cache.borrow_mut().get(time_key) { 94 | return KeyCacheResult::Compressed(compressed.to_owned()); 95 | } else if let Some((tid, lid, offset)) = self.kp_cache.borrow_mut().get(time_key) { 96 | return KeyCacheResult::Position(*tid, *lid, *offset); 97 | } 98 | 99 | KeyCacheResult::NotFound 100 | } 101 | 102 | pub fn put_key(&self, key_entry: KeyCacheEntry) { 103 | if let Some(value) = key_entry.value { 104 | if value.len() < self.config.kv_cache_threshold { 105 | self.kv_cache 106 | .borrow_mut() 107 | .put(key_entry.key.to_owned(), value.to_owned()); 108 | } 109 | } else if let Some(compressed) = key_entry.compressed { 110 | if compressed.len() < self.config.kv_cache_threshold { 111 | self.kc_cache 112 | .borrow_mut() 113 | .put(key_entry.key.to_owned(), compressed.to_owned()); 114 | 
} 115 | } else if let Some(position) = key_entry.position { 116 | self.kp_cache 117 | .borrow_mut() 118 | .put(key_entry.key.to_owned(), position); 119 | } 120 | } 121 | } 122 | 123 | pub enum KeyCacheResult { 124 | Value(Bytes), 125 | Compressed(Bytes), 126 | /// Thread id and level id is for constructing rick file's identifier. 127 | /// The third `usize` is offset. 128 | Position(ThreadId, LevelId, usize), 129 | NotFound, 130 | } 131 | 132 | impl Debug for KeyCacheResult { 133 | fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { 134 | let mut f = f.debug_struct("KeyCacheResult"); 135 | match self { 136 | KeyCacheResult::Value(bytes) => f.field("Value", &bytes.len()), 137 | KeyCacheResult::Compressed(bytes) => f.field("Compressed", &bytes.len()), 138 | KeyCacheResult::Position(tid, lid, offset) => f.field("Position", &(tid, lid, offset)), 139 | KeyCacheResult::NotFound => f.field("NotFound", &()), 140 | }; 141 | f.finish() 142 | } 143 | } 144 | 145 | /// For inserting Key into cache. 
146 | pub struct KeyCacheEntry<'a> { 147 | pub key: &'a (Timestamp, Bytes), 148 | pub value: Option<&'a Bytes>, 149 | pub compressed: Option<&'a Bytes>, 150 | pub position: Option<(ThreadId, LevelId, usize)>, 151 | } 152 | 153 | impl<'a> KeyCacheEntry<'a> { 154 | pub fn new(key: &'a (Timestamp, Bytes)) -> Self { 155 | Self { 156 | key, 157 | value: None, 158 | compressed: None, 159 | position: None, 160 | } 161 | } 162 | } 163 | -------------------------------------------------------------------------------- /src/compact_sched.rs: -------------------------------------------------------------------------------- 1 | use std::cell::RefCell; 2 | use std::collections::VecDeque; 3 | use std::mem; 4 | use std::rc::Rc; 5 | use std::time::Duration; 6 | 7 | use async_trait::async_trait; 8 | use glommio::timer::TimerActionRepeat; 9 | use glommio::TaskQueueHandle; 10 | 11 | use crate::error::Result; 12 | use crate::level::Levels; 13 | use crate::types::LevelId; 14 | 15 | #[async_trait(?Send)] 16 | pub(crate) trait CompactScheduler: 'static { 17 | fn enqueue(&self, l_id: LevelId); 18 | 19 | fn finished(&self, l_id: LevelId); 20 | 21 | async fn schedule(self: Rc) -> Option; 22 | 23 | fn install(self: Rc, tq: TaskQueueHandle) -> Result<()> { 24 | let sched = self.clone(); 25 | TimerActionRepeat::repeat_into(move || sched.clone().schedule(), tq)?; 26 | 27 | Ok(()) 28 | } 29 | } 30 | 31 | pub(crate) struct QueueUpCompSched { 32 | is_compacting: RefCell, 33 | interval: Duration, 34 | queue: RefCell>, 35 | delay_num: usize, 36 | levels: Rc>, 37 | tq: TaskQueueHandle, 38 | } 39 | 40 | impl QueueUpCompSched { 41 | /// Create a not fully initialized instance. The return value should be 42 | /// `init()` first. 43 | /// 44 | /// This is expected to create a "memory leak" manifests as cyclic reference 45 | /// ([Self] and [Levels]) after `init()`. 
46 | pub(crate) unsafe fn new_zeroed( 47 | interval: Duration, 48 | delay_num: usize, 49 | tq: TaskQueueHandle, 50 | ) -> Rc { 51 | Rc::new(Self { 52 | is_compacting: RefCell::new(false), 53 | interval, 54 | queue: RefCell::new(VecDeque::new()), 55 | delay_num, 56 | levels: mem::transmute::, Rc>>(Rc::new(())), 57 | tq, 58 | }) 59 | } 60 | 61 | /// Initialize this with given levels. 62 | pub(crate) fn init(self: Rc, levels: Rc>) { 63 | unsafe { 64 | let empty_rc = mem::replace( 65 | &mut (*(Rc::as_ptr(&self) as *mut QueueUpCompSched)).levels, 66 | levels.clone(), 67 | ); 68 | let _ = mem::transmute::>, Rc<()>>(empty_rc); 69 | } 70 | } 71 | 72 | fn enqueue(&self, l_id: LevelId) { 73 | self.queue.borrow_mut().push_back(l_id); 74 | } 75 | 76 | fn finished(&self, l_id: LevelId) { 77 | *self.is_compacting.borrow_mut() = false; 78 | } 79 | 80 | async fn schedule(self: Rc) -> Option { 81 | if *self.is_compacting.borrow() || self.queue.borrow().len() < self.delay_num { 82 | return Some(self.interval); 83 | } 84 | 85 | let level_id = self.queue.borrow_mut().pop_front().unwrap(); 86 | *self.is_compacting.borrow_mut() = true; 87 | 88 | let levels = self.levels.clone(); 89 | glommio::spawn_local_into( 90 | async move { 91 | // todo: propagate Error? 92 | let _ = levels.compact_level(level_id).await; 93 | }, 94 | self.tq, 95 | ) 96 | .unwrap() 97 | .detach(); 98 | 99 | Some(self.interval) 100 | } 101 | 102 | /// For writing mock test. 103 | /// 104 | /// # Panic 105 | /// `levels` in the returning object is not initialize (an empty `Weak`). 106 | /// Any operations make this to call `levels` will panic due to 107 | /// the attempt of trying to upgrade that empty weak pointer. 
108 | #[cfg(test)] 109 | pub(crate) fn default() -> (Rc, TaskQueueHandle) { 110 | let tq = glommio::executor().create_task_queue( 111 | glommio::Shares::default(), 112 | glommio::Latency::NotImportant, 113 | "test_comp_tq", 114 | ); 115 | let this = Self { 116 | is_compacting: RefCell::new(false), 117 | interval: Duration::from_secs(1), 118 | queue: RefCell::new(VecDeque::new()), 119 | delay_num: 3, 120 | levels: unsafe { 121 | std::mem::transmute::, Rc>>(Rc::new(())) 122 | }, 123 | tq, 124 | }; 125 | 126 | (Rc::new(this), tq) 127 | } 128 | } 129 | 130 | #[async_trait(?Send)] 131 | impl CompactScheduler for QueueUpCompSched { 132 | fn enqueue(&self, l_id: LevelId) { 133 | self.enqueue(l_id) 134 | } 135 | 136 | fn finished(&self, l_id: LevelId) { 137 | self.finished(l_id) 138 | } 139 | 140 | async fn schedule(self: Rc) -> Option { 141 | self.schedule().await 142 | } 143 | } 144 | -------------------------------------------------------------------------------- /src/context.rs: -------------------------------------------------------------------------------- 1 | use crate::file::FileManager; 2 | use crate::fn_registry::FnRegistry; 3 | 4 | pub struct Context { 5 | pub fn_registry: FnRegistry, 6 | pub(crate) file_manager: FileManager, 7 | } 8 | -------------------------------------------------------------------------------- /src/db.rs: -------------------------------------------------------------------------------- 1 | use std::collections::HashMap; 2 | use std::intrinsics::unlikely; 3 | use std::path::Path; 4 | use std::sync::atomic::{AtomicBool, Ordering}; 5 | use std::sync::Arc; 6 | 7 | use futures_util::future::{join_all, try_join_all}; 8 | use glommio::channels::channel_mesh::MeshBuilder; 9 | use glommio::{ExecutorJoinHandle, LocalExecutor, LocalExecutorBuilder}; 10 | use tokio::sync::mpsc::{channel as bounded_channel, Sender}; 11 | use tokio::sync::oneshot::channel as oneshot; 12 | use tokio::sync::Mutex; 13 | use tracing::info; 14 | 15 | use 
crate::context::Context; 16 | use crate::error::{HelixError, Result}; 17 | use crate::file::FileManager; 18 | use crate::io_worker::{IOWorker, Task}; 19 | use crate::iterator::{Iterator, ShardMuxTimeIterator, ShardTimeIterator, TimeIterator}; 20 | use crate::option::{Options, ReadOption, ScanOption}; 21 | use crate::types::{Bytes, Entry, TimeRange}; 22 | use crate::util::Comparator; 23 | 24 | /// Size of channels that used to do IPC between shards. 25 | const CHANNEL_MESH_SIZE: usize = 128; 26 | 27 | #[derive(Clone)] 28 | pub struct HelixDB { 29 | core: Arc, 30 | } 31 | 32 | impl HelixDB { 33 | pub fn open>(path: P, opts: Options) -> Self { 34 | Self { 35 | core: Arc::new(HelixCore::new(path, opts)), 36 | } 37 | } 38 | 39 | /// Open HelixDB with default [Options] 40 | pub fn open_default>(path: P) -> Self { 41 | let opts = Options::default(); 42 | Self::open(path, opts) 43 | } 44 | 45 | pub async fn put(&self, write_batch: Vec) -> Result<()> { 46 | self.core.sharding_put(write_batch).await 47 | } 48 | 49 | pub async fn direct_put(&self, shard_id: usize, write_batch: Vec) -> Result<()> { 50 | self.core.put_unchecked(shard_id, write_batch).await 51 | } 52 | 53 | pub async fn get(&self, timestamp: i64, key: Bytes, opt: ReadOption) -> Result> { 54 | self.core.sharding_get(timestamp, key, opt).await 55 | } 56 | 57 | pub async fn direct_get( 58 | &self, 59 | shard_id: usize, 60 | timestamp: i64, 61 | key: Bytes, 62 | opt: ReadOption, 63 | ) -> Result> { 64 | self.core.get_unchecked(shard_id, timestamp, key, opt).await 65 | } 66 | 67 | pub async fn scan( 68 | &self, 69 | time_range: TimeRange, 70 | key_range: (Bytes, Bytes), 71 | opt: ScanOption, 72 | ) -> Result { 73 | self.core.scan::(time_range, key_range, opt).await 74 | } 75 | 76 | pub async fn close(self) { 77 | info!("Closing HelixDB"); 78 | self.core.close().await; 79 | } 80 | } 81 | 82 | unsafe impl Send for HelixDB {} 83 | unsafe impl Sync for HelixDB {} 84 | 85 | pub(crate) struct HelixCore { 86 | /// Join 
handles of shards' working threads. 87 | worker_handle: Vec>, 88 | task_txs: Vec>, 89 | ctx: Arc, 90 | is_closed: AtomicBool, 91 | } 92 | 93 | impl HelixCore { 94 | fn new>(path: P, mut opts: Options) -> Self { 95 | let file_manager = FileManager::with_base_dir(path, opts.num_shard).unwrap(); 96 | let ctx = Arc::new(Context { 97 | file_manager, 98 | fn_registry: opts.fn_registry.take().unwrap(), 99 | }); 100 | let tsr = Arc::new(Mutex::new(opts.tsr.take().unwrap())); 101 | let level_info = LocalExecutor::default().run(async { 102 | // initialize components requiring runtime. 103 | Arc::new(Mutex::new( 104 | ctx.file_manager.open_level_info().await.unwrap(), 105 | )) 106 | }); 107 | 108 | let mut worker_handle = Vec::with_capacity(opts.num_shard); 109 | let mut task_txs = Vec::with_capacity(opts.num_shard); 110 | let mesh_builder = MeshBuilder::full(opts.num_shard, CHANNEL_MESH_SIZE); 111 | 112 | for tid in 0..opts.num_shard as u64 { 113 | let tsr = tsr.clone(); 114 | let ctx = ctx.clone(); 115 | let opts = opts.clone_partial(); 116 | let (tx, rx) = bounded_channel(opts.task_buffer_size); 117 | let mesh_builder = mesh_builder.clone(); 118 | let level_info = level_info.clone(); 119 | 120 | let handle = LocalExecutorBuilder::new(glommio::Placement::Fixed(tid as usize)) 121 | .spawn(move || async move { 122 | let (sender, receiver) = mesh_builder.join().await.unwrap(); 123 | let worker = IOWorker::try_new(tid, opts, tsr, level_info, ctx, sender) 124 | .await 125 | .unwrap(); 126 | worker.run(rx, receiver).await 127 | }) 128 | .unwrap(); 129 | 130 | worker_handle.push(handle); 131 | task_txs.push(tx); 132 | } 133 | 134 | Self { 135 | worker_handle, 136 | task_txs, 137 | ctx, 138 | is_closed: AtomicBool::new(false), 139 | } 140 | } 141 | 142 | /// Dispatch entries in write batch to corresponding shards. 
143 | async fn sharding_put(&self, write_batch: Vec) -> Result<()> { 144 | self.check_closed()?; 145 | 146 | let mut tasks = HashMap::>::new(); 147 | 148 | for entry in write_batch { 149 | let shard_id = self.ctx.fn_registry.sharding_fn()(&entry.key); 150 | tasks.entry(shard_id).or_default().push(entry); 151 | } 152 | 153 | let mut futures = Vec::with_capacity(tasks.len()); 154 | for (shard_id, write_batch) in tasks { 155 | futures.push(self.put_unchecked(shard_id, write_batch)); 156 | } 157 | 158 | try_join_all(futures).await?; 159 | Ok(()) 160 | } 161 | 162 | /// Put on specified shard without routing. 163 | async fn put_unchecked(&self, worker: usize, write_batch: Vec) -> Result<()> { 164 | self.check_closed()?; 165 | 166 | let (tx, rx) = oneshot(); 167 | let task = Task::Put(write_batch, tx); 168 | 169 | self.task_txs[worker].send(task).await?; 170 | 171 | rx.await? 172 | } 173 | 174 | async fn sharding_get( 175 | &self, 176 | timestamp: i64, 177 | key: Bytes, 178 | opt: ReadOption, 179 | ) -> Result> { 180 | self.check_closed()?; 181 | 182 | let shard_id = self.ctx.fn_registry.sharding_fn()(&key); 183 | self.get_unchecked(shard_id, timestamp, key, opt).await 184 | } 185 | 186 | /// Get on specified shard without routing. 187 | async fn get_unchecked( 188 | &self, 189 | worker: usize, 190 | timestamp: i64, 191 | key: Bytes, 192 | opt: ReadOption, 193 | ) -> Result> { 194 | self.check_closed()?; 195 | 196 | let (tx, rx) = oneshot(); 197 | let task = Task::Get(timestamp, key, tx, opt); 198 | 199 | self.task_txs[worker].send(task).await?; 200 | 201 | rx.await? 
202 | } 203 | 204 | async fn scan( 205 | &self, 206 | time_range: TimeRange, 207 | key_range: (Bytes, Bytes), 208 | opt: ScanOption, 209 | ) -> Result { 210 | self.check_closed()?; 211 | 212 | let iters: Vec<_> = (0..self.shards()) 213 | .map(|worker| (worker, key_range.clone())) 214 | .map(async |(worker, key_range)| -> Result<_> { 215 | let (tx, rx) = bounded_channel(opt.prefetch_buf_size); 216 | 217 | self.task_txs[worker] 218 | .send(Task::Scan( 219 | time_range, 220 | key_range.0, 221 | key_range.1, 222 | tx, 223 | Arc::new(C::cmp), 224 | )) 225 | .await?; 226 | Ok(ShardTimeIterator::new(rx).await) 227 | }) 228 | .collect(); 229 | 230 | let iters = try_join_all(iters).await?; 231 | let mux_iter = ShardMuxTimeIterator::::new(iters, opt.prefetch_buf_size).await; 232 | let iter = TimeIterator::new(mux_iter); 233 | 234 | Ok(iter) 235 | } 236 | 237 | async fn close(&self) { 238 | self.is_closed.store(true, Ordering::SeqCst); 239 | 240 | for index in 0..self.shards() { 241 | let _ = self.task_txs[index].send(Task::Shutdown).await; 242 | } 243 | 244 | join_all(self.task_txs.iter().map(|sender| sender.closed())).await; 245 | } 246 | 247 | fn shards(&self) -> usize { 248 | self.worker_handle.len() 249 | } 250 | 251 | fn check_closed(&self) -> Result<()> { 252 | // false positive 253 | #[allow(unused_unsafe)] 254 | if unsafe { unlikely(self.is_closed.load(Ordering::SeqCst)) } { 255 | return Err(HelixError::Closed); 256 | } 257 | 258 | Ok(()) 259 | } 260 | } 261 | 262 | impl Drop for HelixCore { 263 | fn drop(&mut self) { 264 | drop(std::mem::take(&mut self.task_txs)); 265 | 266 | for handle in std::mem::take(&mut self.worker_handle) { 267 | let _ = handle.join(); 268 | } 269 | } 270 | } 271 | 272 | #[cfg(test)] 273 | mod test { 274 | use std::convert::TryInto; 275 | 276 | use tempfile::tempdir; 277 | 278 | use super::*; 279 | use crate::{FnRegistry, LexicalComparator, SimpleTimestampReviewer}; 280 | 281 | #[tokio::test] 282 | async fn example() { 283 | let base_dir = 
tempdir().unwrap(); 284 | let db = HelixDB::open_default(base_dir.path()); 285 | 286 | let entry = Entry { 287 | timestamp: 0, 288 | key: b"key".to_vec(), 289 | value: b"value".to_vec(), 290 | }; 291 | db.put(vec![entry.clone()]).await.unwrap(); 292 | 293 | let result = db 294 | .get(0, b"key".to_vec(), ReadOption::default()) 295 | .await 296 | .unwrap(); 297 | assert_eq!(result.unwrap(), entry); 298 | } 299 | 300 | async fn scan_test_scaffold( 301 | num_shard: usize, 302 | num_timestamp: i64, 303 | num_key: u64, 304 | compact_interval: i64, 305 | ) { 306 | assert!(num_timestamp > 0, "timestamp number should be positive"); 307 | 308 | let mut fn_registry = FnRegistry::new_noop(); 309 | fn_registry.register_sharding_key_fn(Arc::new(move |key| { 310 | u64::from_le_bytes(key.to_owned().try_into().unwrap()) as usize % num_shard 311 | })); 312 | let simple_tsr = SimpleTimestampReviewer::new(compact_interval, i64::MAX); 313 | let opts = Options::default() 314 | .shards(num_shard) 315 | .set_fn_registry(fn_registry) 316 | .set_timestamp_reviewer(Box::new(simple_tsr)); 317 | let base_dir = tempdir().unwrap(); 318 | let db = HelixDB::open(base_dir.path(), opts); 319 | 320 | // write 321 | for timestamp in 0..num_timestamp { 322 | let entries = (0..num_key) 323 | .into_iter() 324 | .map(|key| Entry { 325 | timestamp, 326 | key: key.to_le_bytes().to_vec(), 327 | value: b"value".to_vec(), 328 | }) 329 | .collect(); 330 | db.put(entries).await.unwrap(); 331 | } 332 | 333 | println!("write finished"); 334 | 335 | // scan 336 | let mut iter = db 337 | .scan::( 338 | (0, num_timestamp).into(), 339 | (0u64.to_le_bytes().to_vec(), num_key.to_le_bytes().to_vec()), 340 | ScanOption { 341 | prefetch_buf_size: 1, 342 | }, 343 | ) 344 | .await 345 | .unwrap(); 346 | 347 | let mut count = 0; 348 | while iter.is_valid() { 349 | iter.next().await.unwrap(); 350 | count += 1; 351 | } 352 | 353 | assert_eq!(num_timestamp as u64 * num_key, count); 354 | } 355 | 356 | #[tokio::test] 357 | async 
fn scan_1_shard_without_compaction() { 358 | scan_test_scaffold(1, 10, 128, 1024).await; 359 | } 360 | 361 | #[tokio::test] 362 | async fn scan_many_shards_without_compaction() { 363 | scan_test_scaffold(num_cpus::get(), 10, 128, 1024).await; 364 | } 365 | 366 | #[tokio::test] 367 | async fn scan_many_shards_with_compaction() { 368 | scan_test_scaffold(2, 64, 8, 32).await; 369 | } 370 | 371 | #[tokio::test] 372 | async fn recover_from_restart() { 373 | let base_dir = tempdir().unwrap(); 374 | let opts = Options::default() 375 | .set_timestamp_reviewer(Box::new(SimpleTimestampReviewer::new(5, 100))) 376 | .shards(1); 377 | let db = HelixDB::open(base_dir.path(), opts); 378 | 379 | let tasks = (0..50) 380 | .map(|ts| { 381 | db.put(vec![Entry { 382 | timestamp: ts, 383 | key: b"key".to_vec(), 384 | value: b"value".to_vec(), 385 | }]) 386 | }) 387 | .collect::>(); 388 | try_join_all(tasks).await.unwrap(); 389 | db.close().await; 390 | 391 | let opts = Options::default() 392 | .set_timestamp_reviewer(Box::new(SimpleTimestampReviewer::new(5, 100))) 393 | .shards(1); 394 | let db = HelixDB::open(base_dir.path(), opts); 395 | for ts in 0..50 { 396 | let result = db 397 | .get(ts, b"key".to_vec(), ReadOption::default()) 398 | .await 399 | .unwrap(); 400 | assert_eq!(result.unwrap().value, b"value".to_vec()); 401 | } 402 | } 403 | } 404 | -------------------------------------------------------------------------------- /src/error.rs: -------------------------------------------------------------------------------- 1 | use std::io; 2 | 3 | use thiserror::Error; 4 | 5 | use crate::io_worker::Task; 6 | use crate::types::Entry; 7 | 8 | pub type Result = std::result::Result; 9 | 10 | #[derive(Error, Debug)] 11 | pub enum HelixError { 12 | #[error("IO error {0}")] 13 | IO(#[from] io::Error), 14 | #[error("Glommio error {0}")] 15 | Glommio(#[from] glommio::GlommioError<()>), 16 | #[error("Common HelixDB error")] 17 | Common, 18 | #[error("Element not found")] 19 | NotFound, 20 | 
#[error("Task dropped")] 21 | Dropped(#[from] tokio::sync::oneshot::error::RecvError), 22 | #[error("Failed to send due to Helix is stopped")] 23 | Stopped(#[from] tokio::sync::mpsc::error::SendError), 24 | #[error("Operation {0} is poisoned")] 25 | Poisoned(String), 26 | // todo: review this usage. 27 | #[error("Internal channel disconnected")] 28 | Disconnected(#[from] tokio::sync::mpsc::error::SendError>), 29 | #[error("Incompatible length or size, expect {0}, got {1}")] 30 | IncompatibleLength(usize, usize), 31 | #[error("Helix is closed")] 32 | Closed, 33 | #[error("Running into unreachable situation {0}")] 34 | Unreachable(String), 35 | } 36 | -------------------------------------------------------------------------------- /src/file/file_manager.rs: -------------------------------------------------------------------------------- 1 | use std::collections::HashMap; 2 | use std::fs; 3 | use std::path::{Path, PathBuf}; 4 | use std::rc::Rc; 5 | use std::sync::Arc; 6 | 7 | use tokio::sync::Mutex; 8 | use tracing::warn; 9 | 10 | use crate::error::{HelixError, Result}; 11 | use crate::io::File; 12 | use crate::types::{Bytes, LevelId, LevelInfo, ThreadId}; 13 | use crate::util::{AssertSend, AssertSync}; 14 | 15 | const COMMON_FILE_PREFIX: &str = "helix"; 16 | const COMMON_FILE_EXTENSION: &str = "hlx"; 17 | const BINARY_FILE_EXTENSION: &str = "bin"; 18 | 19 | const LEVEL_INFO_FILENAME: &str = "LEVEL_INFO"; 20 | 21 | pub(crate) enum FileType { 22 | Rick, 23 | VLog, 24 | SSTable, 25 | Manifest, 26 | Others(String), 27 | } 28 | 29 | impl FileType { 30 | fn file_name_desc(&self) -> &str { 31 | match self { 32 | FileType::Rick => "rick", 33 | FileType::VLog => "vlog", 34 | FileType::SSTable => "sst", 35 | FileType::Manifest => "manifest", 36 | FileType::Others(name) => name, 37 | } 38 | } 39 | } 40 | 41 | #[derive(Debug, Clone, Copy, Hash, PartialEq, Eq)] 42 | pub(crate) enum FileNo { 43 | LevelInfo, 44 | Rick(LevelId), 45 | SSTable(LevelId), 46 | } 47 | 48 | impl FileNo { 
49 | fn name(&self) -> String { 50 | match self { 51 | FileNo::LevelInfo => "LEVEL_INFO".to_string(), 52 | FileNo::Rick(l_id) => format!("rick-{}.{}", l_id, BINARY_FILE_EXTENSION), 53 | FileNo::SSTable(l_id) => format!("sst-{}.{}", l_id, BINARY_FILE_EXTENSION), 54 | } 55 | } 56 | } 57 | 58 | pub(crate) enum OtherType { 59 | /// Timestamp range of each level. 60 | LevelInfo, 61 | /// Thread Identifier. 62 | TId, 63 | } 64 | 65 | #[derive(Clone)] 66 | struct RawFilePtr(Rc); 67 | 68 | unsafe impl Send for RawFilePtr {} 69 | unsafe impl Sync for RawFilePtr {} 70 | 71 | /// Proxy for all file open/create operations. 72 | /// 73 | /// It will keep opened file until a explicit garbage collection. So others 74 | /// needn't to close file. 75 | pub(crate) struct FileManager { 76 | base_dir: PathBuf, 77 | // todo: GC. maybe do it when outdating some level. 78 | fd_pool: Arc>>, 79 | } 80 | 81 | impl AssertSync for FileManager {} 82 | impl AssertSend for FileManager {} 83 | 84 | impl FileManager { 85 | pub(crate) fn with_base_dir>(base_dir: P, shards: usize) -> Result { 86 | fs::create_dir_all(base_dir.as_ref())?; 87 | 88 | // check dir 89 | let dir_num = fs::read_dir(base_dir.as_ref())? 90 | .map(|dir| Ok(dir?.file_type()?.is_dir())) 91 | .collect::>>()? 
92 | .len(); 93 | if dir_num != shards { 94 | warn!( 95 | "Detected {} folder in {:?}, which isn't equal to given shard number {}", 96 | dir_num, 97 | base_dir.as_ref().to_str(), 98 | shards 99 | ); 100 | } 101 | 102 | // create sub dir 103 | for id in 0..shards { 104 | match fs::create_dir(base_dir.as_ref().join(id.to_string())) { 105 | Err(e) if e.kind() == std::io::ErrorKind::AlreadyExists => {} 106 | other => other?, 107 | } 108 | } 109 | 110 | Ok(Self { 111 | base_dir: base_dir.as_ref().to_path_buf(), 112 | fd_pool: Arc::default(), 113 | }) 114 | } 115 | 116 | pub(crate) async fn open(&self, tid: ThreadId, file_no: FileNo) -> Result> { 117 | if let Some(file_ptr) = self.fd_pool.lock().await.get(&(tid, file_no)) { 118 | return Ok(file_ptr.0.clone()); 119 | } 120 | 121 | let name = file_no.name(); 122 | let path = self.base_dir.join(tid.to_string()).join(name); 123 | let file = Rc::new(File::open(path).await?); 124 | let cache_file = file.clone(); 125 | self.fd_pool 126 | .lock() 127 | .await 128 | .insert((tid, file_no), RawFilePtr(cache_file)); 129 | 130 | Ok(file) 131 | } 132 | 133 | /// Close files not used by others, i.e., strong count is 1. 134 | /// 135 | /// # Notice 136 | /// As [FileManager] is shared between all shards, it keep all files that 137 | /// should not be visible to other shards. Trying to close with wrong `tid` 138 | /// is undefined behavior. 139 | pub(crate) async fn close_some(&self, tid: ThreadId) -> Result<()> { 140 | let free_list = self 141 | .fd_pool 142 | .lock() 143 | .await 144 | .extract_if(|(thread_id, _), file| *thread_id == tid && Rc::strong_count(&file.0) == 1) 145 | .collect::>(); 146 | 147 | for (_, file) in free_list { 148 | match Rc::try_unwrap(file.0) { 149 | Ok(file) => file.close().await?, 150 | Err(file) => { 151 | return Err(HelixError::Unreachable( 152 | "Going to close a file which is still referenced".to_string(), 153 | )); 154 | } 155 | } 156 | } 157 | 158 | Ok(()) 159 | } 160 | 161 | // todo: deprecate this. 
162 | /// Open or create [LevelInfo]. 163 | pub(crate) async fn open_level_info(&self) -> Result { 164 | let filename = self.base_dir.join(LEVEL_INFO_FILENAME); 165 | let file = File::open(filename).await?; 166 | 167 | // read all 168 | let size = file.size().await?; 169 | let buf = file.read(0, size).await?; 170 | file.close().await?; 171 | 172 | let level_info = LevelInfo::decode(&buf); 173 | Ok(level_info) 174 | } 175 | 176 | // todo: correct this. 177 | /// Refresh (overwrite) level info file. 178 | pub(crate) async fn sync_level_info(&self, bytes: Bytes) -> Result<()> { 179 | let filename = self.base_dir.join(LEVEL_INFO_FILENAME); 180 | let file = File::open(filename).await?; 181 | 182 | file.write(bytes, 0).await?; 183 | file.sync().await?; 184 | file.close().await?; 185 | 186 | Ok(()) 187 | } 188 | } 189 | 190 | #[cfg(test)] 191 | mod test { 192 | use std::os::unix::io::AsRawFd; 193 | 194 | use glommio::LocalExecutor; 195 | use tempfile::tempdir; 196 | 197 | use super::*; 198 | 199 | #[test] 200 | fn new_file_manager() { 201 | let ex = LocalExecutor::default(); 202 | let base_dir = tempdir().unwrap(); 203 | let file_manager = FileManager::with_base_dir(base_dir.path(), 8).unwrap(); 204 | 205 | ex.run(async { 206 | assert_eq!(base_dir.path().read_dir().unwrap().count(), 8); 207 | }); 208 | } 209 | 210 | #[test] 211 | fn reopen_file() { 212 | let ex = LocalExecutor::default(); 213 | let base_dir = tempdir().unwrap(); 214 | let file_manager = FileManager::with_base_dir(base_dir.path(), 1).unwrap(); 215 | 216 | ex.run(async { 217 | let info_file = file_manager.open(0, FileNo::LevelInfo).await.unwrap(); 218 | let first_fd = info_file.as_raw_fd(); 219 | 220 | drop(info_file); 221 | let info_file = file_manager.open(0, FileNo::LevelInfo).await.unwrap(); 222 | let second_fd = info_file.as_raw_fd(); 223 | 224 | assert_eq!(first_fd, second_fd); 225 | }); 226 | } 227 | } 228 | -------------------------------------------------------------------------------- 
/src/file/mod.rs: -------------------------------------------------------------------------------- 1 | mod file_manager; 2 | mod rick; 3 | mod sstable; 4 | 5 | pub(crate) use file_manager::{FileManager, FileNo}; 6 | pub use rick::Rick; 7 | pub use sstable::{IndexBlockBuilder, SSTable, TableBuilder}; 8 | -------------------------------------------------------------------------------- /src/file/rick.rs: -------------------------------------------------------------------------------- 1 | use std::collections::BTreeMap; 2 | use std::rc::Rc; 3 | use std::time::Instant; 4 | 5 | use tracing::trace; 6 | 7 | use crate::error::Result; 8 | use crate::index::MemIndex; 9 | use crate::io::File; 10 | use crate::types::{ 11 | Bytes, 12 | Entry, 13 | EntryMeta, 14 | Offset, 15 | RickSuperBlock, 16 | TimeRange, 17 | Timestamp, 18 | ValueFormat, 19 | }; 20 | use crate::util::check_bytes_length; 21 | 22 | /// Handles to entries in rick (level 0). 23 | /// 24 | /// Every shard will only have up to one rick file at any time. 25 | /// 26 | /// (above is out-of-date) 27 | /// 28 | /// Rick file may contains "hole" due to garbage collection. 29 | /// It will have a [RickSuperBlock] at the very beginning (offset 0) 30 | /// contains two pointers "start" and "end" (start < end) 31 | /// which can tell where the legal range is. The start pointer should 32 | /// points to a record's beginning. The start pointer is pushed by GC procedure 33 | /// and end pointer is pushed by both `append()` method and GC procedure. 34 | /// 35 | /// Rick can be either ordered or disordered, dependents on which level 36 | /// it sites. 37 | pub struct Rick { 38 | file: Rc, 39 | sb: RickSuperBlock, 40 | } 41 | 42 | impl Rick { 43 | /// Open a `Rick` from given file. 44 | /// 45 | /// Optional parameter `value_format` will be used to initialize a rick 46 | /// file. If the rick file is not empty it will be ignored. 
If `None` is 47 | /// provided, the `value_format` field in super block will be set to 48 | /// default value, which is `RawValue`. 49 | pub async fn open(file: Rc, value_format: Option) -> Result { 50 | let sb = Self::read_super_block(&file, value_format).await?; 51 | 52 | Ok(Self { file, sb }) 53 | } 54 | 55 | /// Returns vector of (timestamp, key, entry's offset) to update index. 56 | /// 57 | /// `sync()` will be called before return. 58 | /// 59 | /// Once this method return, this `append` operation is considered finished 60 | /// on rick file. Even if it crashed before returned indices are 61 | /// persist. 62 | /// 63 | /// Encoding format: | payload length (u64) | payload | 64 | // todo: is it necessary to return inserted timestamp and key? 65 | pub async fn append(&mut self, entries: Vec) -> Result> { 66 | let mut positions = Vec::with_capacity(entries.len()); 67 | let file_length = self.sb.legal_offset_end; 68 | 69 | // construct binary buffer. 70 | let mut buf = vec![]; 71 | for entry in entries { 72 | let bytes = entry.encode(); 73 | let length = EntryMeta::new(bytes.len() as u64).encode(); 74 | let buf_len_before = buf.len() as u64; 75 | buf.extend_from_slice(&length); 76 | buf.extend_from_slice(&bytes); 77 | positions.push((entry.timestamp, entry.key, file_length + buf_len_before)); 78 | } 79 | 80 | // write to file 81 | let new_file_length = file_length + buf.len() as u64; 82 | self.file.write(buf, file_length).await?; 83 | 84 | // update super block and sync 85 | self.sb.legal_offset_end = new_file_length; 86 | self.sync_super_block().await?; 87 | self.sync().await?; 88 | 89 | Ok(positions) 90 | } 91 | 92 | /// Read from a offset. 93 | /// 94 | /// Entry not found will be return as a error. 95 | /// 96 | /// Maybe verify key here? 
97 | pub async fn read(&self, offset: u64) -> Result { 98 | let meta_buf = self 99 | .file 100 | .read(offset, EntryMeta::meta_size() as u64) 101 | .await?; 102 | check_bytes_length(&meta_buf, EntryMeta::meta_size())?; 103 | let meta = EntryMeta::decode(&meta_buf); 104 | 105 | let offload_buf = self 106 | .file 107 | .read(offset + EntryMeta::meta_size() as u64, meta.length) 108 | .await?; 109 | check_bytes_length(&offload_buf, meta.length as usize)?; 110 | 111 | Ok(Entry::decode(&offload_buf)) 112 | } 113 | 114 | /// Reads offsets. 115 | // todo: this might be refined by batching io. 116 | pub async fn reads(&mut self, mut offsets: Vec) -> Result> { 117 | // fast pass 118 | if offsets.len() < 2 { 119 | return match offsets.first() { 120 | Some(offset) => Ok(vec![self.read(*offset).await?]), 121 | None => Ok(vec![]), 122 | }; 123 | } 124 | 125 | let now = Instant::now(); 126 | trace!("[rick] start reads {} entries", offsets.len()); 127 | 128 | offsets.sort_unstable(); 129 | let min = *offsets.first().unwrap(); 130 | let max = offsets.remove(offsets.len() - 1); 131 | let bytes = self.file.read(min, max - min).await?; 132 | let mut entries_iter = Self::decode_entries(&bytes)?.into_iter().peekable(); 133 | let mut entries = Vec::with_capacity(offsets.len() + 1); 134 | 135 | trace!( 136 | "[rick] read and decode takes {:?} ms", 137 | now.elapsed().as_millis() 138 | ); 139 | 140 | // filter decoded entries via given offsets 141 | for offset in &offsets { 142 | while entries_iter.peek().unwrap().1 + min != *offset { 143 | entries_iter.next(); 144 | } 145 | entries.push(entries_iter.next().unwrap().0); 146 | } 147 | 148 | trace!("[rick] filter takes {:?} ms", now.elapsed().as_millis()); 149 | 150 | // read the last offset 151 | entries.push(self.read(max).await?); 152 | 153 | Ok(entries) 154 | } 155 | 156 | pub fn is_compressed(&self) -> bool { 157 | self.sb.value_format == ValueFormat::CompressedValue 158 | } 159 | 160 | /// Scan this rick file and construct its memindex 
161 | /// 162 | /// Generally, Rick file will couple with a persisted index file SSTable. 163 | /// Except those new ricks that memindex is not flushed to disk yet. 164 | pub async fn construct_index(&self) -> Result { 165 | let contents = self 166 | .file 167 | .read(self.start(), self.end() - self.start()) 168 | .await?; 169 | let mut index = 0; 170 | 171 | let mut indices = BTreeMap::new(); 172 | let mut offset = RickSuperBlock::LENGTH; 173 | 174 | while index < contents.len() { 175 | let prefix_buf = &contents[index..index + EntryMeta::meta_size()]; 176 | index += EntryMeta::meta_size(); 177 | let meta = EntryMeta::decode(prefix_buf); 178 | let offload_length = meta.length as usize; 179 | let offload_buf = &contents[index..index + offload_length]; 180 | index += offload_length; 181 | let entry = Entry::decode(offload_buf); 182 | 183 | indices.insert((entry.timestamp, entry.key), offset as u64); 184 | offset += EntryMeta::meta_size() + offload_length; 185 | } 186 | 187 | let mem_index = MemIndex::from_existing(indices); 188 | Ok(mem_index) 189 | } 190 | 191 | pub async fn sync(&self) -> Result<()> { 192 | self.file.sync().await?; 193 | 194 | Ok(()) 195 | } 196 | 197 | /// Recycle entries in given `range` by free them using `fallocate` syscall. 198 | /// 199 | /// The general procedure would be like: 200 | /// - Traverse some entries from "start", for each entry 201 | /// - suit in `range`, should be recycle. 202 | /// - not suit, query index (if have) whether it is legal. Put it into 203 | /// "need rewrite" buffer if is, and discard if not. 204 | /// - Acquire write lock and write those "need rewrite" to the end of file. 205 | /// Then update index (if have) and sync index (if need). 206 | /// - Sync super block to update "start" and "end" pointer to make above 207 | /// change visible. After this the write l ock can be released. 208 | /// - Recycle space occupied by those offset is smaller than "start" 209 | /// pointer. 
210 | pub async fn garbage_collect(&self, range: TimeRange) -> Result<()> { 211 | // yield control to executor. 212 | glommio::yield_if_needed().await; 213 | 214 | todo!() 215 | } 216 | 217 | // This is a temporary work around. Should be replaced by `garbage_collect()` 218 | // above. 219 | pub async fn clean(&mut self) -> Result<()> { 220 | // mark as illegal 221 | self.sb.legal_offset_start = self.sb.legal_offset_end; 222 | self.sync_super_block().await 223 | } 224 | 225 | pub async fn push_legal_offset_start(&mut self, new_offset_start: Offset) -> Result<()> { 226 | if new_offset_start <= self.sb.legal_offset_start { 227 | return Ok(()); 228 | } 229 | 230 | self.sb.legal_offset_start = new_offset_start; 231 | self.sync_super_block().await 232 | } 233 | 234 | pub fn get_legal_offset_end(&self) -> Offset { 235 | self.sb.legal_offset_end 236 | } 237 | 238 | /// Read super block from the first 4KB block of file. 239 | /// And if file is empty a new super block will be created. 240 | /// 241 | /// `value_format` only works when initializing rick file. 242 | /// Default value is `RawValue`. 243 | async fn read_super_block( 244 | file: &File, 245 | value_format: Option, 246 | ) -> Result { 247 | // check whether super block exist. 248 | let file_length = file.size().await?; 249 | if file_length == 0 { 250 | let value_format = value_format.unwrap_or(ValueFormat::RawValue); 251 | // create super block and write it to file. 252 | let sb = RickSuperBlock { 253 | // todo: make it a parameter. 254 | is_ordered: false, 255 | legal_offset_start: RickSuperBlock::LENGTH as u64, 256 | legal_offset_end: RickSuperBlock::LENGTH as u64, 257 | // todo: make it a parameter. 258 | value_format, 259 | align_timestamp: 0, 260 | }; 261 | 262 | let buf = sb.encode(); 263 | file.write(buf, 0).await?; 264 | 265 | Ok(sb) 266 | } else { 267 | // otherwise read from head. 
268 | let buf = file.read(0, RickSuperBlock::LENGTH as u64).await?; 269 | let sb = RickSuperBlock::decode(&buf); 270 | 271 | Ok(sb) 272 | } 273 | } 274 | 275 | // todo: check crash consistency. 276 | async fn sync_super_block(&self) -> Result<()> { 277 | let buf = self.sb.encode(); 278 | self.file.write(buf, 0).await?; 279 | 280 | Ok(()) 281 | } 282 | 283 | /// Decode to entries and the offset over input bytes. 284 | // todo: let `construct_index()` use this 285 | fn decode_entries(contents: &[u8]) -> Result> { 286 | let mut index = 0; 287 | let mut offset = 0; 288 | let mut entries = vec![]; 289 | 290 | while index < contents.len() { 291 | let prefix_buf = &contents[index..index + EntryMeta::meta_size()]; 292 | index += EntryMeta::meta_size(); 293 | let meta = EntryMeta::decode(prefix_buf); 294 | let offload_length = meta.length as usize; 295 | let offload_buf = &contents[index..index + offload_length]; 296 | index += offload_length; 297 | let entry = Entry::decode(offload_buf); 298 | entries.push((entry, offset as u64)); 299 | 300 | offset += EntryMeta::meta_size() + offload_length; 301 | } 302 | 303 | Ok(entries) 304 | } 305 | 306 | /// Get rick's start offset 307 | #[inline] 308 | pub fn start(&self) -> Offset { 309 | self.sb.legal_offset_start 310 | } 311 | 312 | /// Get rick's end offset. 
313 | #[inline] 314 | pub fn end(&self) -> Offset { 315 | self.sb.legal_offset_end 316 | } 317 | 318 | pub fn get_align_ts(&self) -> Timestamp { 319 | self.sb.align_timestamp 320 | } 321 | 322 | pub async fn set_align_ts(&mut self, ts: Timestamp) -> Result<()> { 323 | self.sb.align_timestamp = ts; 324 | self.sync_super_block().await 325 | } 326 | } 327 | 328 | #[cfg(test)] 329 | mod test { 330 | use glommio::LocalExecutor; 331 | use tempfile::tempdir; 332 | 333 | use super::*; 334 | use crate::file::file_manager::FileManager; 335 | use crate::file::FileNo; 336 | 337 | #[test] 338 | fn new_super_block() { 339 | let ex = LocalExecutor::default(); 340 | 341 | ex.run(async { 342 | let base_dir = tempdir().unwrap(); 343 | let file_manager = FileManager::with_base_dir(base_dir.path(), 1).unwrap(); 344 | let rick_file = file_manager.open(0, FileNo::Rick(0)).await.unwrap(); 345 | let mut rick = Rick::open(rick_file, None).await.unwrap(); 346 | 347 | assert_eq!(RickSuperBlock::LENGTH, rick.start() as usize); 348 | assert_eq!(RickSuperBlock::LENGTH, rick.end() as usize); 349 | 350 | // write something 351 | let entry = Entry { 352 | timestamp: 1, 353 | key: b"key".to_vec(), 354 | value: b"value".to_vec(), 355 | }; 356 | rick.append(vec![entry.clone()]).await.unwrap(); 357 | let new_rick_end = rick.end(); 358 | assert_ne!(RickSuperBlock::LENGTH, rick.end() as usize); 359 | 360 | // close and open again 361 | drop(rick); 362 | file_manager.close_some(0).await.unwrap(); 363 | let rick_file = file_manager.open(0, FileNo::Rick(0)).await.unwrap(); 364 | let rick = Rick::open(rick_file, None).await.unwrap(); 365 | 366 | assert_eq!(RickSuperBlock::LENGTH, rick.start() as usize); 367 | assert_eq!(new_rick_end, rick.end()); 368 | }); 369 | } 370 | 371 | #[test] 372 | fn read_write_one_entry() { 373 | let ex = LocalExecutor::default(); 374 | 375 | ex.run(async { 376 | let base_dir = tempdir().unwrap(); 377 | let file_manager = FileManager::with_base_dir(base_dir.path(), 1).unwrap(); 
378 | let rick_file = file_manager.open(0, FileNo::Rick(0)).await.unwrap(); 379 | let mut rick = Rick::open(rick_file, None).await.unwrap(); 380 | 381 | let entry = Entry { 382 | timestamp: 1, 383 | key: b"key".to_vec(), 384 | value: b"value".to_vec(), 385 | }; 386 | rick.append(vec![entry.clone()]).await.unwrap(); 387 | 388 | let read_entry = rick.read(RickSuperBlock::LENGTH as u64).await.unwrap(); 389 | assert_eq!(entry, read_entry); 390 | }); 391 | } 392 | 393 | #[test] 394 | fn reconstruct_memindex() { 395 | let ex = LocalExecutor::default(); 396 | 397 | ex.run(async { 398 | let base_dir = tempdir().unwrap(); 399 | let file_manager = FileManager::with_base_dir(base_dir.path(), 1).unwrap(); 400 | let rick_file = file_manager.open(0, FileNo::Rick(0)).await.unwrap(); 401 | let mut rick = Rick::open(rick_file, None).await.unwrap(); 402 | 403 | let entries = vec![ 404 | // one key with three timestamps. 405 | (1, b"key1".to_vec(), b"value".to_vec()).into(), 406 | (2, b"key1".to_vec(), b"value".to_vec()).into(), 407 | (3, b"key1".to_vec(), b"value".to_vec()).into(), 408 | // overwrite 409 | (1, b"key2".to_vec(), b"value1".to_vec()).into(), 410 | (1, b"key2".to_vec(), b"value2".to_vec()).into(), 411 | ]; 412 | rick.append(entries.clone()).await.unwrap(); 413 | 414 | let memindex = rick.construct_index().await.unwrap(); 415 | 416 | assert_eq!(3, *memindex.user_keys.get(&b"key1".to_vec()).unwrap()); 417 | assert_eq!(1, *memindex.user_keys.get(&b"key2".to_vec()).unwrap()); 418 | 419 | for index in memindex.into_iter() { 420 | rick.read(index.1).await.unwrap(); 421 | } 422 | }); 423 | } 424 | 425 | #[test] 426 | fn rick_reads_method() { 427 | let ex = LocalExecutor::default(); 428 | 429 | ex.run(async { 430 | let base_dir = tempdir().unwrap(); 431 | let file_manager = FileManager::with_base_dir(base_dir.path(), 1).unwrap(); 432 | let rick_file = file_manager.open(0, FileNo::Rick(0)).await.unwrap(); 433 | let mut rick = Rick::open(rick_file, None).await.unwrap(); 434 | 435 
| let mut entries = vec![ 436 | (1, b"key1".to_vec(), b"value".to_vec()).into(), 437 | (2, b"key1".to_vec(), b"value".to_vec()).into(), 438 | (3, b"key1".to_vec(), b"value".to_vec()).into(), 439 | (1, b"key2".to_vec(), b"value".to_vec()).into(), 440 | (2, b"key2".to_vec(), b"value".to_vec()).into(), 441 | (3, b"key2".to_vec(), b"value".to_vec()).into(), 442 | (1, b"key3".to_vec(), b"value".to_vec()).into(), 443 | (2, b"key3".to_vec(), b"value".to_vec()).into(), 444 | (3, b"key3".to_vec(), b"value".to_vec()).into(), 445 | ]; 446 | let mut offsets: Vec = rick 447 | .append(entries.clone()) 448 | .await 449 | .unwrap() 450 | .into_iter() 451 | .map(|item| item.2) 452 | .collect(); 453 | 454 | // all entries 455 | let reads_result = rick.reads(offsets.clone()).await.unwrap(); 456 | assert_eq!(entries, reads_result); 457 | 458 | // eliminate some in the middle 459 | entries.remove(entries.len() / 2); 460 | offsets.remove(offsets.len() / 2); 461 | let reads_result = rick.reads(offsets.clone()).await.unwrap(); 462 | assert_eq!(entries, reads_result); 463 | }); 464 | } 465 | } 466 | -------------------------------------------------------------------------------- /src/file/sstable.rs: -------------------------------------------------------------------------------- 1 | use std::rc::Rc; 2 | use std::sync::Arc; 3 | 4 | use tracing::error; 5 | 6 | use crate::context::Context; 7 | use crate::error::{HelixError, Result}; 8 | use crate::file::{FileNo, Rick}; 9 | use crate::index::MemIndex; 10 | use crate::io::File; 11 | use crate::table::TableReadHandle; 12 | use crate::types::sstable::{BlockInfo, BlockType, IndexBlockEntry, SSTableSuperBlock}; 13 | use crate::types::{Bytes, LevelId, Offset, ThreadId, Timestamp}; 14 | use crate::util::{check_bytes_length, decode_u64, encode_u64}; 15 | 16 | pub struct SSTable { 17 | file: Rc, 18 | sb: SSTableSuperBlock, 19 | } 20 | 21 | impl SSTable { 22 | pub async fn open(file: Rc) -> Result { 23 | let sb = Self::read_super_block(&file).await?; 
24 | 25 | Ok(Self { file, sb }) 26 | } 27 | 28 | pub async fn into_read_handle(self, ctx: Arc) -> Result { 29 | // read index block 30 | let index_blocks = self.sb.get_block_info(BlockType::IndexBlock); 31 | if index_blocks.is_empty() { 32 | error!("index block is empty"); 33 | return Err(HelixError::NotFound); 34 | } 35 | let mut indices = vec![]; 36 | for block in index_blocks { 37 | let block_buf = self.file.read(block.offset, block.length).await?; 38 | check_bytes_length(&block_buf, block.length as usize)?; 39 | let memindex = IndexBlockReader::read(block_buf)?; 40 | indices.push(memindex); 41 | } 42 | // todo: merge multi mem-indices 43 | let memindex = indices.pop().unwrap(); 44 | 45 | // open rick file 46 | let rick_file = ctx 47 | .file_manager 48 | .open(self.sb.thread_id, FileNo::Rick(self.sb.level_id)) 49 | .await?; 50 | let rick = Rick::open(rick_file, None).await?; 51 | 52 | let handle = TableReadHandle::new(memindex, self, rick, ctx); 53 | Ok(handle) 54 | } 55 | 56 | /// Read super block from the first 4KB block of file. 57 | /// And if file is empty a new super block will be created. 58 | // todo: duplicate code with `Rick::read_super_block()` 59 | async fn read_super_block(file: &File) -> Result { 60 | // check whether super block exist. 61 | let file_length = file.size().await?; 62 | if file_length == 0 { 63 | // create super block and write it to file. 64 | let sb = SSTableSuperBlock { 65 | // todo: which default value? 66 | thread_id: 0, 67 | level_id: 0, 68 | blocks: vec![], 69 | }; 70 | 71 | let buf = sb.encode(); 72 | file.write(buf, 0).await?; 73 | 74 | Ok(sb) 75 | } else { 76 | // otherwise read from head. 
77 | let buf = file.read(0, SSTableSuperBlock::LENGTH as u64).await?; 78 | let sb = SSTableSuperBlock::decode(&buf); 79 | 80 | Ok(sb) 81 | } 82 | } 83 | } 84 | 85 | pub struct TableBuilder { 86 | thread_id: ThreadId, 87 | level_id: LevelId, 88 | file: Rc, 89 | block_buffer: Bytes, 90 | blocks: Vec, 91 | tail_offset: Offset, 92 | } 93 | 94 | impl TableBuilder { 95 | /// Start to build table. 96 | pub fn begin(thread_id: ThreadId, level_id: LevelId, file: Rc) -> Self { 97 | Self { 98 | thread_id, 99 | level_id, 100 | file, 101 | block_buffer: vec![], 102 | blocks: vec![], 103 | tail_offset: SSTableSuperBlock::LENGTH as u64, 104 | } 105 | } 106 | 107 | pub fn add_block(&mut self, block_builder: impl BlockBuilder) { 108 | let (block_type, mut block_data) = block_builder.finish(); 109 | 110 | let block_size = block_data.len() as u64; 111 | self.blocks.push(BlockInfo { 112 | block_type, 113 | offset: self.tail_offset, 114 | length: block_size, 115 | }); 116 | self.block_buffer.append(&mut block_data); 117 | self.tail_offset += block_size; 118 | } 119 | 120 | /// Consume this builder to build a SSTable. 
121 | pub async fn finish(self) -> Result<()> { 122 | // write super block 123 | let sb = SSTableSuperBlock { 124 | thread_id: self.thread_id, 125 | level_id: self.level_id, 126 | blocks: self.blocks, 127 | }; 128 | self.file.write(sb.encode(), 0).await?; 129 | 130 | debug_assert_eq!( 131 | self.block_buffer.len(), 132 | self.tail_offset as usize - SSTableSuperBlock::LENGTH 133 | ); 134 | // write other blocks 135 | // todo: finish this in one write req 136 | self.file 137 | .write(self.block_buffer, SSTableSuperBlock::LENGTH as u64) 138 | .await?; 139 | 140 | self.file.sync().await?; 141 | 142 | Ok(()) 143 | } 144 | } 145 | 146 | pub trait BlockBuilder { 147 | fn finish(self) -> (BlockType, Bytes); 148 | } 149 | 150 | pub struct IndexBlockBuilder { 151 | entry_buffer: Bytes, 152 | } 153 | 154 | impl IndexBlockBuilder { 155 | pub fn new() -> Self { 156 | Self { 157 | entry_buffer: vec![], 158 | } 159 | } 160 | 161 | pub fn from_memindex() -> Self { 162 | todo!() 163 | } 164 | 165 | pub fn add_entry(&mut self, key: &[u8], timestamp: Timestamp, offset: Offset) { 166 | let index_entry = IndexBlockEntry { 167 | value_offset: offset, 168 | timestamp, 169 | key: key.to_owned(), 170 | }; 171 | let mut entry_bytes = index_entry.encode(); 172 | let bytes_len = entry_bytes.len() as u64; 173 | 174 | self.entry_buffer.append(&mut encode_u64(bytes_len)); 175 | self.entry_buffer.append(&mut entry_bytes); 176 | } 177 | } 178 | 179 | impl BlockBuilder for IndexBlockBuilder { 180 | fn finish(self) -> (BlockType, Bytes) { 181 | (BlockType::IndexBlock, self.entry_buffer) 182 | } 183 | } 184 | 185 | pub trait BlockReader { 186 | type Output; 187 | 188 | fn read(_: Bytes) -> Result; 189 | } 190 | 191 | pub struct IndexBlockReader {} 192 | 193 | impl BlockReader for IndexBlockReader { 194 | type Output = MemIndex; 195 | 196 | fn read(mut data: Bytes) -> Result { 197 | let mut memindex = MemIndex::default(); 198 | 199 | // todo: benchmark this 200 | while !data.is_empty() { 201 | // read 
length 202 | let length_buf: Vec<_> = data.drain(..std::mem::size_of::()).collect(); 203 | check_bytes_length(&length_buf, std::mem::size_of::())?; 204 | let length = decode_u64(&length_buf) as usize; 205 | 206 | // read index entry 207 | let data_buf: Vec<_> = data.drain(..length).collect(); 208 | check_bytes_length(&data_buf, length)?; 209 | let index_entry = IndexBlockEntry::decode(&data_buf); 210 | 211 | memindex.insert(( 212 | index_entry.timestamp, 213 | index_entry.key, 214 | index_entry.value_offset, 215 | ))?; 216 | } 217 | 218 | Ok(memindex) 219 | } 220 | } 221 | 222 | #[cfg(test)] 223 | mod test { 224 | use glommio::LocalExecutor; 225 | use tempfile::tempdir; 226 | 227 | use super::*; 228 | use crate::file::FileManager; 229 | use crate::fn_registry::FnRegistry; 230 | 231 | #[test] 232 | fn index_block_builder_and_reader() { 233 | let ex = LocalExecutor::default(); 234 | ex.run(async { 235 | let mut builder = IndexBlockBuilder::new(); 236 | 237 | builder.add_entry(&b"key1".to_vec(), 1, 3); 238 | builder.add_entry(&b"key2".to_vec(), 1, 10); 239 | 240 | let (block_type, bytes) = builder.finish(); 241 | assert_eq!(BlockType::IndexBlock, block_type); 242 | 243 | let memindex = IndexBlockReader::read(bytes).unwrap(); 244 | assert_eq!(memindex.get(&(1, b"key1".to_vec())).unwrap(), Some(3)); 245 | assert_eq!(memindex.get(&(1, b"key2".to_vec())).unwrap(), Some(10)); 246 | }); 247 | } 248 | 249 | #[test] 250 | fn simple_table_builder() { 251 | let ex = LocalExecutor::default(); 252 | ex.run(async { 253 | let base_dir = tempdir().unwrap(); 254 | let file_manager = FileManager::with_base_dir(base_dir.path(), 1).unwrap(); 255 | let ctx = Arc::new(Context { 256 | file_manager, 257 | fn_registry: FnRegistry::new_noop(), 258 | }); 259 | let mut table_builder = TableBuilder::begin( 260 | 0, 261 | 1, 262 | ctx.file_manager.open(0, FileNo::SSTable(1)).await.unwrap(), 263 | ); 264 | let mut index_bb = IndexBlockBuilder::new(); 265 | 266 | let indices = vec![ 267 | 
(b"key1".to_vec(), 1, 1), 268 | (b"key2key2".to_vec(), 2, 2), 269 | (b"key333".to_vec(), 3, 3), 270 | ]; 271 | 272 | for index in &indices { 273 | index_bb.add_entry(&index.0, index.1, index.2); 274 | } 275 | table_builder.add_block(index_bb); 276 | table_builder.finish().await.unwrap(); 277 | 278 | let table_handle = 279 | SSTable::open(ctx.file_manager.open(0, FileNo::SSTable(1)).await.unwrap()) 280 | .await 281 | .unwrap() 282 | .into_read_handle(ctx) 283 | .await 284 | .unwrap(); 285 | 286 | for index in indices { 287 | assert_eq!( 288 | table_handle.get_offset(&(index.1, index.0)).unwrap(), 289 | Some(index.2) 290 | ); 291 | } 292 | assert_eq!( 293 | table_handle 294 | .get_offset(&(233, b"not exist".to_vec())) 295 | .unwrap(), 296 | None 297 | ); 298 | }); 299 | } 300 | } 301 | -------------------------------------------------------------------------------- /src/fn_registry.rs: -------------------------------------------------------------------------------- 1 | use std::collections::{HashMap, VecDeque}; 2 | use std::convert::TryInto; 3 | use std::sync::Arc; 4 | 5 | use crate::error::{HelixError, Result}; 6 | use crate::types::{Bytes, Timestamp}; 7 | 8 | /// Custom compaction function. This will be called when compacting L0 9 | /// files to L1. 10 | /// 11 | /// The inputs are key, [(timestamp, values),]. 12 | pub type CompressFn = Arc) -> Bytes + Send + Sync>; 13 | 14 | /// The inputs are key and compressed bytes. 15 | /// Output is [(timestamp, values),] 16 | pub type DecompressFn = Arc Vec<(Timestamp, Bytes)> + Send + Sync>; 17 | 18 | /// `UDCF` stands for "User Defined Compress Function". 19 | /// Includes compress and decompress implementation. 
20 | #[derive(Clone)] 21 | #[allow(clippy::upper_case_acronyms)] 22 | pub struct UDCF { 23 | name: String, 24 | compress_fn: CompressFn, 25 | decompress_fn: DecompressFn, 26 | } 27 | 28 | impl UDCF { 29 | pub fn new(name: String, compress_fn: CompressFn, decompress_fn: DecompressFn) -> Self { 30 | Self { 31 | name, 32 | compress_fn, 33 | decompress_fn, 34 | } 35 | } 36 | 37 | pub fn compress(&self) -> CompressFn { 38 | self.compress_fn.clone() 39 | } 40 | 41 | pub fn decompress(&self) -> DecompressFn { 42 | self.decompress_fn.clone() 43 | } 44 | } 45 | 46 | /// Determine compress function based on key. 47 | /// 48 | /// This will be called on each key that going to be compressed. 49 | pub type CompressDispatchFn = Arc &str + Send + Sync>; 50 | 51 | /// Dispatch key to different shards. Called "sharding a key". 52 | /// 53 | /// Input type is a reference to a key in `Bytes` and output is which shard this 54 | /// key belongs to. 55 | pub type ShardingKeyFn = Arc usize + Send + Sync>; 56 | 57 | pub struct FnRegistry { 58 | sharding_key_fn: ShardingKeyFn, 59 | dispatch_fn: CompressDispatchFn, 60 | compress_functions: HashMap, 61 | } 62 | 63 | impl FnRegistry { 64 | // #[cfg(test)] 65 | pub fn new_noop() -> Self { 66 | let mut compress_functions = HashMap::new(); 67 | compress_functions.insert("noop".to_string(), noop_udcf()); 68 | Self { 69 | sharding_key_fn: noop_sharding_key_fn(), 70 | dispatch_fn: noop_dispatch_fn(), 71 | compress_functions, 72 | } 73 | } 74 | 75 | pub fn register_udcf(&mut self, udcf: UDCF) { 76 | self.compress_functions.insert(udcf.name.clone(), udcf); 77 | } 78 | 79 | pub fn register_dispatch_fn(&mut self, dispatch_fn: CompressDispatchFn) { 80 | self.dispatch_fn = dispatch_fn; 81 | } 82 | 83 | pub fn register_sharding_key_fn(&mut self, sharding_key_fn: ShardingKeyFn) { 84 | self.sharding_key_fn = sharding_key_fn; 85 | } 86 | 87 | pub fn dispatch_fn(&self) -> CompressDispatchFn { 88 | self.dispatch_fn.clone() 89 | } 90 | 91 | pub fn 
sharding_fn(&self) -> &ShardingKeyFn {
        &self.sharding_key_fn
    }

    /// Look up a user-defined compress function (UDCF) by name.
    ///
    /// Returns [HelixError::NotFound] when no UDCF is registered under `name`.
    pub fn udcf(&self, name: &str) -> Result<UDCF> {
        self.compress_functions
            .get(name)
            .cloned()
            .ok_or(HelixError::NotFound)
    }

    /// Compress `data` (timestamp/value pairs belonging to `key`) with the
    /// UDCF selected by the dispatch function.
    pub fn compress_entries(&self, key: Bytes, data: Vec<(i64, Vec<u8>)>) -> Result<Bytes> {
        let compress_fn_name = self.dispatch_fn()(&key);
        let compress_fn = self.udcf(compress_fn_name)?.compress();
        Ok(compress_fn(key.clone(), data))
    }

    /// Decompress `data` previously produced by `compress_entries` for `key`.
    pub fn decompress_entries(&self, key: &[u8], data: &[u8]) -> Result<Vec<(Timestamp, Bytes)>> {
        let name = self.dispatch_fn()(key);
        let udcf = self.udcf(name)?;
        Ok(udcf.decompress()(key.to_owned(), data))
    }
}

pub fn noop_sharding_key_fn() -> ShardingKeyFn {
    Arc::new(|_| 0)
}

/// Dispatch all keys to [noop_udcf].
pub fn noop_dispatch_fn() -> CompressDispatchFn {
    Arc::new(|_| "noop")
}

/// A No-Op compress function.
///
/// Compress: first put all entries' bytes together. Then followed a block of
/// bytes records each entry's length in u64. The last 8 bytes is how many
/// entries sited.
/// ```text
/// | N var-length bytes | N * u64 for length | N as u64 |
/// ```
pub fn noop_udcf() -> UDCF {
    let compress_fn: CompressFn = Arc::new(|_key, ts_values| {
        let value_num = ts_values.len() as u64;

        // concat timestamp and value together.
        let ts_values: Vec<Bytes> = ts_values
            .into_iter()
            .map(|(ts, mut value)| {
                let mut ts_bytes = ts.to_le_bytes().to_vec();
                ts_bytes.append(&mut value);
                ts_bytes
            })
            .collect();

        // calculate length for every ts_value's bytes and put them together.
        // Note: the buffer holds `value_num` *u64* lengths, i.e.
        // `value_num * size_of::<u64>()` bytes — not `value_num` bytes.
        let mut value_length =
            Vec::with_capacity(value_num as usize * std::mem::size_of::<u64>());
        for bytes in &ts_values {
            value_length.extend_from_slice(&(bytes.len() as u64).to_le_bytes());
        }

        // concat all ts_values, lengths, and number of entries
        let mut concated_value = ts_values.concat();
        concated_value.append(&mut value_length);
        concated_value.extend_from_slice(&value_num.to_le_bytes());

        concated_value
    });

    const TIMESTAMP_SIZE: usize = std::mem::size_of::<Timestamp>();
    const U64_SIZE: usize = std::mem::size_of::<u64>();
    let decompress_fn: DecompressFn = Arc::new(|_key, raw_values| {
        let mut raw_values: VecDeque<u8> = raw_values.iter().cloned().collect();
        // `len` tracks the logical end of the not-yet-consumed prefix; it is
        // only decremented after a whole section is drained.
        let mut len = raw_values.len();

        // decode `N` (the trailing u64 entry count)
        let value_num_bytes: Vec<u8> = raw_values.drain(len - U64_SIZE..).collect();
        len -= U64_SIZE;
        let value_num = u64::from_le_bytes(value_num_bytes.try_into().unwrap()) as usize;

        // decode lengths. Draining from the back means `value_length` ends up
        // in reverse entry order (last entry's length first).
        let mut value_length = Vec::with_capacity(value_num);
        // start from 1
        for i in 1..=value_num {
            let length_bytes: Vec<u8> = raw_values.drain(len - i * U64_SIZE..).collect();
            let length = u64::from_le_bytes(length_bytes.try_into().unwrap()) as usize;
            value_length.push(length);
        }
        len -= U64_SIZE * value_num;

        // slice values back-to-front; push_front restores the original order.
        let mut values = VecDeque::with_capacity(value_num);
        for length in value_length {
            let mut ts_value_bytes: Vec<u8> = raw_values.drain(len - length..).collect();
            len -= length;
            let value_bytes = ts_value_bytes.drain(TIMESTAMP_SIZE..).collect();
            let timestamp = Timestamp::from_le_bytes(ts_value_bytes.try_into().unwrap());
            values.push_front((timestamp, value_bytes));
        }

        // convert VecDeque to Vec.
        // the decompress procedure is in reverse order.
        values.into()
    });

    UDCF::new("noop".to_string(), compress_fn, decompress_fn)
}

#[cfg(test)]
mod test {
    use super::*;

    #[test]
    fn noop_udcf_compress_decompress() {
        let udcf = noop_udcf();

        let key = b"key".to_vec();
        let ts_values = vec![
            (1, b"value1".to_vec()),
            (2, b"value2".to_vec()),
            (3, b"value3".to_vec()),
            (4, b"value1".to_vec()),
            (5, b"value3".to_vec()),
            (6, b"value2".to_vec()),
        ];

        let compressed = udcf.compress()(key.clone(), ts_values.clone());
        let decompressed = udcf.decompress()(key, &compressed);

        assert_eq!(ts_values, decompressed);
    }
}
--------------------------------------------------------------------------------
/src/index.rs:
--------------------------------------------------------------------------------
#[cfg(test)]
use std::collections::btree_map;
use std::collections::{BTreeMap, HashMap};
use std::ops::AddAssign;

use crate::error::Result;
use crate::types::{Bytes, Offset, TimeRange, Timestamp};

#[derive(Default, Debug)]
pub struct MemIndex {
    /// (timestamp, key) => value's position in rick file.
    pub index: BTreeMap<(Timestamp, Bytes), Offset>,
    /// Counting user key.
14 | pub user_keys: HashMap, 15 | } 16 | 17 | impl MemIndex { 18 | pub fn from_existing(index: BTreeMap<(Timestamp, Bytes), u64>) -> Self { 19 | let user_keys = HashMap::new(); 20 | let mut result = Self { 21 | index: BTreeMap::new(), 22 | user_keys, 23 | }; 24 | 25 | for (_, user_key) in index.keys() { 26 | result.update_user_key(user_key); 27 | } 28 | result.index = index; 29 | 30 | result 31 | } 32 | 33 | pub fn insert(&mut self, entry: (Timestamp, Bytes, u64)) -> Result<()> { 34 | let (timestamp, key, value) = entry; 35 | self.update_user_key(&key); 36 | self.index.insert((timestamp, key), value); 37 | 38 | Ok(()) 39 | } 40 | 41 | pub fn insert_entries(&mut self, entries: Vec<(Timestamp, Bytes, u64)>) -> Result<()> { 42 | for entry in entries { 43 | let (timestamp, key, value) = entry; 44 | self.update_user_key(&key); 45 | self.index.insert((timestamp, key), value); 46 | } 47 | 48 | Ok(()) 49 | } 50 | 51 | pub fn get(&self, time_key: &(Timestamp, Bytes)) -> Result> { 52 | Ok(self.index.get(time_key).copied()) 53 | } 54 | 55 | #[cfg(test)] 56 | pub fn into_iter(self) -> btree_map::IntoIter<(i64, std::vec::Vec), u64> { 57 | self.index.into_iter() 58 | } 59 | 60 | /// Get all existing user keys. 
61 | pub fn user_keys(&self) -> Vec { 62 | self.user_keys.keys().cloned().collect() 63 | } 64 | 65 | pub fn load_time_range(&self, range: TimeRange) -> Vec { 66 | let mut offsets = vec![]; 67 | for ((ts, _), offset) in &self.index { 68 | if range.contains(*ts) { 69 | offsets.push(*offset); 70 | } 71 | } 72 | 73 | offsets 74 | } 75 | 76 | pub fn purge_time_range(&mut self, range: TimeRange) { 77 | self.index.retain(|(ts, _), _| !range.contains(*ts)); 78 | } 79 | 80 | fn update_user_key(&mut self, user_key: &[u8]) { 81 | if !self.user_keys.contains_key(user_key) { 82 | self.user_keys.insert(user_key.to_vec(), 1); 83 | } else { 84 | self.user_keys.get_mut(user_key).unwrap().add_assign(1); 85 | } 86 | } 87 | } 88 | -------------------------------------------------------------------------------- /src/io.rs: -------------------------------------------------------------------------------- 1 | use std::os::unix::prelude::{AsRawFd, RawFd}; 2 | use std::path::Path; 3 | 4 | use glommio::io::{DmaFile, OpenOptions}; 5 | use glommio::ByteSliceMutExt; 6 | 7 | use crate::error::Result; 8 | use crate::types::Bytes; 9 | 10 | pub struct File(DmaFile); 11 | 12 | // todo: check these. required by async trait `Iterator` 13 | unsafe impl Send for File {} 14 | unsafe impl Sync for File {} 15 | 16 | impl File { 17 | /// Open or create on given path. 
18 | #[inline] 19 | pub async fn open>(path: P) -> Result { 20 | let file = OpenOptions::new() 21 | .create(true) 22 | .read(true) 23 | .write(true) 24 | .dma_open(path) 25 | .await?; 26 | 27 | Ok(File(file)) 28 | } 29 | 30 | #[inline] 31 | pub async fn read(&self, offset: u64, size: u64) -> Result { 32 | let read_result = self.0.read_at(offset, size as usize).await?; 33 | 34 | // todo: remove this copy 35 | Ok(read_result.to_vec()) 36 | } 37 | 38 | #[inline] 39 | pub async fn write(&self, bytes: Bytes, offset: u64) -> Result<()> { 40 | let mut buf = self.0.alloc_dma_buffer(bytes.len()); 41 | buf.as_bytes_mut().write_at(0, &bytes); 42 | 43 | self.0.write_at(buf, offset).await?; 44 | 45 | Ok(()) 46 | } 47 | 48 | #[inline] 49 | pub async fn sync(&self) -> Result<()> { 50 | self.0.fdatasync().await?; 51 | 52 | Ok(()) 53 | } 54 | 55 | #[inline] 56 | pub async fn size(&self) -> Result { 57 | Ok(self.0.file_size().await?) 58 | } 59 | 60 | /// Synchronous operation. 61 | #[inline] 62 | pub async fn truncate(&self, size: u64) -> Result<()> { 63 | self.0.truncate(size).await?; 64 | 65 | Ok(()) 66 | } 67 | 68 | #[inline] 69 | pub async fn close(self) -> Result<()> { 70 | self.0.close().await?; 71 | 72 | Ok(()) 73 | } 74 | } 75 | 76 | impl AsRawFd for File { 77 | fn as_raw_fd(&self) -> RawFd { 78 | self.0.as_raw_fd() 79 | } 80 | } 81 | -------------------------------------------------------------------------------- /src/io_worker.rs: -------------------------------------------------------------------------------- 1 | use std::cmp::Ordering; 2 | use std::rc::Rc; 3 | use std::sync::Arc; 4 | 5 | use glommio::channels::channel_mesh::{ 6 | Receivers as ChannelMeshReceiver, 7 | Senders as ChannelMeshSender, 8 | }; 9 | use glommio::sync::Gate; 10 | use glommio::{Latency, Shares}; 11 | use tokio::sync::mpsc::{Receiver, Sender}; 12 | use tokio::sync::oneshot::Sender as Notifier; 13 | use tokio::sync::Mutex; 14 | use tracing::trace; 15 | 16 | use 
crate::compact_sched::{CompactScheduler, QueueUpCompSched};
use crate::context::Context;
use crate::error::Result;
use crate::level::{Levels, TimestampReviewer};
use crate::option::{Options, ReadOption};
use crate::types::{Bytes, Entry, LevelInfo, ThreadId, TimeRange, Timestamp};
use crate::TimestampAction;

thread_local!(
    // todo: the api of glommio::Gate seems not very suit for our use case.
    // Expecting for a more ergonomic way to register critical task.
    // It is essentially a counter.
    /// A TLS variable for graceful shutdown.
    ///
    /// It will wait until all tasks spawned via it are finished when closing.
    pub static GATE: Rc<Gate> = Rc::new(Gate::new())
);

/// A un-Send handle to accept and process requests.
pub struct IOWorker {
    tid: ThreadId,
    levels: Rc<Levels>,
    // todo: maybe add channel mesh for scan
}

impl IOWorker {
    /// Build a worker for shard `tid`: create the compact task queue and
    /// scheduler, then the `Levels` storage that backs this shard.
    pub async fn try_new(
        tid: ThreadId,
        opts: Options,
        timestamp_reviewer: Arc<Mutex<Box<dyn TimestampReviewer>>>,
        level_info: Arc<Mutex<LevelInfo>>,
        ctx: Arc<Context>,
        ts_action_sender: ChannelMeshSender<TimestampAction>,
    ) -> Result<Self> {
        let compact_task_queue = glommio::executor().create_task_queue(
            Shares::default(),
            Latency::NotImportant,
            "compact_tq",
        );
        // Safety:
        // this is initialized below.
        let sched = unsafe {
            QueueUpCompSched::new_zeroed(opts.compact_prompt_interval, 2, compact_task_queue)
        };

        let levels = Levels::try_new(
            tid,
            opts.clone(),
            timestamp_reviewer,
            ctx,
            ts_action_sender,
            level_info,
            sched.clone(),
        )
        .await?;

        sched.clone().init(levels.clone());
        sched.install(compact_task_queue)?;

        Ok(Self { tid, levels })
    }

    /// Won't return until shut down.
    pub async fn run(
        self,
        mut rx: Receiver<Task>,
        mut ts_action_receiver: ChannelMeshReceiver<TimestampAction>,
    ) {
        let connected_receivers: Vec<_> = ts_action_receiver
            .streams()
            .into_iter()
            .map(|(_, rx)| rx)
            .collect();

        // One detached task per peer: forward timestamp actions into `Levels`.
        for rx in connected_receivers {
            let levels = self.levels.clone();
            let tid = self.tid;
            glommio::spawn_local(async move {
                while let Some(action) = rx.recv().await {
                    trace!("{} received action {:?}", tid, action);
                    let _ = levels.handle_actions(vec![action]).await;
                }
            })
            .detach();
        }

        // the `Error` case of `Gate::spawn()` is glommio runtime cannot find given task
        // queue which needn't to take into consideration since we don't specify
        // task queue.
        while let Some(task) = rx.recv().await {
            match task {
                Task::Put(entries, tx) => {
                    let levels = self.levels.clone();
                    GATE.with(|gate| {
                        gate.spawn(async move {
                            levels.put(entries, tx).await;
                        })
                        .unwrap()
                        .detach()
                    });
                }
                Task::Get(ts, key, tx, opt) => {
                    let levels = self.levels.clone();
                    GATE.with(|gate| {
                        gate.spawn(async move {
                            let result = levels.get(&(ts, key), opt).await;
                            let _ = tx.send(result);
                        })
                        .unwrap()
                        .detach()
                    });
                }
                Task::Scan(time_range, key_start, key_end, sender, cmp) => {
                    let levels = self.levels.clone();
                    GATE.with(|gate| {
                        gate.spawn(async move {
                            let _ = levels
                                .scan(time_range, key_start, key_end, sender, cmp)
                                .await;
                        })
                        .unwrap()
                        .detach()
                    });
                }
                Task::Shutdown => {
                    trace!("going to close shard {}", self.tid);

                    // Wait for all in-flight gate tasks before leaving.
                    let gate = GATE.with(|gate| gate.clone());
                    let _ = gate.close().await;

                    trace!("shard {} is closed", self.tid);
                    break;
                }
            }
        }
    }
}

pub enum Task {
    // todo: add put option
    Put(Vec<Entry>, Notifier<Result<()>>),
    Get(
        Timestamp,
        Bytes,
        Notifier<Result<Option<Entry>>>,
        ReadOption,
    ),
    /// time range, start key, end key, result sender, comparator
    Scan(
        TimeRange,
        Bytes,
        Bytes,
        Sender<Vec<Entry>>,
        Arc<dyn Fn(&Bytes, &Bytes) -> Ordering + Send + Sync>,
    ),
    Shutdown,
}

impl std::fmt::Debug for Task {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        // Report the variant so traces are actually useful. Payloads are
        // omitted: notifiers and comparators are not `Debug`, and entry
        // batches can be large.
        let variant = match self {
            Task::Put(..) => "Put",
            Task::Get(..) => "Get",
            Task::Scan(..) => "Scan",
            Task::Shutdown => "Shutdown",
        };
        f.debug_struct("Task").field("variant", &variant).finish()
    }
}

#[cfg(test)]
mod test {
    use futures_util::future::select_all;
    use glommio::channels::channel_mesh::{Full, MeshBuilder};
    use glommio::LocalExecutorBuilder;

    // todo: investigate this. The receiver will receive lots of message from "peer
    // 0" without any sender.
    #[test]
    #[ignore]
    fn channel_mesh_select_recv_loop() {
        let mesh_builder: MeshBuilder<(), Full> = MeshBuilder::full(8, 2);

        for _ in 0..7 {
            let mesh_builder = mesh_builder.clone();
            LocalExecutorBuilder::new(glommio::Placement::Unbound)
                .spawn(move || async move {
                    let (_, mut rx) = mesh_builder.join().await.unwrap();

                    let connected_receivers: Vec<_> =
                        rx.streams().into_iter().map(|(_, rx)| rx).collect();

                    loop {
                        let recvs = connected_receivers
                            .iter()
                            .map(|rx| {
                                let fut = rx.recv();
                                Box::pin(fut)
                            })
                            .collect::<Vec<_>>();

                        let action = select_all(recvs).await;
                        let (_, index, _) = action;
                        println!("{} received action from {}", rx.peer_id(), index);
                    }
                })
                .unwrap();
        }

        LocalExecutorBuilder::new(glommio::Placement::Unbound)
            .spawn(move || async move {
                let (tx, _) = mesh_builder.join().await.unwrap();

                for _ in 0..4 {
                    for peer in 0..7 {
                        tx.send_to(peer, ()).await.unwrap();
                    }
                    println!("finished once")
                }
            })
            .unwrap()
232 | .join() 233 | .unwrap(); 234 | } 235 | } 236 | -------------------------------------------------------------------------------- /src/iterator.rs: -------------------------------------------------------------------------------- 1 | use std::collections::BinaryHeap; 2 | 3 | use async_trait::async_trait; 4 | use tokio::sync::mpsc::Receiver; 5 | 6 | use crate::error::Result; 7 | use crate::types::Entry; 8 | use crate::util::{Comparator, KeyExtractor, OrderingHelper}; 9 | 10 | // todo: add type param 11 | #[async_trait] 12 | pub trait Iterator { 13 | async fn next(&mut self) -> Result>; 14 | 15 | fn is_valid(&self) -> bool; 16 | } 17 | 18 | /// Iterate over timestamp. i.e, (ts 0, key 1) -> (ts 1, key 1) -> (ts 2, key 19 | /// 1)... 20 | /// 21 | /// "Scan" is achieved via (lots of) `get()` 22 | pub struct TimeIterator { 23 | inner: ShardMuxTimeIterator, 24 | buf: Vec, 25 | } 26 | 27 | impl TimeIterator { 28 | pub(crate) fn new(mux_iter: ShardMuxTimeIterator) -> Self { 29 | Self { 30 | inner: mux_iter, 31 | buf: vec![], 32 | } 33 | } 34 | 35 | pub(crate) async fn next(&mut self) -> Result> { 36 | if self.buf.is_empty() { 37 | self.buf = ok_unwrap!(self.inner.next().await); 38 | } 39 | 40 | Ok(self.buf.pop()) 41 | } 42 | 43 | /// Valid when inner iterator is valid or its own buffer still contains 44 | /// things. 45 | pub(crate) fn is_valid(&self) -> bool { 46 | self.inner.is_valid() || !self.buf.is_empty() 47 | } 48 | } 49 | 50 | #[async_trait] 51 | impl Iterator for TimeIterator { 52 | async fn next(&mut self) -> Result> { 53 | self.next().await 54 | } 55 | 56 | fn is_valid(&self) -> bool { 57 | self.is_valid() 58 | } 59 | } 60 | 61 | pub(crate) struct ShardTimeIterator { 62 | ready: Option>, 63 | source: Receiver>, 64 | is_finished: bool, 65 | } 66 | 67 | impl ShardTimeIterator { 68 | // will wait source to yield the first element. 
69 | pub(crate) async fn new(mut source: Receiver>) -> Self { 70 | let ready = source.recv().await; 71 | let is_finished = ready.is_none(); 72 | 73 | Self { 74 | ready, 75 | source, 76 | is_finished, 77 | } 78 | } 79 | 80 | // todo: maybe add a trait `PeekableIterator` : `Iterator` 81 | pub fn peek(&self) -> Option<&Vec> { 82 | self.ready.as_ref() 83 | } 84 | 85 | /// Take current value but not step iterator after that. 86 | pub async fn take(&mut self) -> Result>> { 87 | let ready = self.ready.take(); 88 | self.step().await?; 89 | 90 | Ok(ready) 91 | } 92 | 93 | async fn step(&mut self) -> Result<()> { 94 | if self.is_finished { 95 | return Ok(()); 96 | } 97 | 98 | match self.source.recv().await { 99 | Some(item) => self.ready = Some(item), 100 | None => self.is_finished = true, 101 | } 102 | 103 | Ok(()) 104 | } 105 | 106 | pub fn is_valid(&self) -> bool { 107 | !self.is_finished 108 | } 109 | } 110 | 111 | pub struct ShardMuxTimeIterator { 112 | iters: Vec, 113 | entry_buf: BinaryHeap>>, 114 | } 115 | 116 | impl ShardMuxTimeIterator { 117 | pub(crate) async fn new(iters: Vec, buf_size: usize) -> Self { 118 | let mut s = Self { 119 | iters, 120 | entry_buf: BinaryHeap::default(), 121 | }; 122 | s.init(buf_size).await; 123 | 124 | s 125 | } 126 | 127 | async fn next(&mut self) -> Option> { 128 | if self.entry_buf.is_empty() { 129 | return None; 130 | } 131 | 132 | let next = self.entry_buf.pop().unwrap().data; 133 | // todo: check this Result 134 | let _ = self.consume_one().await; 135 | 136 | Some(next) 137 | } 138 | 139 | /// Valid when underlying iters aren't all consumed or `entry_buf` still 140 | /// buffers some entries. 
141 | fn is_valid(&self) -> bool { 142 | !self.iters.is_empty() || !self.entry_buf.is_empty() 143 | } 144 | 145 | async fn init(&mut self, buf_size: usize) { 146 | // sort underlying iterators 147 | self.purge_finished(); 148 | self.iters.sort_by(|lhs, rhs| { 149 | C::cmp( 150 | Vec::::key(lhs.peek().unwrap()), 151 | Vec::::key(rhs.peek().unwrap()), 152 | ) 153 | }); 154 | 155 | // fill `entry_buf` 156 | while !self.iters.is_empty() && self.entry_buf.len() < buf_size { 157 | // todo: check this Result 158 | let _ = self.consume_one().await; 159 | } 160 | } 161 | 162 | /// Remove and deconstruct finished iterator to release source. 163 | fn purge_finished(&mut self) { 164 | self.iters.retain(|iter| iter.is_valid()) 165 | } 166 | 167 | /// Get one element from underlying iterators and put it into `entry_buf`. 168 | /// Then step the iterator which supplies that element and reordering 169 | /// the iterator list to keep them ordered. 170 | async fn consume_one(&mut self) -> Result<()> { 171 | if self.iters.is_empty() { 172 | return Ok(()); 173 | } 174 | 175 | // consume 176 | let mut first_iter = self.iters.pop().unwrap(); 177 | let item = first_iter.take().await?.unwrap(); 178 | self.entry_buf.push(item.into()); 179 | // this iterator is finished 180 | if !first_iter.is_valid() { 181 | return Ok(()); 182 | } 183 | 184 | // insert popped iterator to ordered position 185 | let new_entry = first_iter.peek().unwrap(); 186 | let lhs = Vec::::key(new_entry); 187 | let index = self 188 | .iters 189 | .binary_search_by(|iter| C::cmp(lhs, Vec::::key(iter.peek().unwrap()))) 190 | .unwrap_or_else(|x| x); 191 | self.iters.insert(index, first_iter); 192 | 193 | Ok(()) 194 | } 195 | } 196 | -------------------------------------------------------------------------------- /src/level.rs: -------------------------------------------------------------------------------- 1 | use std::cell::RefCell; 2 | use std::cmp::Ordering; 3 | use std::collections::HashMap; 4 | use std::rc::Rc; 5 | 
use std::sync::Arc; 6 | use std::time::Duration; 7 | 8 | use glommio::channels::channel_mesh::Senders as ChannelMeshSender; 9 | use glommio::sync::RwLock; 10 | use glommio::timer::TimerActionOnce; 11 | use tokio::sync::mpsc::Sender as BoundedSender; 12 | use tokio::sync::oneshot::Sender; 13 | use tokio::sync::Mutex; 14 | use tracing::{debug, instrument, trace}; 15 | 16 | use crate::cache::{Cache, KeyCacheEntry, KeyCacheResult}; 17 | use crate::compact_sched::CompactScheduler; 18 | use crate::context::Context; 19 | use crate::error::{HelixError, Result}; 20 | use crate::file::{FileNo, IndexBlockBuilder, Rick, SSTable, TableBuilder}; 21 | use crate::index::MemIndex; 22 | use crate::io_worker; 23 | use crate::option::{Options, ReadOption}; 24 | use crate::types::{Bytes, Entry, LevelId, LevelInfo, ThreadId, TimeRange, Timestamp, ValueFormat}; 25 | 26 | pub struct LevelConfig { 27 | /// Use one file to store non-Rick (SSTable) entries or not. 28 | pub sharding_sstable: bool, 29 | /// Max levels can hold. This option should be greater than 0. 30 | /// Levels will be L0 to L`max_level` (inclusive). 31 | /// Might be useless due to TimeStamp Reviewer? 32 | pub max_level: usize, 33 | /// The max difference of timestamps inside one level. 34 | /// Might be useless due to TimeStamp Reviewer? 35 | pub level_duration: u64, 36 | } 37 | 38 | /// APIs require unique reference (&mut self) because this `Level` is designed 39 | /// to be used inside one thread (!Send). The fields should also be !Send if 40 | /// possible. 
41 | pub(crate) struct Levels { 42 | tid: ThreadId, 43 | // todo: remove this mutex 44 | timestamp_reviewer: Arc>>, 45 | ctx: Arc, 46 | memindex: Mutex, 47 | // todo: use group of ricks to achieve log-rotate/GC 48 | rick: Mutex, 49 | level_info: Arc>, 50 | cache: Cache, 51 | write_batch: Rc, 52 | ts_action_sender: ChannelMeshSender, 53 | compact_sched: Rc, 54 | } 55 | 56 | impl Levels { 57 | pub async fn try_new( 58 | tid: ThreadId, 59 | opts: Options, 60 | timestamp_reviewer: Arc>>, 61 | ctx: Arc, 62 | ts_action_sender: ChannelMeshSender, 63 | level_info: Arc>, 64 | compact_sched: Rc, 65 | ) -> Result> { 66 | // todo: remove the default rick. the number in `FileNo::Rick` shouldn't be 0. 67 | let rick_file = ctx.file_manager.open(tid, FileNo::Rick(0)).await.unwrap(); 68 | let rick = Rick::open(rick_file, Some(ValueFormat::RawValue)).await?; 69 | let memindex = rick.construct_index().await?; 70 | 71 | let cache = Cache::with_config(opts.cache); 72 | let write_batch = WriteBatch::with_config(opts.write_batch); 73 | 74 | let levels = Self { 75 | tid, 76 | timestamp_reviewer, 77 | ctx, 78 | memindex: Mutex::new(memindex), 79 | rick: Mutex::new(rick), 80 | level_info, 81 | cache, 82 | write_batch: Rc::new(write_batch), 83 | ts_action_sender, 84 | compact_sched, 85 | }; 86 | 87 | let levels = Rc::new(levels); 88 | 89 | Ok(levels) 90 | } 91 | 92 | pub async fn put(self: Rc, entries: Vec, notifier: Sender>) { 93 | self.write_batch 94 | .clone() 95 | .enqueue(entries, notifier, self.clone()) 96 | .await; 97 | } 98 | 99 | /// Put entries without batching them. 
100 | pub async fn put_internal(&self, entries: Vec) -> Result<()> { 101 | if entries.is_empty() { 102 | return Ok(()); 103 | } 104 | 105 | let max_timestamp = entries 106 | .iter() 107 | .max_by_key(|entry| entry.timestamp) 108 | .unwrap() 109 | .timestamp; 110 | 111 | let indices = self.rick.lock().await.append(entries).await?; 112 | self.memindex.lock().await.insert_entries(indices)?; 113 | 114 | // review timestamp and handle actions. 115 | let review_actions = self.timestamp_reviewer.lock().await.observe(max_timestamp); 116 | self.handle_actions(review_actions.clone()).await?; 117 | 118 | glommio::yield_if_needed().await; 119 | 120 | Ok(()) 121 | } 122 | 123 | pub async fn get( 124 | &self, 125 | time_key: &(Timestamp, Bytes), 126 | opt: ReadOption, 127 | ) -> Result> { 128 | let level = self.level_info.lock().await.get_level_id(time_key.0); 129 | match level { 130 | None => Ok(None), 131 | Some(0) => self.get_from_rick(time_key).await, 132 | Some(l) => self.get_from_table(time_key, l, opt).await, 133 | } 134 | } 135 | 136 | // todo: handle multi level scan 137 | pub async fn scan( 138 | &self, 139 | time_range: TimeRange, 140 | key_start: Bytes, 141 | key_end: Bytes, 142 | sender: BoundedSender>, 143 | cmp: Arc Ordering>, 144 | ) -> Result<()> { 145 | let mut user_keys = self.memindex.lock().await.user_keys(); 146 | // filter 147 | user_keys.retain(|key| { 148 | cmp(key, &key_start) != Ordering::Less && cmp(key, &key_end) != Ordering::Greater 149 | }); 150 | // sort 151 | user_keys.sort_by(|lhs, rhs| cmp(lhs, rhs)); 152 | 153 | // todo: refine this 154 | for user_key in user_keys { 155 | let mut time_key = (0, user_key); 156 | for ts in time_range.range() { 157 | time_key.0 = ts; 158 | if let Some(entry) = self.get(&time_key, ReadOption::default()).await? 
{ 159 | sender.send(vec![entry]).await?; 160 | } 161 | } 162 | } 163 | 164 | Ok(()) 165 | } 166 | 167 | #[inline] 168 | async fn get_from_rick(&self, time_key: &(Timestamp, Bytes)) -> Result> { 169 | if let Some(offset) = self.memindex.lock().await.get(time_key)? { 170 | let entry = self.rick.lock().await.read(offset).await?; 171 | 172 | return Ok(Some(entry)); 173 | } 174 | 175 | Ok(None) 176 | } 177 | 178 | // todo: refine, split 179 | #[inline] 180 | async fn get_from_table( 181 | &self, 182 | time_key: &(Timestamp, Bytes), 183 | level_id: LevelId, 184 | opt: ReadOption, 185 | ) -> Result> { 186 | let mut key_cache_entry = KeyCacheEntry::new(time_key); 187 | 188 | let cache_result = self.cache.get_key(time_key); 189 | trace!("cache result of {:?} : {:?}", time_key, cache_result); 190 | match cache_result { 191 | KeyCacheResult::Value(value) => Ok(Some(Entry { 192 | timestamp: time_key.0, 193 | key: time_key.1.to_owned(), 194 | value, 195 | })), 196 | KeyCacheResult::Compressed(compressed) => { 197 | let value = 198 | ok_unwrap!(self.decompress_and_find(time_key, &compressed, opt.decompress)?); 199 | 200 | key_cache_entry.value = Some(&value); 201 | key_cache_entry.compressed = Some(&compressed); 202 | self.cache.put_key(key_cache_entry); 203 | 204 | Ok(Some(Entry { 205 | timestamp: time_key.0, 206 | key: time_key.1.clone(), 207 | value: value.clone(), 208 | })) 209 | } 210 | KeyCacheResult::Position(tid, level_id, offset) => { 211 | let rick_file = self 212 | .ctx 213 | .file_manager 214 | .open(tid, FileNo::Rick(level_id)) 215 | .await?; 216 | let rick = Rick::open(rick_file, None).await?; 217 | let raw_bytes = rick.read(offset as u64).await?; 218 | 219 | let value = ok_unwrap!(self.decompress_and_find( 220 | time_key, 221 | &raw_bytes.value, 222 | opt.decompress 223 | )?); 224 | 225 | key_cache_entry.value = Some(&value); 226 | key_cache_entry.compressed = Some(&raw_bytes.value); 227 | self.cache.put_key(key_cache_entry); 228 | 229 | Ok(Some(Entry { 230 | 
timestamp: time_key.0, 231 | key: time_key.1.clone(), 232 | value: value.clone(), 233 | })) 234 | } 235 | KeyCacheResult::NotFound => { 236 | let handle = if let Some(handle) = 237 | self.cache.get_table_handle(&(self.tid, level_id).into()) 238 | { 239 | handle 240 | } else { 241 | let table_file = self 242 | .ctx 243 | .file_manager 244 | .open(self.tid, FileNo::SSTable(level_id)) 245 | .await?; 246 | // table file is empty, means this level haven't finished it compaction. Need to 247 | // read value from L0 rick. 248 | // But this check (via file's size) is not good. the write operation may not 249 | // guarantee to be atomic. todo: add a flag to indicate 250 | // whether a compact is finished. 251 | if table_file.size().await? == 0 { 252 | return self.get_from_rick(time_key).await; 253 | } 254 | let handle = SSTable::open(table_file) 255 | .await? 256 | .into_read_handle(self.ctx.clone()) 257 | .await?; 258 | 259 | let handle = Rc::new(handle); 260 | self.cache 261 | .put_table_handle((self.tid, level_id).into(), handle.clone()) 262 | .await?; 263 | handle 264 | }; 265 | 266 | let entry = handle.get(time_key).await?; 267 | let is_compressed = handle.is_compressed(); 268 | // update cache 269 | if let Some(mut entry) = entry { 270 | if is_compressed { 271 | let value = ok_unwrap!(self.decompress_and_find( 272 | time_key, 273 | &entry.value, 274 | opt.decompress, 275 | )?); 276 | key_cache_entry.compressed = Some(&entry.value); 277 | self.cache.put_key(key_cache_entry); 278 | entry.timestamp = time_key.0; 279 | entry.value = value; 280 | } else { 281 | key_cache_entry.value = Some(&entry.value); 282 | self.cache.put_key(key_cache_entry); 283 | } 284 | Ok(Some(entry)) 285 | } else { 286 | Ok(None) 287 | } 288 | } 289 | } 290 | } 291 | 292 | /// Propagate action to other peers. 
293 | async fn propagate_action(&self, action: TimestampAction) -> Result<()> { 294 | for consumer_id in 0..self.ts_action_sender.nr_consumers() { 295 | if consumer_id != self.ts_action_sender.peer_id() { 296 | self.ts_action_sender 297 | .send_to(consumer_id, action) 298 | .await 299 | // todo: check this unwrap 300 | .unwrap(); 301 | } 302 | } 303 | 304 | Ok(()) 305 | } 306 | 307 | pub(crate) async fn handle_actions(&self, actions: Vec) -> Result<()> { 308 | for action in actions { 309 | debug!("tid: {}, action: {:?}", self.tid, action); 310 | match action { 311 | TimestampAction::Compact(start_ts, end_ts, level_id) => { 312 | let level_id = match level_id { 313 | Some(id) => id, 314 | None => { 315 | // fetch new level id and update level info 316 | let level_id = self 317 | .level_info 318 | .lock() 319 | .await 320 | .add_level(start_ts, end_ts, &self.ctx.file_manager) 321 | .await?; 322 | // propagate 323 | self.propagate_action(TimestampAction::Compact( 324 | start_ts, 325 | end_ts, 326 | Some(level_id), 327 | )) 328 | .await?; 329 | 330 | level_id 331 | } 332 | }; 333 | self.compact(TimeRange::from((start_ts, end_ts)), level_id) 334 | .await?; 335 | // todo: enable this 336 | // self.compact_sched.enqueue(level_id); 337 | } 338 | TimestampAction::Outdate(_) => { 339 | self.propagate_action(action).await?; 340 | self.outdate().await? 341 | } 342 | } 343 | } 344 | 345 | Ok(()) 346 | } 347 | 348 | /// Compact entries from rick in [start_ts, end_ts] to next level. 349 | /// 350 | /// This function is wrapped by `Gate` which means HelixCore will wait it to 351 | /// finish before close and shutdown. Whereas compactions that are invoked 352 | /// after the gate is closing or closed will be ignored. 353 | /// 354 | /// todo: how to handle rick file is not fully covered by given time range?. 355 | #[instrument] 356 | pub(crate) async fn compact(&self, range: TimeRange, level_id: LevelId) -> Result<()> { 357 | // Keep the gate open until compact finished. 
The question mark (try) indicates 358 | // a early return once it's failed to spawn to the gate. 359 | let (tx, rx) = glommio::channels::local_channel::new_bounded(1); 360 | io_worker::GATE 361 | .with(|gate| gate.spawn(async move { rx.recv().await }))? 362 | .detach(); 363 | debug!( 364 | "[compact] start compact. range {:?}, level {}", 365 | range, level_id 366 | ); 367 | 368 | let mut table_builder = TableBuilder::begin( 369 | self.tid, 370 | level_id, 371 | self.ctx 372 | .file_manager 373 | .open(self.tid, FileNo::SSTable(level_id)) 374 | .await?, 375 | ); 376 | 377 | // make entry_map (from memindex) and purge 378 | let memindex = self.memindex.lock().await; 379 | let offsets = memindex.load_time_range(range); 380 | drop(memindex); 381 | let mut rick = self.rick.lock().await; 382 | let entries = rick.reads(offsets).await?; 383 | let offset_end = rick.get_legal_offset_end(); 384 | drop(rick); 385 | trace!("[compact] level {}, rick reads", level_id); 386 | 387 | let mut entry_map = HashMap::new(); 388 | for entry in entries { 389 | let Entry { 390 | timestamp, 391 | key, 392 | value, 393 | } = entry; 394 | 395 | let pair_list: &mut Vec<_> = entry_map.entry(key).or_default(); 396 | pair_list.push((timestamp, value)); 397 | } 398 | 399 | trace!("[compact] level {}, make entry map", level_id); 400 | 401 | // prepare output files. 402 | let mut index_bb = IndexBlockBuilder::new(); 403 | let rick_file = self 404 | .ctx 405 | .file_manager 406 | .open(self.tid, FileNo::Rick(level_id)) 407 | .await?; 408 | let mut rick = Rick::open(rick_file, Some(ValueFormat::CompressedValue)).await?; 409 | rick.set_align_ts(range.start()).await?; 410 | 411 | // call compress_fn to compact points, build rick file and index block. 
412 | for (key, ts_value) in entry_map { 413 | debug_assert!(!ts_value.is_empty()); 414 | let first_ts = ts_value[0].0; 415 | 416 | let compressed_data = self 417 | .ctx 418 | .fn_registry 419 | .compress_entries(key.clone(), ts_value)?; 420 | 421 | // todo: add rick builder 422 | let mut position = rick 423 | .append(vec![Entry { 424 | timestamp: first_ts, 425 | key, 426 | value: compressed_data, 427 | }]) 428 | .await?; 429 | let (timestamp, key, offset) = position.pop().unwrap(); 430 | index_bb.add_entry(&key, timestamp, offset); 431 | } 432 | 433 | trace!("[compact] level {}, build rick", level_id); 434 | 435 | // make sstable 436 | // table_builder.add_entries(keys, value_positions); 437 | table_builder.add_block(index_bb); 438 | table_builder.finish().await?; 439 | 440 | trace!("[compact] level {}, build table", level_id); 441 | 442 | // todo: gc rick 443 | // self.rick.lock().await.clean().await?; 444 | // todo: gc memindex 445 | // self.memindex.lock().await.purge_time_range(range); 446 | let mut memindex = self.memindex.lock().await; 447 | memindex.purge_time_range(range); 448 | drop(memindex); 449 | trace!("[compact] level {}, purge memindex", level_id); 450 | let mut rick = self.rick.lock().await; 451 | rick.push_legal_offset_start(offset_end).await?; 452 | drop(rick); 453 | trace!("[compact] level {}, clean rick", level_id); 454 | 455 | debug!("compact {} finish", level_id); 456 | let _ = tx.send(()).await; 457 | 458 | Ok(()) 459 | } 460 | 461 | /// Perform compaction on the given level id. 462 | /// 463 | /// This procedure assume the level going to compact is inactive, which has 464 | /// the level id assigned, has corresponding rick file, and may serving read 465 | /// requests. 466 | /// 467 | /// It's not this procedure's response to switch active level. And it also 468 | /// has nothing to do with memindex. 
469 | #[instrument] 470 | pub(crate) async fn compact_level(&self, level_id: LevelId) -> Result<()> { 471 | self.compact_sched.finished(level_id); 472 | 473 | Ok(()) 474 | } 475 | 476 | async fn outdate(&self) -> Result<()> { 477 | self.level_info 478 | .lock() 479 | .await 480 | .remove_last_level(&self.ctx.file_manager) 481 | .await?; 482 | 483 | todo!() 484 | } 485 | 486 | fn decompress_and_find( 487 | &self, 488 | time_key: &(Timestamp, Bytes), 489 | raw_bytes: &[u8], 490 | decompress: bool, 491 | ) -> Result> { 492 | if !decompress { 493 | return Ok(Some(raw_bytes.to_owned())); 494 | } 495 | 496 | let mut entries = self 497 | .ctx 498 | .fn_registry 499 | .decompress_entries(&time_key.1, raw_bytes)?; 500 | 501 | // todo: move this logic to UDCF 502 | entries.sort_by_key(|e| e.0); 503 | let index = ok_unwrap!(entries 504 | .binary_search_by_key(&time_key.0, |(ts, _)| *ts) 505 | .ok()); 506 | let (_, value) = &entries[index]; 507 | 508 | Ok(Some(value.clone())) 509 | } 510 | } 511 | 512 | impl std::fmt::Debug for Levels { 513 | fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { 514 | f.debug_struct("Levels") 515 | .field("thread id", &self.tid) 516 | .finish() 517 | } 518 | } 519 | 520 | /// "Timestamp" in `HelixDB` is a logical concept. It is not bound with the real 521 | /// time. [TimestampReviewer] defines how timestamp should be considered. 522 | /// Including when to do a compaction, when to outdate a part of data etc. 523 | pub trait TimestampReviewer: Send + Sync { 524 | fn observe(&mut self, timestamp: Timestamp) -> Vec; 525 | } 526 | 527 | /// Actions given by [TimestampReviewer]. 528 | #[derive(Debug, PartialEq, Eq, Clone, Copy)] 529 | pub enum TimestampAction { 530 | /// Compact data between two timestamps (both inclusive). 531 | /// The third parameter is the id of new level. This field is filled by the 532 | /// peer who observed this original "compact action" (sent by 533 | /// `TimestampReviewer`). 
534 | Compact(Timestamp, Timestamp, Option), 535 | /// Outdate data which timestamp is smaller than given. 536 | Outdate(Timestamp), 537 | } 538 | 539 | /// A simple timestamp review implementation. It has two config entries 540 | /// `rick_range` and `outdate_range`. `rick_range` defines the range of 541 | /// rick and sstable files. `outdate_range` defines how much data should 542 | /// be kept. `outdate_range` should be integer times of `rick_range` even 543 | /// if it is unchecked. 544 | /// 545 | /// This implementation is not bound with real world time. It assume the 546 | /// timestamp comes from `observe()` call is the newest. And just triggers 547 | /// compaction and outdate only based on this. In real scenario 548 | /// when timestamp has more meaning or restriction, more complex logic can 549 | /// be achieved. 550 | pub struct SimpleTimestampReviewer { 551 | // config part 552 | rick_range: Timestamp, 553 | outdate_range: Timestamp, 554 | 555 | // status part 556 | last_compacted: Timestamp, 557 | last_outdated: Timestamp, 558 | } 559 | 560 | impl SimpleTimestampReviewer { 561 | pub fn new(rick_range: Timestamp, outdate_range: Timestamp) -> Self { 562 | Self { 563 | rick_range, 564 | outdate_range, 565 | last_compacted: 0, 566 | last_outdated: 0, 567 | } 568 | } 569 | } 570 | 571 | impl TimestampReviewer for SimpleTimestampReviewer { 572 | fn observe(&mut self, timestamp: Timestamp) -> Vec { 573 | let mut actions = vec![]; 574 | if timestamp - self.last_compacted + 1 >= self.rick_range { 575 | actions.push(TimestampAction::Compact( 576 | self.last_compacted, 577 | timestamp, 578 | None, 579 | )); 580 | self.last_compacted = timestamp + 1; 581 | } 582 | if timestamp - self.last_outdated + 1 >= self.outdate_range { 583 | actions.push(TimestampAction::Outdate( 584 | self.last_outdated + self.rick_range - 1, 585 | )); 586 | self.last_outdated += self.rick_range; 587 | } 588 | 589 | actions 590 | } 591 | } 592 | 593 | #[derive(Debug, Clone, Copy)] 594 | 
pub struct WriteBatchConfig {
    /// The maximum number of entries that can be held in one batch.
    ///
    /// A batch is consumed as soon as the buffered entry count reaches this
    /// limit; the default of `0` therefore makes every enqueue flush
    /// immediately (batching effectively disabled).
    pub batch_size: usize,
    /// The longest time duration between two batch consumptions.
    pub timeout: Duration,
}

impl Default for WriteBatchConfig {
    /// Batching is disabled by default: a zero `batch_size` makes every
    /// `enqueue` consume the buffer right away.
    fn default() -> Self {
        Self {
            batch_size: 0,
            timeout: Duration::from_millis(0),
        }
    }
}

/// Batching write request
struct WriteBatch {
    /// Reply channels of the requests currently buffered.
    notifier: RefCell<Vec<Sender<Result<()>>>>,
    /// Entries waiting to be written as one batch.
    buf: RefCell<Vec<Entry>>,
    timeout: Duration,
    batch_size: usize,
    /// Lock on two vectors `notifier` and `buf`.
    lock: Mutex<()>,
    /// Generated by `TimerActionOnce::do_in()` with the purpose of
    /// consuming batched entries after some duration.
    action: RwLock<Option<TimerActionOnce<()>>>,
    // level: Rc<Levels>,
}

// Implementing the `Default` trait (instead of an inherent `fn default()`)
// is the idiomatic form; `WriteBatch::default()` still resolves via the
// prelude for existing callers.
impl Default for WriteBatch {
    fn default() -> Self {
        Self::with_config(WriteBatchConfig::default())
    }
}

impl WriteBatch {
    /// Build a batch with the given configuration and empty buffers.
    pub fn with_config(config: WriteBatchConfig) -> Self {
        Self {
            notifier: RefCell::new(vec![]),
            buf: RefCell::new(vec![]),
            timeout: config.timeout,
            batch_size: config.batch_size,
            lock: Mutex::new(()),
            action: RwLock::new(None),
        }
    }

    /// Enqueue some write requests. Then check the size limit.
    /// This will reset the timeout timer.
    #[allow(clippy::branches_sharing_code)]
    pub async fn enqueue(
        self: Rc<Self>,
        mut reqs: Vec<Entry>,
        tx: Sender<Result<()>>,
        level: Rc<Levels>,
    ) {
        // enqueue under the lock so notifier and buf stay in sync
        let guard = self.lock.lock().await;
        self.notifier.borrow_mut().push(tx);
        self.buf.borrow_mut().append(&mut reqs);

        // check size limit; the guard is released before either await point
        if self.buf.borrow().len() >= self.batch_size {
            drop(guard);
            self.consume(level).await;
        } else {
            drop(guard);
            self.set_or_rearm(level).await;
        }
    }

    /// Consume all batched entries.
665 | pub async fn consume(self: Rc, level: Rc>) { 666 | // let mut action_guard = self.action.write().await.unwrap(); 667 | // take contents 668 | let guard = self.lock.lock().await; 669 | let notifier = self.notifier.take(); 670 | let buf = self.buf.take(); 671 | drop(guard); 672 | 673 | // write and reply 674 | let result = io_worker::GATE 675 | .with(|gate| { 676 | gate.spawn(async move { level.put_internal(buf).await }) 677 | .unwrap() 678 | }) 679 | .await; 680 | if result.is_ok() { 681 | for tx in notifier { 682 | let _ = tx.send(Ok(())); 683 | } 684 | } else { 685 | for tx in notifier { 686 | let _ = tx.send(Err(HelixError::Poisoned("Put".to_string()))); 687 | } 688 | } 689 | 690 | // todo: finish cancellation 691 | // destroy action timer as this "consume action" is already triggered 692 | // (regardless of it is triggered by timer or `Levels`'). 693 | // if let Some(action) = action_guard.take() { 694 | // action.cancel().await; 695 | // } 696 | } 697 | 698 | async fn destroy_action(&self) { 699 | let mut action_guard = self.action.write().await.unwrap(); 700 | if let Some(action) = &*action_guard { 701 | action.destroy(); 702 | } 703 | drop(action_guard.take()); 704 | } 705 | 706 | async fn set_or_rearm(self: Rc, level: Rc>) { 707 | let mut action = self.action.write().await.unwrap(); 708 | 709 | // rearm timer 710 | if let Some(action) = &*action { 711 | action.rearm_in(self.timeout); 712 | return; 713 | } 714 | 715 | // otherwise set the action 716 | *action = Some(TimerActionOnce::do_in( 717 | self.timeout, 718 | self.clone().consume(level), 719 | )); 720 | } 721 | } 722 | 723 | #[cfg(test)] 724 | mod test { 725 | use glommio::channels::channel_mesh::MeshBuilder; 726 | use glommio::LocalExecutor; 727 | use tempfile::tempdir; 728 | 729 | use super::*; 730 | use crate::compact_sched::QueueUpCompSched; 731 | use crate::file::FileManager; 732 | use crate::fn_registry::FnRegistry; 733 | 734 | #[tokio::test] 735 | async fn 
simple_timestamp_reviewer_trigger_compact_and_outdate() {
        let mut reviewer = SimpleTimestampReviewer::new(10, 30);

        let mut observed = vec![];
        let expected = vec![
            TimestampAction::Compact(0, 9, None),
            TimestampAction::Compact(10, 19, None),
            TimestampAction::Compact(20, 29, None),
            TimestampAction::Outdate(9),
            TimestampAction::Compact(30, 39, None),
            TimestampAction::Outdate(19),
        ];

        // Feed 40 consecutive logical timestamps and record every action.
        for ts in 0..40 {
            observed.append(&mut reviewer.observe(ts));
        }

        assert_eq!(observed, expected);
    }

    #[test]
    fn put_get_on_rick() {
        let executor = LocalExecutor::default();
        executor.run(async {
            let base_dir = tempdir().unwrap();
            let file_manager = FileManager::with_base_dir(base_dir.path(), 1).unwrap();
            let fn_registry = FnRegistry::new_noop();
            let ctx = Arc::new(Context {
                file_manager,
                fn_registry,
            });
            let timestamp_reviewer: Arc<Mutex<Box<dyn TimestampReviewer>>> =
                Arc::new(Mutex::new(Box::new(SimpleTimestampReviewer::new(10, 30))));
            let sender = MeshBuilder::full(1, 1).join().await.unwrap().0;
            let level_info = Arc::new(Mutex::new(
                ctx.file_manager.open_level_info().await.unwrap(),
            ));
            let (sched, tq) = QueueUpCompSched::default();
            let levels = Levels::try_new(
                0,
                Options::default(),
                timestamp_reviewer,
                ctx,
                sender,
                level_info,
                sched.clone(),
            )
            .await
            .unwrap();
            sched.clone().init(levels.clone());
            sched.install(tq).unwrap();

            let entries = vec![
                (1, b"key1".to_vec(), b"value1".to_vec()).into(),
                (2, b"key1".to_vec(), b"value1".to_vec()).into(),
                (3, b"key1".to_vec(), b"value1".to_vec()).into(),
                (1, b"key2".to_vec(), b"value2".to_vec()).into(),
                (2, b"key2".to_vec(), b"value2".to_vec()).into(),
                (3, b"key3".to_vec(), b"value1".to_vec()).into(),
            ];

            levels.put_internal(entries.clone()).await.unwrap();

            // Everything written must be readable back unchanged.
            for entry in entries {
                assert_eq!(
                    entry,
                    levels
                        .get(entry.time_key(), ReadOption::default().no_decompress())
                        .await
                        .unwrap()
                        .unwrap()
                );
            }

            // overwrite a key
            let new_entry: Entry = (1, b"key1".to_vec(), b"value3".to_vec()).into();
            levels.put_internal(vec![new_entry.clone()]).await.unwrap();
            assert_eq!(
                new_entry,
                levels
                    .get(new_entry.time_key(), ReadOption::default().no_decompress())
                    .await
                    .unwrap()
                    .unwrap()
            );
        });
    }

    #[test]
    fn put_get_with_compaction() {
        let executor = LocalExecutor::default();
        executor.run(async {
            let base_dir = tempdir().unwrap();
            let file_manager = FileManager::with_base_dir(base_dir.path(), 1).unwrap();
            let fn_registry = FnRegistry::new_noop();
            let ctx = Arc::new(Context {
                file_manager,
                fn_registry,
            });
            let timestamp_reviewer: Arc<Mutex<Box<dyn TimestampReviewer>>> =
                Arc::new(Mutex::new(Box::new(SimpleTimestampReviewer::new(10, 30))));
            let sender = MeshBuilder::full(1, 1).join().await.unwrap().0;
            let level_info = Arc::new(Mutex::new(
                ctx.file_manager.open_level_info().await.unwrap(),
            ));
            let (sched, tq) = QueueUpCompSched::default();
            let levels = Levels::try_new(
                0,
                Options::default(),
                timestamp_reviewer,
                ctx.clone(),
                sender,
                level_info,
                sched.clone(),
            )
            .await
            .unwrap();
            sched.clone().init(levels.clone());
            sched.install(tq).unwrap();

            // Write enough timestamps to cross the rick_range and trigger
            // compaction along the way.
            for timestamp in 0..25 {
                levels
                    .put_internal(vec![(timestamp, b"key".to_vec(), b"value".to_vec()).into()])
                    .await
                    .unwrap();
            }

            for timestamp in 0..25 {
                let result = levels
                    .get(&(timestamp, b"key".to_vec()), ReadOption::default())
                    .await
                    .unwrap()
                    .unwrap();

assert_eq!( 870 | result, 871 | (timestamp, b"key".to_vec(), b"value".to_vec()).into() 872 | ); 873 | } 874 | }); 875 | } 876 | } 877 | -------------------------------------------------------------------------------- /src/lib.rs: -------------------------------------------------------------------------------- 1 | //! HelixDB is a KV-Engine designed for time-series data. 2 | 3 | #![feature(vec_into_raw_parts)] 4 | #![feature(trait_alias)] 5 | #![feature(async_closure)] 6 | #![allow(internal_features)] 7 | #![feature(core_intrinsics)] 8 | #![feature(hash_extract_if)] 9 | // todo: open these lints 10 | #![allow(dead_code)] 11 | #![allow(unused_variables)] 12 | 13 | /// Unwrap `Option` under the `Result>` return type requirement. 14 | /// The `None` case will early return with `Ok(None)`. 15 | /// 16 | /// # Example 17 | /// *Notice this macro is not exported via "`#[macro_export]`" so the following 18 | /// example will not be run as a test case.* 19 | /// ```ignore 20 | /// # #![feature(never_type)] 21 | /// # #[macro_use] extern crate helixdb; 22 | /// fn return_ok_none() -> Result, !> { 23 | /// let val: Option = None; 24 | /// ok_unwrap!(val); 25 | /// panic!("should have returned"); 26 | /// } 27 | /// 28 | /// # fn container() -> Result, !> { 29 | /// let val = ok_unwrap!(Some(0usize)); 30 | /// assert_eq!(val, 0usize); 31 | /// assert_eq!(return_ok_none(), Ok(None)); 32 | /// # Ok(None) 33 | /// # } 34 | /// 35 | /// # let _ = container(); 36 | /// ``` 37 | macro_rules! 
ok_unwrap { 38 | ($e:expr) => { 39 | match $e { 40 | Some(thing) => thing, 41 | None => return Ok(None), 42 | } 43 | }; 44 | } 45 | 46 | #[deprecated] 47 | mod blocks; 48 | mod cache; 49 | mod compact_sched; 50 | mod context; 51 | mod db; 52 | mod error; 53 | mod file; 54 | mod fn_registry; 55 | mod index; 56 | mod io; 57 | mod io_worker; 58 | pub mod iterator; 59 | mod level; 60 | pub mod option; 61 | mod table; 62 | mod types; 63 | mod util; 64 | 65 | pub use db::*; 66 | pub use fn_registry::FnRegistry; 67 | pub use level::{SimpleTimestampReviewer, TimestampAction, TimestampReviewer}; 68 | pub use types::{Entry, TimeRange}; 69 | pub use util::{Comparator, LexicalComparator, NoOrderComparator}; 70 | 71 | #[global_allocator] 72 | static ALLOC: jemallocator::Jemalloc = jemallocator::Jemalloc; 73 | -------------------------------------------------------------------------------- /src/option.rs: -------------------------------------------------------------------------------- 1 | use std::time::Duration; 2 | 3 | use crate::cache::CacheConfig; 4 | use crate::fn_registry::FnRegistry; 5 | use crate::level::{SimpleTimestampReviewer, TimestampReviewer, WriteBatchConfig}; 6 | 7 | /// Options for opening HelixDB 8 | pub struct Options { 9 | // parameters 10 | /// Number of shards. It is recommended to equal to the number of system 11 | /// processors. 12 | pub(crate) num_shard: usize, 13 | /// Queue length of each shard's task receiver. 14 | pub(crate) task_buffer_size: usize, 15 | /// Configurations of cache. 
16 | pub(crate) cache: CacheConfig, 17 | pub(crate) write_batch: WriteBatchConfig, 18 | pub(crate) compact_prompt_interval: Duration, 19 | 20 | // helixdb context 21 | pub(crate) fn_registry: Option, 22 | pub(crate) tsr: Option>, 23 | } 24 | 25 | // todo: remove this 26 | impl Clone for Options { 27 | /// a 28 | fn clone(&self) -> Self { 29 | Self { 30 | num_shard: self.num_shard, 31 | task_buffer_size: self.task_buffer_size, 32 | cache: self.cache, 33 | write_batch: self.write_batch, 34 | compact_prompt_interval: self.compact_prompt_interval, 35 | 36 | fn_registry: None, 37 | tsr: None, 38 | } 39 | } 40 | } 41 | 42 | impl Options { 43 | pub fn default() -> Self { 44 | Self { 45 | num_shard: num_cpus::get(), 46 | task_buffer_size: 128, 47 | cache: CacheConfig::default(), 48 | write_batch: WriteBatchConfig::default(), 49 | compact_prompt_interval: Duration::from_secs(1), 50 | 51 | fn_registry: Some(FnRegistry::new_noop()), 52 | tsr: Some(Box::new(SimpleTimestampReviewer::new(1024, 1024 * 8))), 53 | } 54 | } 55 | 56 | /// Returns a copy of the value. 57 | /// This function may not works as expected. It is a "partial" clone. 58 | /// 59 | /// Some fields in this [`Options`] struct isn't suit for clone, like 60 | /// `fn_registry` or `tsr`. They are wrapped by a `Option`, and will 61 | /// only leave a `None` after called `clone_partial()`. 62 | /// 63 | /// This is to making [`Options`] more general and unified. Other fields 64 | /// works as what common [`std::marker::Clone`] does. 65 | /// # Example 66 | /// *Just a example and it isn't runnable since `fn_registry` is a private 67 | /// field.* ```compile_fail 68 | /// # use helixdb::option::Options; 69 | /// let options = Options::default(); 70 | /// assert!(options.fn_registry.is_some()); 71 | /// // after calling `clone_partial()` some will be `None` because they 72 | /// won't be cloned actually. let options_cloned = 73 | /// options.clone_partial(); assert!(options_cloned.fn_registry. 
74 | /// is_none()); ``` 75 | pub fn clone_partial(&self) -> Self { 76 | Self { 77 | num_shard: self.num_shard, 78 | task_buffer_size: self.task_buffer_size, 79 | cache: self.cache, 80 | write_batch: self.write_batch, 81 | compact_prompt_interval: self.compact_prompt_interval, 82 | 83 | fn_registry: None, 84 | tsr: None, 85 | } 86 | } 87 | 88 | pub fn shards(mut self, num_shard: usize) -> Self { 89 | self.num_shard = num_shard; 90 | self 91 | } 92 | 93 | pub fn set_fn_registry(mut self, fn_registry: FnRegistry) -> Self { 94 | self.fn_registry = Some(fn_registry); 95 | self 96 | } 97 | 98 | pub fn cache(mut self, f: F) -> Self 99 | where 100 | F: FnOnce(CacheConfig) -> CacheConfig, 101 | { 102 | self.cache = f(self.cache); 103 | self 104 | } 105 | 106 | pub fn write_batch(mut self, f: F) -> Self 107 | where 108 | F: FnOnce(WriteBatchConfig) -> WriteBatchConfig, 109 | { 110 | self.write_batch = f(self.write_batch); 111 | self 112 | } 113 | 114 | pub fn set_timestamp_reviewer(mut self, tsr: Box) -> Self { 115 | self.tsr = Some(tsr); 116 | self 117 | } 118 | 119 | pub fn set_task_buffer_size(mut self, buffer_size: usize) -> Self { 120 | self.task_buffer_size = buffer_size; 121 | self 122 | } 123 | 124 | pub fn set_compact_prompt_interval(mut self, interval: Duration) -> Self { 125 | self.compact_prompt_interval = interval; 126 | self 127 | } 128 | } 129 | 130 | #[derive(Clone, Copy)] 131 | pub struct ReadOption { 132 | /// Read request will decompress a compressed value then try to find 133 | /// requested timestamp if true. Default value: true. 
134 | pub(crate) decompress: bool, 135 | } 136 | 137 | impl Default for ReadOption { 138 | fn default() -> Self { 139 | Self { decompress: true } 140 | } 141 | } 142 | 143 | impl ReadOption { 144 | pub fn no_decompress(mut self) -> Self { 145 | self.decompress = false; 146 | self 147 | } 148 | } 149 | 150 | #[derive(Copy, Clone, Debug)] 151 | pub struct ScanOption { 152 | pub prefetch_buf_size: usize, 153 | } 154 | 155 | #[cfg(test)] 156 | mod test { 157 | use super::*; 158 | 159 | #[test] 160 | fn partial_clone() { 161 | let options = Options::default(); 162 | assert!(options.fn_registry.is_some()); 163 | assert!(options.tsr.is_some()); 164 | 165 | // after calling `clone()` some will be `None` because they won't be cloned 166 | // actually. 167 | let options_cloned = options.clone_partial(); 168 | assert!(options_cloned.fn_registry.is_none()); 169 | assert!(options_cloned.tsr.is_none()); 170 | } 171 | } 172 | -------------------------------------------------------------------------------- /src/table.rs: -------------------------------------------------------------------------------- 1 | use std::sync::Arc; 2 | 3 | use crate::context::Context; 4 | use crate::error::Result; 5 | use crate::file::{Rick, SSTable}; 6 | use crate::index::MemIndex; 7 | #[cfg(test)] 8 | use crate::types::Offset; 9 | use crate::types::{Bytes, Entry, LevelId, ThreadId, Timestamp}; 10 | 11 | #[derive(Hash, PartialEq, Eq)] 12 | pub struct TableIdentifier { 13 | pub tid: ThreadId, 14 | pub lid: LevelId, 15 | } 16 | 17 | impl From<(ThreadId, LevelId)> for TableIdentifier { 18 | fn from(ids: (ThreadId, LevelId)) -> Self { 19 | Self { 20 | tid: ids.0, 21 | lid: ids.1, 22 | } 23 | } 24 | } 25 | 26 | /// Provides read methods to SSTable. 27 | /// 28 | /// If wants to modify a sstable should upgrade to a writable handle 29 | /// (unimplemented). 
30 | pub struct TableReadHandle { 31 | memindex: MemIndex, 32 | sstable: SSTable, 33 | rick: Rick, 34 | ctx: Arc, 35 | } 36 | 37 | impl TableReadHandle { 38 | pub fn new(memindex: MemIndex, sstable: SSTable, rick: Rick, ctx: Arc) -> Self { 39 | Self { 40 | memindex, 41 | sstable, 42 | rick, 43 | ctx, 44 | } 45 | } 46 | 47 | pub async fn get(&self, time_key: &(Timestamp, Bytes)) -> Result> { 48 | let offset = if self.is_compressed() { 49 | let mut align_time_key = time_key.clone(); 50 | align_time_key.0 = self.rick.get_align_ts(); 51 | self.memindex.get(&align_time_key)? 52 | } else { 53 | self.memindex.get(time_key)? 54 | }; 55 | if let Some(offset) = offset { 56 | Ok(Some(self.rick.read(offset).await?)) 57 | } else { 58 | Ok(None) 59 | } 60 | } 61 | 62 | /// Upgrade to writeable handle. 63 | pub fn upgrade() -> ! { 64 | todo!() 65 | } 66 | 67 | pub fn is_compressed(&self) -> bool { 68 | self.rick.is_compressed() 69 | } 70 | 71 | // For test case. 72 | /// Get value's offset in rick file. 73 | #[cfg(test)] 74 | pub fn get_offset(&self, time_key: &(Timestamp, Bytes)) -> Result> { 75 | self.memindex.get(time_key) 76 | } 77 | 78 | fn decompress_entry(&self, key: &[u8], data: &[u8]) -> Result> { 79 | self.ctx.fn_registry.decompress_entries(key, data) 80 | } 81 | } 82 | -------------------------------------------------------------------------------- /src/types/entry.rs: -------------------------------------------------------------------------------- 1 | use std::convert::TryInto; 2 | use std::mem; 3 | use std::ops::Range; 4 | 5 | use flatbuffers::FlatBufferBuilder; 6 | 7 | pub type Bytes = Vec; 8 | pub type Timestamp = i64; 9 | pub type ThreadId = u64; 10 | /// Mono-increase identifier to level files. Starts from 1. 11 | /// Level id `0` stands for Rick level. 12 | pub type LevelId = u64; 13 | 14 | /// Wrapper struct over protos::Entry. 15 | /// 16 | /// C representation is needed to converting `(&ts, &key)` to `&(ts, key)`. 
17 | #[derive(Debug, PartialEq, Eq, Clone)] 18 | #[repr(C)] 19 | pub struct Entry { 20 | pub timestamp: Timestamp, 21 | pub key: Bytes, 22 | pub value: Bytes, 23 | } 24 | 25 | impl Entry { 26 | pub fn encode(&self) -> Bytes { 27 | let mut fbb = FlatBufferBuilder::new(); 28 | 29 | let timestamp = protos::Timestamp::new(self.timestamp); 30 | let key_bytes = fbb.create_vector(&self.key); 31 | let value_bytes = fbb.create_vector(&self.value); 32 | 33 | let entry = protos::Entry::create( 34 | &mut fbb, 35 | &protos::EntryArgs { 36 | timestamp: Some(×tamp), 37 | key: Some(key_bytes), 38 | value: Some(value_bytes), 39 | }, 40 | ); 41 | 42 | fbb.finish(entry, None); 43 | fbb.finished_data().to_vec() 44 | } 45 | 46 | pub fn decode(bytes: &[u8]) -> Self { 47 | // let fb_entry = flatbuffers::get_root::>(bytes); 48 | let fb_entry = flatbuffers::root::>(bytes).unwrap(); 49 | 50 | Self { 51 | timestamp: fb_entry.timestamp().unwrap().timestamp(), 52 | key: fb_entry.key().unwrap().bytes().to_vec(), 53 | value: fb_entry.value().unwrap().bytes().to_vec(), 54 | } 55 | } 56 | 57 | /// # Unsafe 58 | /// - Purpose: make a `&(A, B)` over a `&Self{A, B, C}` 59 | /// - Safety: `Entry` is qualified with `repr(C)`, memory layout is ensured 60 | /// to be the same with `((A, B), C)`. 61 | pub fn time_key(&self) -> &(Timestamp, Bytes) { 62 | unsafe { 63 | let p_entry = self as *const Entry; 64 | let p_ts_key = p_entry as *const (Timestamp, Bytes); 65 | &*p_ts_key 66 | } 67 | } 68 | } 69 | 70 | impl From<(Timestamp, Bytes, Bytes)> for Entry { 71 | fn from(input: (Timestamp, Bytes, Bytes)) -> Entry { 72 | let (timestamp, key, value) = input; 73 | Entry { 74 | timestamp, 75 | key, 76 | value, 77 | } 78 | } 79 | } 80 | 81 | /// Describe a encoded [Entry]'s buffer. 
82 | pub struct EntryMeta { 83 | pub length: u64, 84 | } 85 | 86 | impl EntryMeta { 87 | pub fn new(length: u64) -> Self { 88 | Self { length } 89 | } 90 | 91 | pub const fn meta_size() -> usize { 92 | mem::size_of::() 93 | } 94 | 95 | pub fn encode(&self) -> [u8; 8] { 96 | self.length.to_le_bytes() 97 | } 98 | 99 | pub fn decode(bytes: &[u8]) -> Self { 100 | Self { 101 | length: u64::from_le_bytes(bytes.try_into().unwrap()), 102 | } 103 | } 104 | } 105 | 106 | // todo: replace with std::ops::Range. 107 | #[derive(Debug, Copy, Clone)] 108 | pub struct TimeRange { 109 | start: Timestamp, 110 | end: Timestamp, 111 | } 112 | 113 | impl TimeRange { 114 | /// Is `self` containing given timestamp. 115 | pub fn contains(&self, ts: Timestamp) -> bool { 116 | self.start <= ts && self.end >= ts 117 | } 118 | 119 | pub fn range(&self) -> Range { 120 | Range { 121 | start: self.start, 122 | end: self.end, 123 | } 124 | } 125 | 126 | pub fn start(&self) -> Timestamp { 127 | self.start 128 | } 129 | 130 | pub fn end(&self) -> Timestamp { 131 | self.end 132 | } 133 | } 134 | 135 | impl From<(Timestamp, Timestamp)> for TimeRange { 136 | fn from(tuple: (Timestamp, Timestamp)) -> TimeRange { 137 | Self { 138 | start: tuple.0, 139 | end: tuple.1, 140 | } 141 | } 142 | } 143 | 144 | #[cfg(test)] 145 | mod test { 146 | 147 | use super::*; 148 | 149 | #[test] 150 | fn entry_codec() { 151 | let entry = Entry { 152 | timestamp: 1000, 153 | key: b"key".to_vec(), 154 | value: b"value".to_vec(), 155 | }; 156 | 157 | let bytes = entry.encode(); 158 | 159 | assert_eq!(entry, Entry::decode(&bytes)); 160 | } 161 | 162 | #[test] 163 | fn time_range_contains() { 164 | let range = TimeRange::from((0, 10)); 165 | 166 | assert!(!range.contains(-1)); 167 | assert!(range.contains(0)); 168 | assert!(range.contains(5)); 169 | assert!(range.contains(10)); 170 | assert!(!range.contains(101)); 171 | } 172 | } 173 | -------------------------------------------------------------------------------- 
/src/types/level_info.rs:
--------------------------------------------------------------------------------

use std::collections::VecDeque;

use flatbuffers::FlatBufferBuilder;

use super::{Bytes, LevelId, Timestamp};
use crate::error::Result;
use crate::file::FileManager;

/// Descriptor of one level: the (inclusive) time range it covers and its id.
#[derive(Default, PartialEq, Eq, Debug, Clone, Copy)]
pub struct LevelDesc {
    start: Timestamp,
    end: Timestamp,
    id: LevelId,
}

impl From<protos::LevelDesc> for LevelDesc {
    fn from(fb_desc: protos::LevelDesc) -> LevelDesc {
        let time_range = fb_desc.time_range();
        Self {
            start: time_range.start().timestamp(),
            end: time_range.end().timestamp(),
            id: fb_desc.id().id(),
        }
    }
}

impl LevelDesc {
    /// Convert into the flatbuffers-generated representation.
    pub fn as_generated_type(&self) -> protos::LevelDesc {
        let start = protos::Timestamp::new(self.start);
        let end = protos::Timestamp::new(self.end);
        let time_range = protos::TimeRange::new(&start, &end);

        let id = protos::LevelId::new(self.id);

        protos::LevelDesc::new(&time_range, &id)
    }

    /// Whether `timestamp` falls inside this level's inclusive range.
    #[inline]
    pub fn is_timestamp_match(&self, timestamp: Timestamp) -> bool {
        (self.start..=self.end).contains(&timestamp)
    }
}

/// Metadata of every levels. Is a array-like container of [LevelDesc].
///
/// [LevelDesc] is arranged from old (smaller timestamp) to new
/// (larger timestamp).
#[derive(Debug, PartialEq, Eq)]
pub struct LevelInfo {
    // todo: remove RwLock
    infos: VecDeque<LevelDesc>,
}

impl LevelInfo {
    /// Serialize every descriptor into a flatbuffers byte buffer.
    pub fn encode(&self) -> Bytes {
        let mut fbb = FlatBufferBuilder::new();

        fbb.start_vector::<protos::LevelDesc>(self.infos.len());
        for desc in &self.infos {
            fbb.push(desc.as_generated_type());
        }
        let batch = fbb.end_vector::<protos::LevelDesc>(self.infos.len());

        let infos =
            protos::LevelInfo::create(&mut fbb, &protos::LevelInfoArgs { infos: Some(batch) });

        fbb.finish(infos, None);
        fbb.finished_data().to_vec()
    }

    pub fn decode(bytes: &[u8]) -> Self {
        // An empty level-info file decodes to an empty container.
        if bytes.is_empty() {
            return Self {
                infos: VecDeque::default(),
            };
        }

        let fb_info = flatbuffers::root::<protos::LevelInfo>(bytes).unwrap();
        let infos = fb_info
            .infos()
            .unwrap()
            .into_iter()
            .rev() // `fbb.push()` in encode reversed the order
            .cloned()
            .map(LevelDesc::from)
            .collect();

        Self { infos }
    }

    /// Give a timestamp and find the level suits it.
    ///
    /// Rick entries' timestamp will not present in level-info.
    /// Thus if given timestamp is larger than the biggest timestamp recorded by
    /// this level-info, `Some(0)` will be returned. `0` is a special [LevelId]
    /// stands for Rick level.
    pub fn get_level_id(&self, timestamp: Timestamp) -> Option<LevelId> {
        // timestamp covered by rick will not present in level-info
        if self.infos.is_empty() || timestamp > self.infos.back().unwrap().end {
            return Some(0);
        }

        self.infos
            .iter()
            .find(|desc| desc.is_timestamp_match(timestamp))
            .map(|desc| desc.id)
    }

    /// Return new level id.
114 | pub(crate) async fn add_level( 115 | &mut self, 116 | start: Timestamp, 117 | end: Timestamp, 118 | file_manager: &FileManager, 119 | ) -> Result { 120 | let mut new_desc = LevelDesc { start, end, id: 0 }; 121 | 122 | let next_id = self.infos.back().map_or_else(|| 1, |desc| desc.id + 1); 123 | new_desc.id = next_id; 124 | self.infos.push_back(new_desc); 125 | self.sync(file_manager).await?; 126 | 127 | Ok(next_id) 128 | } 129 | 130 | pub(crate) async fn remove_last_level(&mut self, file_manager: &FileManager) -> Result<()> { 131 | self.infos.pop_front(); 132 | 133 | self.sync(file_manager).await 134 | } 135 | 136 | /// Sync file infos to disk. Requires read lock. 137 | async fn sync(&self, file_manager: &FileManager) -> Result<()> { 138 | let bytes = self.encode(); 139 | file_manager.sync_level_info(bytes).await?; 140 | 141 | Ok(()) 142 | } 143 | 144 | #[cfg(test)] 145 | fn new(descriptions: Vec) -> Self { 146 | let infos = VecDeque::from(descriptions); 147 | 148 | Self { infos } 149 | } 150 | } 151 | 152 | #[cfg(test)] 153 | mod test { 154 | 155 | use glommio::LocalExecutor; 156 | use tempfile::tempdir; 157 | 158 | use super::*; 159 | 160 | #[test] 161 | fn level_desc_codec() { 162 | let infos = LevelInfo::new(vec![ 163 | LevelDesc { 164 | start: 21, 165 | end: 40, 166 | id: 4, 167 | }, 168 | LevelDesc { 169 | start: 100, 170 | end: 200, 171 | id: 8, 172 | }, 173 | ]); 174 | 175 | let bytes = infos.encode(); 176 | let decoded = LevelInfo::decode(&bytes); 177 | 178 | assert_eq!(decoded, infos); 179 | } 180 | 181 | #[test] 182 | fn add_level() { 183 | let ex = LocalExecutor::default(); 184 | ex.run(async { 185 | let base_dir = tempdir().unwrap(); 186 | let file_manager = FileManager::with_base_dir(base_dir.path(), 1).unwrap(); 187 | 188 | let mut info = LevelInfo::new(vec![]); 189 | info.add_level(0, 9, &file_manager).await.unwrap(); 190 | info.add_level(10, 19, &file_manager).await.unwrap(); 191 | info.add_level(20, 29, &file_manager).await.unwrap(); 192 | 
drop(info); 193 | 194 | let info = file_manager.open_level_info().await.unwrap(); 195 | let infos: Vec<_> = info.infos.iter().copied().collect(); 196 | let expected = vec![ 197 | LevelDesc { 198 | start: 0, 199 | end: 9, 200 | id: 1, 201 | }, 202 | LevelDesc { 203 | start: 10, 204 | end: 19, 205 | id: 2, 206 | }, 207 | LevelDesc { 208 | start: 20, 209 | end: 29, 210 | id: 3, 211 | }, 212 | ]; 213 | 214 | assert_eq!(infos, expected); 215 | }); 216 | } 217 | } 218 | -------------------------------------------------------------------------------- /src/types/mod.rs: -------------------------------------------------------------------------------- 1 | //! Wrapper over all generated types / structs. And implements 2 | //! their utilities. 3 | //! 4 | //! `protos` dependency should only present in this mod. 5 | 6 | mod entry; 7 | mod level_info; 8 | mod rick; 9 | pub mod sstable; 10 | 11 | pub use entry::{Bytes, Entry, EntryMeta, LevelId, ThreadId, TimeRange, Timestamp}; 12 | pub use level_info::LevelInfo; 13 | pub(crate) use rick::{Offset, RickSuperBlock, ValueFormat}; 14 | 15 | // todo: maybe make a trait `Generated` or sth. 16 | // contains `encode()`, `decode()`, `to_generated_type()`. 17 | -------------------------------------------------------------------------------- /src/types/rick.rs: -------------------------------------------------------------------------------- 1 | use flatbuffers::FlatBufferBuilder; 2 | 3 | use super::{Bytes, Timestamp}; 4 | 5 | pub(crate) type Offset = u64; 6 | 7 | pub(crate) type ValueFormat = protos::ValueFormat; 8 | 9 | /// [Rick] file's super block. 10 | /// 11 | /// The binary representation will be padded to 4KB. 
#[derive(Debug, PartialEq, Eq)]
pub(crate) struct RickSuperBlock {
    pub is_ordered: bool,
    pub legal_offset_start: Offset,
    pub legal_offset_end: Offset,
    // todo: add `version` and `crc` fields
    pub value_format: ValueFormat,
    /// Only valid when value format is `CompressedValue`
    pub align_timestamp: Timestamp,
}

impl RickSuperBlock {
    /// Fixed on-disk size of the encoded super block.
    pub const LENGTH: usize = 4096;

    /// Serialize into a flatbuffers buffer padded to [`Self::LENGTH`] bytes.
    pub fn encode(&self) -> Bytes {
        let mut builder = FlatBufferBuilder::new();

        let legal_offset_start = protos::Offset::new(self.legal_offset_start);
        let legal_offset_end = protos::Offset::new(self.legal_offset_end);
        let align_timestamp = protos::Timestamp::new(self.align_timestamp);

        let super_block = protos::RickSuperBlock::create(
            &mut builder,
            &protos::RickSuperBlockArgs {
                is_ordered: self.is_ordered,
                legal_offset_start: Some(&legal_offset_start),
                legal_offset_end: Some(&legal_offset_end),
                value_format: self.value_format,
                align_timestamp: Some(&align_timestamp),
            },
        );

        builder.finish(super_block, None);
        let mut padding_bytes = builder.finished_data().to_vec();

        // The un-padded bytes must fit in LENGTH, otherwise they would be
        // truncated by the resize below.
        debug_assert!(padding_bytes.len() <= Self::LENGTH);
        // Pad to LENGTH. Flatbuffers tracks the payload's own length, so
        // trailing zeros are harmless.
51 | padding_bytes.resize(Self::LENGTH, 0); 52 | padding_bytes 53 | } 54 | 55 | pub fn decode(bytes: &[u8]) -> Self { 56 | let fb_sb = flatbuffers::root::>(bytes).unwrap(); 57 | // let fb_sb = flatbuffers::get_root::>(bytes); 58 | 59 | Self { 60 | is_ordered: fb_sb.is_ordered(), 61 | legal_offset_start: fb_sb.legal_offset_start().unwrap().offset(), 62 | legal_offset_end: fb_sb.legal_offset_end().unwrap().offset(), 63 | value_format: fb_sb.value_format(), 64 | align_timestamp: fb_sb.align_timestamp().unwrap().timestamp(), 65 | } 66 | } 67 | } 68 | 69 | #[cfg(test)] 70 | mod test { 71 | use super::*; 72 | 73 | #[test] 74 | fn rick_super_block_codec() { 75 | let sb = RickSuperBlock { 76 | is_ordered: true, 77 | legal_offset_start: 4096, 78 | legal_offset_end: 8192, 79 | value_format: ValueFormat::RawValue, 80 | align_timestamp: 10086, 81 | }; 82 | 83 | let bytes = sb.encode(); 84 | assert_eq!(bytes.len(), RickSuperBlock::LENGTH); 85 | assert_eq!(sb, RickSuperBlock::decode(&bytes)); 86 | } 87 | } 88 | -------------------------------------------------------------------------------- /src/types/sstable.rs: -------------------------------------------------------------------------------- 1 | use flatbuffers::FlatBufferBuilder; 2 | 3 | use super::{Bytes, LevelId, Offset, ThreadId, Timestamp}; 4 | 5 | /// Enumeration of blocks' type 6 | pub type BlockType = protos::BlockType; 7 | 8 | #[derive(Debug, PartialEq, Eq, Clone, Copy)] 9 | pub(crate) struct BlockInfo { 10 | pub block_type: BlockType, 11 | pub offset: Offset, 12 | pub length: u64, 13 | } 14 | 15 | impl BlockInfo { 16 | pub fn as_generated_type(&self) -> protos::BlockInfo { 17 | let offset = protos::Offset::new(self.offset); 18 | 19 | protos::BlockInfo::new(self.block_type, &offset, self.length) 20 | } 21 | } 22 | 23 | impl From for BlockInfo { 24 | fn from(fb_info: protos::BlockInfo) -> BlockInfo { 25 | Self { 26 | block_type: fb_info.block_type(), 27 | offset: fb_info.offset().offset(), 28 | length: fb_info.length(), 
29 | } 30 | } 31 | } 32 | 33 | /// Will be padded to 4096 bytes. 34 | #[derive(Debug, PartialEq, Eq)] 35 | pub(crate) struct SSTableSuperBlock { 36 | pub thread_id: ThreadId, 37 | pub level_id: LevelId, 38 | pub blocks: Vec, 39 | } 40 | 41 | impl SSTableSuperBlock { 42 | pub const LENGTH: usize = 4096; 43 | 44 | pub fn encode(&self) -> Bytes { 45 | let mut fbb = FlatBufferBuilder::new(); 46 | 47 | let thread_id = protos::ThreadId::new(self.thread_id); 48 | let level_id = protos::LevelId::new(self.level_id); 49 | fbb.start_vector::(self.blocks.len()); 50 | for info in &self.blocks { 51 | fbb.push(info.as_generated_type()); 52 | } 53 | let blocks = fbb.end_vector::(self.blocks.len()); 54 | let blocks = protos::SSTableSuperBlock::create( 55 | &mut fbb, 56 | &protos::SSTableSuperBlockArgs { 57 | thread_id: Some(&thread_id), 58 | level_id: Some(&level_id), 59 | blocks: Some(blocks), 60 | }, 61 | ); 62 | 63 | fbb.finish(blocks, None); 64 | let mut padding_bytes = fbb.finished_data().to_vec(); 65 | 66 | // the un-padding bytes should shorter than 4096 otherwise it will be truncated. 67 | debug_assert!(padding_bytes.len() <= Self::LENGTH); 68 | // padding it. Flatbuffers has the information about payload's length, so 69 | // tailing zero doesn't matter. 70 | padding_bytes.resize(Self::LENGTH, 0); 71 | padding_bytes 72 | } 73 | 74 | pub fn decode(bytes: &[u8]) -> Self { 75 | if bytes.is_empty() { 76 | return Self { 77 | blocks: vec![], 78 | thread_id: 0, 79 | level_id: 0, 80 | }; 81 | } 82 | 83 | let fb_blocks = flatbuffers::root::>(bytes).unwrap(); 84 | let thread_id = fb_blocks.thread_id().unwrap().id(); 85 | let level_id = fb_blocks.level_id().unwrap().id(); 86 | let blocks = fb_blocks 87 | .blocks() 88 | .unwrap() 89 | .into_iter() 90 | .cloned() 91 | .map(BlockInfo::from) 92 | .collect(); 93 | 94 | Self { 95 | thread_id, 96 | level_id, 97 | blocks, 98 | } 99 | } 100 | 101 | /// Get blocks info of given block type. 
There may have many blocks with 102 | /// the same types (but not tested yet. this is a todo). 103 | pub fn get_block_info(&self, block_type: BlockType) -> Vec { 104 | let mut result = vec![]; 105 | for block in &self.blocks { 106 | if block.block_type == block_type { 107 | result.push(*block); 108 | } 109 | } 110 | 111 | result 112 | } 113 | } 114 | 115 | #[derive(Debug, PartialEq, Eq)] 116 | pub(crate) struct IndexBlockEntry { 117 | pub value_offset: Offset, 118 | pub timestamp: Timestamp, 119 | pub key: Bytes, 120 | } 121 | 122 | impl IndexBlockEntry { 123 | pub fn encode(&self) -> Bytes { 124 | let mut fbb = FlatBufferBuilder::new(); 125 | 126 | let value_offset = protos::Offset::new(self.value_offset); 127 | let timestamp = protos::Timestamp::new(self.timestamp); 128 | let key_bytes = fbb.create_vector(&self.key); 129 | 130 | let entry = protos::IndexBlockEntry::create( 131 | &mut fbb, 132 | &protos::IndexBlockEntryArgs { 133 | value_offset: Some(&value_offset), 134 | timestamp: Some(×tamp), 135 | key: Some(key_bytes), 136 | }, 137 | ); 138 | 139 | fbb.finish(entry, None); 140 | fbb.finished_data().to_vec() 141 | } 142 | 143 | pub fn decode(bytes: &[u8]) -> Self { 144 | let fb_entry = flatbuffers::root::>(bytes).unwrap(); 145 | 146 | Self { 147 | value_offset: fb_entry.value_offset().unwrap().offset(), 148 | timestamp: fb_entry.timestamp().unwrap().timestamp(), 149 | key: fb_entry.key().unwrap().bytes().to_vec(), 150 | } 151 | } 152 | } 153 | 154 | #[cfg(test)] 155 | mod test { 156 | use super::*; 157 | 158 | #[test] 159 | fn sstable_super_block_codec() { 160 | let block_info = BlockInfo { 161 | block_type: BlockType::IndexBlock, 162 | offset: 40960, 163 | length: 10240, 164 | }; 165 | let sb = SSTableSuperBlock { 166 | thread_id: 3, 167 | level_id: 5, 168 | blocks: vec![block_info], 169 | }; 170 | 171 | let bytes = sb.encode(); 172 | assert_eq!(bytes.len(), SSTableSuperBlock::LENGTH); 173 | assert_eq!(sb, SSTableSuperBlock::decode(&bytes)); 174 | } 175 | 176 
| #[test] 177 | fn sstable_index_entry_codec() { 178 | let entry = IndexBlockEntry { 179 | value_offset: 40960, 180 | timestamp: 12345, 181 | key: b"value".to_vec(), 182 | }; 183 | 184 | let bytes = entry.encode(); 185 | assert_eq!(entry, IndexBlockEntry::decode(&bytes)); 186 | } 187 | } 188 | -------------------------------------------------------------------------------- /src/util.rs: -------------------------------------------------------------------------------- 1 | use std::borrow::Borrow; 2 | use std::cmp::Ordering; 3 | use std::convert::TryInto; 4 | use std::marker::PhantomData; 5 | use std::ops::Index; 6 | 7 | use crate::error::{HelixError, Result}; 8 | use crate::types::{Bytes, Entry}; 9 | 10 | pub(crate) trait KeyExtractor>: Eq { 11 | fn key(data: &T) -> &[u8]; 12 | } 13 | 14 | impl + Borrow>> KeyExtractor for Vec { 15 | fn key(data: &T) -> &[u8] { 16 | &data.index(0).key 17 | } 18 | } 19 | 20 | // todo: remove Eq bound? 21 | pub trait Comparator: Send + Sync + Eq { 22 | fn cmp(lhs: &[u8], rhs: &[u8]) -> Ordering 23 | where 24 | Self: Sized; 25 | } 26 | 27 | #[derive(Eq, PartialEq)] 28 | pub(crate) struct OrderingHelper> { 29 | pub data: T, 30 | _c: PhantomData, 31 | } 32 | 33 | impl> Ord for OrderingHelper { 34 | fn cmp(&self, other: &Self) -> Ordering { 35 | C::cmp(T::key(&self.data), T::key(&other.data)) 36 | } 37 | } 38 | 39 | impl> PartialOrd for OrderingHelper { 40 | fn partial_cmp(&self, other: &Self) -> Option { 41 | Some(self.cmp(other)) 42 | } 43 | } 44 | 45 | impl> From for OrderingHelper { 46 | fn from(data: T) -> Self { 47 | Self { 48 | data, 49 | _c: PhantomData, 50 | } 51 | } 52 | } 53 | 54 | #[derive(Eq, PartialEq)] 55 | /// This comparator returns `Ordering::Equal` for every operands. 56 | /// Which will ignore the provided left and right bound and result a full table 57 | /// scan. 
58 | /// 59 | /// # Example 60 | /// ```rust 61 | /// # use std::cmp::Ordering; 62 | /// # use helixdb::NoOrderComparator; 63 | /// # use crate::helixdb::Comparator; 64 | /// assert_eq!( 65 | /// NoOrderComparator::cmp(&[1, 2, 3], &[2, 3, 3]), 66 | /// Ordering::Equal 67 | /// ); 68 | /// assert_eq!(NoOrderComparator::cmp(&[1, 2, 3], &[1, 2]), Ordering::Equal); 69 | /// assert_eq!( 70 | /// NoOrderComparator::cmp(&[1, 2, 3], &[1, 2, 3]), 71 | /// Ordering::Equal 72 | /// ); 73 | /// ``` 74 | pub struct NoOrderComparator {} 75 | 76 | impl Comparator for NoOrderComparator { 77 | fn cmp(_: &[u8], _: &[u8]) -> Ordering { 78 | Ordering::Equal 79 | } 80 | } 81 | 82 | #[derive(PartialEq, Eq)] 83 | /// This comparator describes lexicographical order on `[u8]` 84 | /// 85 | /// # Example 86 | /// ```rust 87 | /// # use std::cmp::Ordering; 88 | /// # use helixdb::LexicalComparator; 89 | /// # use crate::helixdb::Comparator; 90 | /// assert_eq!( 91 | /// LexicalComparator::cmp(&[1, 2, 3], &[2, 3, 3]), 92 | /// Ordering::Less 93 | /// ); 94 | /// assert_eq!( 95 | /// LexicalComparator::cmp(&[1, 2, 3], &[1, 2]), 96 | /// Ordering::Greater 97 | /// ); 98 | /// assert_eq!( 99 | /// LexicalComparator::cmp(&[1, 2, 3], &[1, 2, 3]), 100 | /// Ordering::Equal 101 | /// ); 102 | /// ``` 103 | pub struct LexicalComparator {} 104 | 105 | impl Comparator for LexicalComparator { 106 | fn cmp(lhs: &[u8], rhs: &[u8]) -> Ordering { 107 | lhs.cmp(rhs) 108 | } 109 | } 110 | 111 | pub fn encode_u64(data: u64) -> Bytes { 112 | data.to_le_bytes().to_vec() 113 | } 114 | 115 | pub fn decode_u64(data: &[u8]) -> u64 { 116 | u64::from_le_bytes(data.try_into().unwrap()) 117 | } 118 | 119 | /// Check the length of data. 
Return `HelixError::IncompatibleLength` 120 | pub fn check_bytes_length(data: &[u8], length: usize) -> Result<()> { 121 | if data.len() == length { 122 | Ok(()) 123 | } else { 124 | Err(HelixError::IncompatibleLength(length, data.len())) 125 | } 126 | } 127 | 128 | pub(crate) trait AssertSend: Send {} 129 | 130 | pub(crate) trait AssertSync: Sync {} 131 | --------------------------------------------------------------------------------