├── .gitignore ├── Cargo.toml ├── LICENSE ├── README.md ├── benchmarks ├── Cargo.toml ├── microbenches │ └── entry_codec.rs ├── src │ ├── load.rs │ ├── main.rs │ ├── panel.rs │ ├── read.rs │ └── scan.rs └── ssd-test.fio ├── docs └── image │ └── helix_logo.jpg ├── protos ├── Cargo.toml ├── build.rs ├── flatbuffer │ └── helix.fbs └── src │ └── lib.rs ├── rust-toolchain.toml ├── rustfmt.toml └── src ├── blocks ├── block.rs └── mod.rs ├── cache.rs ├── compact_sched.rs ├── context.rs ├── db.rs ├── error.rs ├── file ├── file_manager.rs ├── mod.rs ├── rick.rs └── sstable.rs ├── fn_registry.rs ├── index.rs ├── io.rs ├── io_worker.rs ├── iterator.rs ├── level.rs ├── lib.rs ├── option.rs ├── table.rs ├── types ├── entry.rs ├── level_info.rs ├── mod.rs ├── rick.rs └── sstable.rs └── util.rs /.gitignore: -------------------------------------------------------------------------------- 1 | **/target 2 | Cargo.lock 3 | 4 | .vscode 5 | 6 | # flatbuffer generated 7 | *_generated.rs -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "helixdb" 3 | version = "0.1.0" 4 | authors = ["Ruihang Xia "] 5 | edition = "2018" 6 | 7 | [dependencies] 8 | thiserror = "1.0" 9 | protos = { path = "protos" } 10 | flatbuffers = "24" 11 | io-uring = "0.5.0" 12 | tokio = { version = "1.4", features = ["full"] } 13 | glommio = "0.9.0" 14 | async-trait = "0.1.51" 15 | lru = "0.6" 16 | futures-util = "0.3" 17 | crossbeam-channel = "0.5" 18 | tracing = "0.1.26" 19 | jemallocator = "0.3.2" 20 | num_cpus = "1.13" 21 | 22 | [dev-dependencies] 23 | tempfile = "3.2" 24 | tracing-subscriber = "0.2.18" 25 | 26 | [workspace] 27 | members = ["benchmarks"] 28 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, 
January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 
39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. 
Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. 
In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. 
We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | # HelixDB 4 | 5 | HelixDB is a Key-Value store written in Rust. Checkout our [wiki](https://github.com/waynexia/helixdb/wiki) to get more! 6 | 7 | # Features 8 | ## Time Series 9 | HelixDB is designed to serve time-series data. "Key-Value" definition here is (`User Key`, `Logical Timestamp`) => `Data` 10 | 11 | ## Time aware 12 | HelixDB organizes data in a time-aware way. This gives HelixDB the ability to efficiently processing time related requests like "Hierarchy" or "Outdate". 13 | 14 | ## Custom Compression 15 | HelixDB gives users an interface to customize their compression method that best suits their data. 16 | 17 | ## Async I/O & Thread-Per-Core 18 | HelixDB use io-uring provided by glommio as IO library. The thread-per-core architecture is also built on top of glommio. 19 | 20 | HelixDB provides async interface, which is `Send` and can be spawned into other async runtime like tokio. 
21 | 22 | # Status 23 | *This project is still in the early stages.* Laking of test coverage, robust functionality, documentation and other things. So 24 | 25 | Any discussion / suggestions / pull requests / issues / ... are welcome :heart: 26 | -------------------------------------------------------------------------------- /benchmarks/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "db_bench" 3 | version = "0.1.0" 4 | authors = ["Ruihang Xia "] 5 | edition = "2018" 6 | 7 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html 8 | 9 | [dependencies] 10 | helixdb = { path = "../" } 11 | tokio = { version = "1.4", features = ["full"] } 12 | indicatif = "0.15.0" 13 | clap = "2.33" 14 | procfs = "0.9.1" 15 | pprof = { version = "0.4.3", features = ["flamegraph"] } 16 | tracing-subscriber = "0.2.18" 17 | tracing = "0.1.26" 18 | rand = { version = "0.8" } 19 | 20 | [dev-dependencies] 21 | criterion = "0.3" 22 | 23 | [[bench]] 24 | name = "entry_codec" 25 | path = "microbenches/entry_codec.rs" 26 | harness = false 27 | -------------------------------------------------------------------------------- /benchmarks/microbenches/entry_codec.rs: -------------------------------------------------------------------------------- 1 | use criterion::{black_box, criterion_group, criterion_main, Criterion}; 2 | use helixdb::Entry; 3 | use rand::{thread_rng, Rng}; 4 | 5 | fn do_benchmark(key_size: usize, value_size: usize, c: &mut Criterion) { 6 | let mut rng = thread_rng(); 7 | let key = (0..key_size).map(|_| rng.gen()).collect(); 8 | let value = (0..value_size).map(|_| rng.gen()).collect(); 9 | let entry = Entry { 10 | timestamp: 1234423, 11 | key, 12 | value, 13 | }; 14 | 15 | c.bench_function( 16 | &format!("encode {}B / {}B entry", key_size, value_size), 17 | |b| b.iter(|| entry.encode()), 18 | ); 19 | let bytes = entry.encode(); 20 | c.bench_function( 21 | 
&format!("decode {}B / {}B entry", key_size, value_size), 22 | |b| b.iter(|| Entry::decode(&bytes)), 23 | ); 24 | } 25 | 26 | fn entry_codec_benchmark(c: &mut Criterion) { 27 | do_benchmark(64, 8, c); 28 | do_benchmark(64, 32, c); 29 | do_benchmark(64, 1024, c); 30 | do_benchmark(64, 4096, c); 31 | } 32 | 33 | fn fibonacci(n: u64) -> u64 { 34 | let mut a = 0; 35 | let mut b = 1; 36 | 37 | match n { 38 | 0 => b, 39 | _ => { 40 | for _ in 0..n { 41 | let c = a + b; 42 | a = b; 43 | b = c; 44 | } 45 | b 46 | } 47 | } 48 | } 49 | 50 | fn some_bench(c: &mut Criterion) { 51 | c.bench_function("fib 20", |b| b.iter(|| fibonacci(black_box(20)))); 52 | } 53 | 54 | criterion_group!(benches, entry_codec_benchmark, some_bench); 55 | criterion_main!(benches); 56 | -------------------------------------------------------------------------------- /benchmarks/src/load.rs: -------------------------------------------------------------------------------- 1 | use std::sync::atomic::{AtomicU64, Ordering}; 2 | use std::sync::Arc; 3 | 4 | use helixdb::{Entry, HelixDB}; 5 | use tokio::runtime::Builder; 6 | 7 | use crate::panel::Panel; 8 | 9 | fn generate_entry(timestamp: i64, key: u64, value_size: usize) -> Entry { 10 | let key = key.to_le_bytes().to_vec(); 11 | let mut value = Vec::with_capacity(value_size); 12 | value.resize_with(value_size, Default::default); 13 | 14 | Entry { 15 | timestamp, 16 | key, 17 | value, 18 | } 19 | } 20 | 21 | pub fn load( 22 | helixdb: HelixDB, 23 | num_thread: usize, 24 | batch_size: usize, 25 | num_key: usize, 26 | num_timestamp: usize, 27 | value_size: usize, 28 | ) { 29 | let total_entry = num_key * num_timestamp; 30 | let mut panel = Panel::with_amount(total_entry as u64); 31 | 32 | let rt = Builder::new_multi_thread() 33 | .worker_threads(num_thread) 34 | .build() 35 | .unwrap(); 36 | let progress = Arc::new(AtomicU64::new(0)); 37 | panel.start(); 38 | 39 | for ts in 0..num_timestamp as i64 { 40 | let keys = (0..num_key as u64).collect::>(); 41 | // 
todo: shuffle keys for "random write". 42 | for keys in keys.chunks(batch_size) { 43 | let keys_len = keys.len() as u64; 44 | let helixdb = helixdb.clone(); 45 | let progress = progress.clone(); 46 | let write_batch = keys 47 | .iter() 48 | .map(|key| generate_entry(ts, *key, value_size)) 49 | .collect(); 50 | rt.spawn(async move { 51 | helixdb.put(write_batch).await.unwrap(); 52 | progress.fetch_add(keys_len, Ordering::Relaxed); 53 | }); 54 | } 55 | } 56 | 57 | loop { 58 | let progress = progress.load(Ordering::Relaxed); 59 | panel.observe(progress); 60 | if progress >= total_entry as u64 { 61 | break; 62 | } 63 | std::thread::sleep(std::time::Duration::from_millis(100)); 64 | } 65 | 66 | rt.block_on(helixdb.close()); 67 | } 68 | -------------------------------------------------------------------------------- /benchmarks/src/main.rs: -------------------------------------------------------------------------------- 1 | use std::convert::TryInto; 2 | use std::io::Write; 3 | use std::path::Path; 4 | use std::sync::Arc; 5 | 6 | use clap::{App, Arg, SubCommand}; 7 | 8 | mod load; 9 | mod panel; 10 | mod read; 11 | mod scan; 12 | 13 | use helixdb::option::Options; 14 | use helixdb::{FnRegistry, HelixDB, SimpleTimestampReviewer}; 15 | use load::load; 16 | use read::read; 17 | use scan::scan; 18 | use tracing::Level; 19 | 20 | fn main() { 21 | tracing_subscriber::fmt() 22 | .with_max_level(Level::TRACE) 23 | .init(); 24 | 25 | let matches = App::new("db_bench") 26 | .about("HelixDB benchmark tool") 27 | .arg( 28 | Arg::with_name("dir") 29 | .long("dir") 30 | .help("Database directory") 31 | .required(true) 32 | .takes_value(true), 33 | ) 34 | .arg( 35 | Arg::with_name("thread") 36 | .long("thread") 37 | .help("Working threads number") 38 | .default_value("8") 39 | .takes_value(true), 40 | ) 41 | .arg( 42 | Arg::with_name("shard") 43 | .long("shard") 44 | .help("Shards number") 45 | .default_value("8") 46 | .takes_value(true), 47 | ) 48 | .arg( 49 | 
Arg::with_name("compact_interval") 50 | .long("compact_interval") 51 | .help("Timestamp range (interval) of each compacted level") 52 | .default_value("1024"), 53 | ) 54 | .subcommand( 55 | SubCommand::with_name("fill") 56 | .about("Write data") 57 | .arg( 58 | Arg::with_name("batch_size") 59 | .long("batch_size") 60 | .help("batch size of each put request") 61 | .default_value("1024"), 62 | ) 63 | .arg( 64 | Arg::with_name("num_key") 65 | .long("num_key") 66 | .help("Number of different keys to fill") 67 | .default_value("1024"), 68 | ) 69 | .arg( 70 | Arg::with_name("num_timestamp") 71 | .long("num_timestamp") 72 | .help("Number of timestamp per key to fill") 73 | .default_value("1024"), 74 | ) 75 | .arg( 76 | Arg::with_name("value_size") 77 | .long("value_size") 78 | .help("Size of each value in Bytes") 79 | .default_value("1024"), 80 | ), 81 | ) 82 | .subcommand( 83 | SubCommand::with_name("read") 84 | .about("Read data") 85 | .arg( 86 | Arg::with_name("max_key") 87 | .long("max_key") 88 | .help("The max user key in database. This is used to specify key range.") 89 | .takes_value(true), 90 | ) 91 | .arg( 92 | Arg::with_name("max_timestamp") 93 | .long("max_timestamp") 94 | .help( 95 | "The max timestamp in database. 
This is used to specify timestamp \ 96 | range.", 97 | ) 98 | .takes_value(true), 99 | ) 100 | .arg( 101 | Arg::with_name("repeat_time") 102 | .long("repeat_time") 103 | .default_value("1024") 104 | .help("Repeat times"), 105 | ), 106 | ) 107 | .subcommand( 108 | SubCommand::with_name("scan") 109 | .about("Scan data") 110 | .arg( 111 | Arg::with_name("key_start") 112 | .long("key_start") 113 | .help("Start key of this scan (inclusive)"), 114 | ) 115 | .arg( 116 | Arg::with_name("key_end") 117 | .long("key_end") 118 | .help("End key of this scan (inclusive)"), 119 | ) 120 | .arg( 121 | Arg::with_name("timestamp_start") 122 | .long("timestamp_start") 123 | .help("Start timestamp of this scan (inclusive)"), 124 | ) 125 | .arg( 126 | Arg::with_name("timestamp_end") 127 | .long("timestamp_end") 128 | .help("End timestamp of this scan (inclusive)"), 129 | ) 130 | .arg( 131 | Arg::with_name("prefetch_size") 132 | .long("prefetch_size") 133 | .help("Prefetch buffer size") 134 | .default_value("8"), 135 | ) 136 | .arg( 137 | Arg::with_name("repeat_time") 138 | .long("repeat_time") 139 | .help("Repeat times") 140 | .default_value("1024"), 141 | ), 142 | ) 143 | .get_matches(); 144 | 145 | let dir = matches.value_of("dir").unwrap(); 146 | let num_thread = matches.value_of("thread").unwrap().parse().unwrap(); 147 | let num_shard = matches.value_of("shard").unwrap().parse().unwrap(); 148 | let compact_interval = matches 149 | .value_of("compact_interval") 150 | .unwrap() 151 | .parse() 152 | .unwrap(); 153 | let db = open_helix(dir, num_shard, compact_interval); 154 | // let guard = pprof::ProfilerGuard::new(100).unwrap(); 155 | 156 | match matches.subcommand() { 157 | ("fill", Some(sub_matches)) => { 158 | let batch_size = sub_matches.value_of("batch_size").unwrap().parse().unwrap(); 159 | let num_key = sub_matches.value_of("num_key").unwrap().parse().unwrap(); 160 | let num_timestamp = sub_matches 161 | .value_of("num_timestamp") 162 | .unwrap() 163 | .parse() 164 | 
.unwrap(); 165 | let value_size = sub_matches.value_of("value_size").unwrap().parse().unwrap(); 166 | 167 | load( 168 | db, 169 | num_thread, 170 | batch_size, 171 | num_key, 172 | num_timestamp, 173 | value_size, 174 | ); 175 | } 176 | 177 | ("read", Some(sub_matches)) => { 178 | let max_key = sub_matches.value_of("max_key").unwrap().parse().unwrap(); 179 | let max_ts = sub_matches 180 | .value_of("max_timestamp") 181 | .unwrap() 182 | .parse() 183 | .unwrap(); 184 | let repeat_time = sub_matches 185 | .value_of("repeat_time") 186 | .unwrap() 187 | .parse() 188 | .unwrap(); 189 | 190 | read(db, num_thread, max_key, max_ts, repeat_time); 191 | } 192 | 193 | ("scan", Some(sub_matches)) => { 194 | let prefetch_size = sub_matches 195 | .value_of("prefetch_size") 196 | .unwrap() 197 | .parse() 198 | .unwrap(); 199 | let repeat_time = sub_matches 200 | .value_of("repeat_time") 201 | .unwrap() 202 | .parse() 203 | .unwrap(); 204 | 205 | scan(db, num_thread, repeat_time, prefetch_size) 206 | } 207 | 208 | _ => unreachable!(), 209 | } 210 | 211 | // post process 212 | // todo: make flamegraph a option 213 | // if let Ok(report) = guard.report().build() { 214 | // let file = File::create("flamegraph.svg").unwrap(); 215 | // report.flamegraph(file).unwrap(); 216 | // }; 217 | std::io::stdout().flush().unwrap(); 218 | } 219 | 220 | fn open_helix>(path: P, num_shard: usize, compact_interval: i64) -> HelixDB { 221 | let simple_tsr = SimpleTimestampReviewer::new(compact_interval, 8192); 222 | let mut fn_registry = FnRegistry::new_noop(); 223 | fn_registry.register_sharding_key_fn(Arc::new(move |key| { 224 | u64::from_le_bytes(key.to_owned().try_into().unwrap()) as usize % num_shard 225 | })); 226 | 227 | let opts = Options::default() 228 | .shards(num_shard) 229 | .set_timestamp_reviewer(Box::new(simple_tsr)) 230 | .set_fn_registry(fn_registry) 231 | .set_task_buffer_size(1024); 232 | 233 | HelixDB::open(path, opts) 234 | } 235 | 
-------------------------------------------------------------------------------- /benchmarks/src/panel.rs: -------------------------------------------------------------------------------- 1 | use std::time::Instant; 2 | 3 | use indicatif::{ProgressBar, ProgressStyle}; 4 | use procfs::{diskstats, DiskStat}; 5 | 6 | pub struct Panel { 7 | amount: u64, 8 | processed: u64, 9 | 10 | bar: ProgressBar, 11 | timer: Instant, 12 | disk_monitor: DiskMonitor, 13 | } 14 | 15 | impl Panel { 16 | pub fn with_amount(amount: u64) -> Self { 17 | let bar = ProgressBar::new(amount); 18 | bar.set_style( 19 | ProgressStyle::default_bar() 20 | .template("{prefix:.bold.dim} [{bar:60}] ({pos}/{len}) {msg}") 21 | .progress_chars("=> "), 22 | ); 23 | 24 | Self { 25 | amount, 26 | processed: 0, 27 | bar, 28 | timer: Instant::now(), 29 | disk_monitor: DiskMonitor::new(), 30 | } 31 | } 32 | 33 | pub fn start(&mut self) { 34 | self.timer = Instant::now(); 35 | } 36 | 37 | pub fn observe(&mut self, progress: u64) { 38 | self.bar.set_position(progress); 39 | self.processed = progress; 40 | 41 | if progress >= self.amount { 42 | self.finish(); 43 | } 44 | } 45 | 46 | pub fn increase(&mut self, delta: u64) { 47 | self.bar.inc(delta); 48 | self.processed += delta; 49 | 50 | if self.processed >= self.amount { 51 | self.finish(); 52 | } 53 | } 54 | 55 | #[allow(dead_code)] 56 | pub fn reset(&mut self, _amount: usize) { 57 | todo!() 58 | } 59 | 60 | fn finish(&mut self) { 61 | let elapsed_ms = self.timer.elapsed().as_millis(); 62 | self.bar.finish_with_message("done"); 63 | 64 | println!("elapsed: {:?} ms", elapsed_ms); 65 | println!( 66 | "average: {:.2} op/sec", 67 | self.amount as f64 / (elapsed_ms as f64 / 1_000.0) 68 | ); 69 | 70 | self.disk_monitor.finish(); 71 | } 72 | 73 | fn conclude(&mut self) { 74 | // todo!() 75 | } 76 | } 77 | 78 | impl Drop for Panel { 79 | fn drop(&mut self) { 80 | self.conclude() 81 | } 82 | } 83 | 84 | struct DiskMonitor { 85 | records: Vec, 86 | } 87 | 88 | impl 
DiskMonitor { 89 | pub fn new() -> Self { 90 | let records = DiskMonitor::stats_iter() 91 | .map(DiskRecord::from_stat) 92 | .collect(); 93 | 94 | Self { records } 95 | } 96 | 97 | pub fn finish(&self) { 98 | let delta = self 99 | .records 100 | .iter() 101 | .zip(DiskMonitor::stats_iter()) 102 | .map(|(record, stat)| record.delta(stat)) 103 | .collect::>(); 104 | 105 | println!("{:?}", delta); 106 | } 107 | 108 | /// Return a iterator of disk stat. Only stats that minor number equals to 0 109 | /// will be preserve. This means to only read the root devices' stat. 110 | fn stats_iter() -> impl Iterator { 111 | diskstats() 112 | .unwrap() 113 | .into_iter() 114 | .filter(|stat| stat.minor == 0) 115 | } 116 | } 117 | 118 | #[allow(dead_code)] 119 | #[derive(Debug)] 120 | struct DiskRecord { 121 | read_req: usize, 122 | read_sec: usize, 123 | time_reading_ms: usize, 124 | write_req: usize, 125 | write_sec: usize, 126 | time_writing_ms: usize, 127 | flush_req: Option, 128 | time_flushing_ms: Option, 129 | } 130 | 131 | impl DiskRecord { 132 | pub fn from_stat(stat: DiskStat) -> Self { 133 | Self { 134 | read_req: stat.reads, 135 | read_sec: stat.sectors_read, 136 | time_reading_ms: stat.time_reading, 137 | write_req: stat.writes, 138 | write_sec: stat.sectors_written, 139 | time_writing_ms: stat.time_writing, 140 | flush_req: stat.flushes, 141 | time_flushing_ms: stat.time_flushing, 142 | } 143 | } 144 | 145 | pub fn delta(&self, stat: DiskStat) -> Self { 146 | Self { 147 | read_req: stat.reads.wrapping_sub(self.read_req), 148 | read_sec: stat.sectors_read.wrapping_sub(self.read_sec), 149 | time_reading_ms: stat.time_reading.wrapping_sub(self.time_reading_ms), 150 | write_sec: stat.sectors_written.wrapping_sub(self.write_sec), 151 | write_req: stat.writes.wrapping_sub(self.write_req), 152 | time_writing_ms: stat.time_writing.wrapping_sub(self.time_writing_ms), 153 | // todo: option sub 154 | flush_req: stat.flushes, 155 | time_flushing_ms: stat.time_flushing, 156 | } 
157 | } 158 | } 159 | -------------------------------------------------------------------------------- /benchmarks/src/read.rs: -------------------------------------------------------------------------------- 1 | use std::sync::atomic::{AtomicU64, Ordering}; 2 | use std::sync::Arc; 3 | 4 | use helixdb::option::ReadOption; 5 | use helixdb::HelixDB; 6 | use rand::{thread_rng, Rng}; 7 | use tokio::runtime::Builder; 8 | 9 | use crate::panel::Panel; 10 | 11 | fn generate_key(key: u64) -> Vec { 12 | key.to_le_bytes().to_vec() 13 | } 14 | 15 | pub fn read(helixdb: HelixDB, num_thread: usize, max_key: u64, max_ts: i64, repeat_time: usize) { 16 | let mut panel = Panel::with_amount(repeat_time as u64); 17 | let found = Arc::new(AtomicU64::new(0)); 18 | 19 | let rt = Builder::new_multi_thread() 20 | .worker_threads(num_thread) 21 | .build() 22 | .unwrap(); 23 | let progress = Arc::new(AtomicU64::new(0)); 24 | panel.start(); 25 | 26 | // todo: shuffle keys for "random write". 27 | for _ in 0..repeat_time { 28 | let helixdb = helixdb.clone(); 29 | let progress = progress.clone(); 30 | let mut rng = thread_rng(); 31 | let key = generate_key(rng.gen_range(0..max_key)); 32 | let ts = rng.gen_range(0..max_ts); 33 | let found = found.clone(); 34 | rt.spawn(async move { 35 | if helixdb 36 | .get(ts, key, ReadOption::default()) 37 | .await 38 | .unwrap() 39 | .is_some() 40 | { 41 | found.fetch_add(1, Ordering::Relaxed); 42 | } 43 | progress.fetch_add(1, Ordering::Relaxed); 44 | }); 45 | } 46 | 47 | loop { 48 | let progress = progress.load(Ordering::Relaxed); 49 | panel.observe(progress); 50 | if progress >= repeat_time as u64 { 51 | break; 52 | } 53 | } 54 | 55 | println!("found {} / {}", found.load(Ordering::Relaxed), repeat_time); 56 | } 57 | -------------------------------------------------------------------------------- /benchmarks/src/scan.rs: -------------------------------------------------------------------------------- 1 | use std::sync::mpsc::channel; 2 | 3 | use 
helixdb::iterator::Iterator; 4 | use helixdb::option::ScanOption; 5 | use helixdb::{HelixDB, NoOrderComparator}; 6 | use tokio::runtime::Builder; 7 | 8 | use crate::panel::Panel; 9 | 10 | pub fn scan(helixdb: HelixDB, num_thread: usize, repeat_time: usize, prefetch_buf_size: usize) { 11 | let (tx, rx) = channel(); 12 | 13 | let mut panel = Panel::with_amount(repeat_time as u64); 14 | let rt = Builder::new_multi_thread() 15 | .worker_threads(num_thread) 16 | .build() 17 | .unwrap(); 18 | 19 | for _ in 0..repeat_time as u64 { 20 | let helixdb = helixdb.clone(); 21 | let tx = tx.clone(); 22 | rt.spawn(async move { 23 | let mut iter = helixdb 24 | .scan::( 25 | (0, 4).into(), 26 | ( 27 | 0usize.to_le_bytes().to_vec(), 28 | 1024usize.to_le_bytes().to_vec(), 29 | ), 30 | ScanOption { prefetch_buf_size }, 31 | ) 32 | .await 33 | .unwrap(); 34 | let mut scan_cnt = 0; 35 | while iter.next().await.unwrap().is_some() { 36 | scan_cnt += 1 37 | } 38 | println!("scanned {} item", scan_cnt); 39 | tx.send(()).unwrap(); 40 | }); 41 | } 42 | 43 | for _ in rx.iter().take(repeat_time) { 44 | panel.increase(1); 45 | } 46 | } 47 | -------------------------------------------------------------------------------- /benchmarks/ssd-test.fio: -------------------------------------------------------------------------------- 1 | # Do some important numbers on SSD drives, to gauge what kind of 2 | # performance you might get out of them. 3 | # 4 | # Sequential read and write speeds are tested, these are expected to be 5 | # high. Random reads should also be fast, random writes are where crap 6 | # drives are usually separated from the good drives. 7 | # 8 | # This uses a queue depth of 4. New SATA SSD's will support up to 32 9 | # in flight commands, so it may also be interesting to increase the queue 10 | # depth and compare. Note that most real-life usage will not see that 11 | # large of a queue depth, so 4 is more representative of normal use. 
12 | # 13 | [global] 14 | bs=4k 15 | ioengine=io_uring 16 | iodepth=64 17 | size=10g 18 | direct=1 19 | runtime=60 20 | directory=/home/wayne/repo/helixdb/benchmarks/target/fio 21 | filename=ssd.test.file 22 | 23 | [seq-read] 24 | rw=read 25 | stonewall 26 | 27 | [rand-read] 28 | rw=randread 29 | stonewall 30 | 31 | [seq-write] 32 | rw=write 33 | stonewall 34 | 35 | [rand-write] 36 | rw=randwrite 37 | stonewall -------------------------------------------------------------------------------- /docs/image/helix_logo.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/waynexia/helixdb/5efb4a4b42f15561428bbabbe89ec70eb8248871/docs/image/helix_logo.jpg -------------------------------------------------------------------------------- /protos/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "protos" 3 | version = "0.1.0" 4 | authors = ["Ruihang Xia "] 5 | edition = "2018" 6 | description = "Generated rust flatbuffer code." 
use std::process::Command;

type Error = Box<dyn std::error::Error>;
type Result<T> = std::result::Result<T, Error>;

/// Schema input and generated-code output locations, relative to the
/// `protos` package root.
const IN_DIR: &str = "flatbuffer/helix.fbs";
const OUT_DIR: &str = "src/";

/// Compile the flatbuffer schema to Rust using the external `flatc` binary.
///
/// # Panics
/// Panics when `flatc` is missing from `PATH` or exits with failure, so the
/// build fails loudly instead of silently using stale generated code.
fn main() -> Result<()> {
    // Scope rebuilds: without these directives cargo reruns the (external,
    // comparatively slow) codegen step on every build of the package.
    println!("cargo:rerun-if-changed={}", IN_DIR);
    println!("cargo:rerun-if-changed=build.rs");

    let status = Command::new("flatc")
        .arg("--rust")
        .arg("-o")
        .arg(OUT_DIR)
        .arg(IN_DIR)
        .status();

    match status {
        Ok(status) if !status.success() => panic!("`flatc` failed to compile the .fbs to Rust"),
        Ok(_status) => Ok(()), // Successfully compiled
        Err(err) => panic!("Could not execute `flatc`: {}", err),
    }
}
10 | struct Timestamp { 11 | timestamp: long; 12 | } 13 | 14 | struct TimeRange { 15 | start: Timestamp; 16 | end: Timestamp; 17 | } 18 | 19 | struct ThreadId{ 20 | id: uint64; 21 | } 22 | 23 | struct LevelId { 24 | id: uint64; 25 | } 26 | 27 | struct LevelDesc { 28 | time_range: TimeRange; 29 | id: LevelId; 30 | } 31 | 32 | table LevelInfo { 33 | infos: [LevelDesc]; 34 | } 35 | 36 | struct Offset { 37 | offset: uint64; 38 | } 39 | 40 | enum ValueFormat : uint32 { 41 | RawValue, 42 | CompressedValue, 43 | } 44 | 45 | table RickSuperBlock { 46 | is_ordered: bool; 47 | legal_offset_start: Offset; 48 | legal_offset_end: Offset; 49 | value_format: ValueFormat; 50 | align_timestamp: Timestamp; 51 | } 52 | 53 | enum BlockType: uint64 { 54 | SuperBlock, 55 | IndexBlock, 56 | FilterBlock, 57 | } 58 | 59 | struct BlockInfo { 60 | block_type: BlockType; 61 | offset: Offset; 62 | length: uint64; 63 | } 64 | 65 | table SSTableSuperBlock { 66 | thread_id: ThreadId; 67 | level_id: LevelId; 68 | blocks: [BlockInfo]; 69 | } 70 | 71 | table IndexBlockEntry { 72 | value_offset: Offset; 73 | timestamp: Timestamp; 74 | key: [ubyte]; 75 | } 76 | -------------------------------------------------------------------------------- /protos/src/lib.rs: -------------------------------------------------------------------------------- 1 | #![allow(unused_imports)] 2 | #![allow(clippy::all)] 3 | 4 | pub mod helix_generated; 5 | 6 | pub use helix_generated::helix::*; 7 | -------------------------------------------------------------------------------- /rust-toolchain.toml: -------------------------------------------------------------------------------- 1 | [toolchain] 2 | channel = "nightly-2024-10-19" 3 | -------------------------------------------------------------------------------- /rustfmt.toml: -------------------------------------------------------------------------------- 1 | imports_layout = "HorizontalVertical" 2 | imports_granularity = "Module" 3 | group_imports = "StdExternalCrate" 4 | 
edition = "2018" 5 | format_code_in_doc_comments = true 6 | format_macro_matchers = true 7 | format_strings = true 8 | wrap_comments = true 9 | -------------------------------------------------------------------------------- /src/blocks/block.rs: -------------------------------------------------------------------------------- 1 | pub struct Block { 2 | data: Vec, 3 | } 4 | 5 | impl Block { 6 | pub fn data(&self) -> &[u8] { 7 | &self.data 8 | } 9 | } 10 | 11 | pub struct BlockBuilder { 12 | buf: Vec, 13 | } 14 | 15 | impl BlockBuilder { 16 | pub fn new(block_size: usize) -> Self { 17 | todo!(); 18 | } 19 | 20 | /// Return whether block_size exceeded. 21 | /// This is a no-op when returning true. 22 | pub fn add(&mut self, data: &[u8]) -> bool { 23 | todo!() 24 | } 25 | 26 | pub fn curr_size(&self) -> usize { 27 | todo!() 28 | } 29 | 30 | pub fn finish(&mut self) -> Block { 31 | todo!() 32 | } 33 | 34 | pub fn reset(&mut self) {} 35 | } 36 | 37 | pub trait BlockIter { 38 | type Key; 39 | type Value; 40 | 41 | fn new(block: Block) -> Self; 42 | 43 | fn seek(&mut self, key: &Self::Key) -> Option<()>; 44 | 45 | fn next(&mut self); 46 | 47 | fn value(&self) -> &Self::Value; 48 | } 49 | -------------------------------------------------------------------------------- /src/blocks/mod.rs: -------------------------------------------------------------------------------- 1 | mod block; 2 | -------------------------------------------------------------------------------- /src/cache.rs: -------------------------------------------------------------------------------- 1 | use std::cell::RefCell; 2 | use std::fmt::Debug; 3 | use std::rc::Rc; 4 | 5 | use lru::LruCache; 6 | 7 | use crate::error::Result; 8 | use crate::table::{TableIdentifier, TableReadHandle}; 9 | use crate::types::{Bytes, LevelId, ThreadId, Timestamp}; 10 | 11 | #[derive(Debug, Clone, Copy)] 12 | pub struct CacheConfig { 13 | /// Number of `SSTableHandle` cache entries. 
14 | pub table_handle_size: usize, 15 | /// Number of "Key - Value" cache entries. 16 | pub kv_cache_size: usize, 17 | /// Number of "Key - Compressed values" cache entries. 18 | pub kc_cache_size: usize, 19 | /// Number of "Key - Position in value log" cache entries. 20 | pub kp_cache_size: usize, 21 | 22 | /// The largest entry size will be held by kv_cache. 23 | pub kv_cache_threshold: usize, 24 | /// The largest entry size will be held by kc_cache. 25 | pub kc_cache_threshold: usize, 26 | } 27 | 28 | impl Default for CacheConfig { 29 | fn default() -> Self { 30 | Self { 31 | table_handle_size: 32, 32 | kp_cache_size: 512, 33 | kv_cache_size: 256, 34 | kc_cache_size: 64, 35 | kv_cache_threshold: 1024, 36 | kc_cache_threshold: 4096, 37 | } 38 | } 39 | } 40 | 41 | /// # Entry Cache 42 | /// There are three types of entry cache: kv (for Key to Value), kc (for Key to 43 | /// Compressed value bytes) and kp (for Key to corresponding value's Position in 44 | /// value log). 45 | /// 46 | /// As the total space for caching is limited, cache small and frequent (or hot) 47 | /// is better. 48 | pub struct Cache { 49 | config: CacheConfig, 50 | handle_cache: RefCell>>, 51 | 52 | kv_cache: RefCell>, 53 | kc_cache: RefCell>, 54 | // todo: make it a `VLogIdentifier`. 
55 | #[allow(clippy::type_complexity)] 56 | kp_cache: RefCell>, 57 | } 58 | 59 | impl Cache { 60 | pub fn with_config(config: CacheConfig) -> Self { 61 | Self { 62 | handle_cache: RefCell::new(LruCache::new(config.table_handle_size)), 63 | kv_cache: RefCell::new(LruCache::new(config.kv_cache_size)), 64 | kc_cache: RefCell::new(LruCache::new(config.kc_cache_size)), 65 | kp_cache: RefCell::new(LruCache::new(config.kp_cache_size)), 66 | 67 | config, 68 | } 69 | } 70 | 71 | pub fn default() -> Self { 72 | Self::with_config(CacheConfig::default()) 73 | } 74 | 75 | pub fn get_table_handle(&self, table_id: &TableIdentifier) -> Option> { 76 | self.handle_cache.borrow_mut().get(table_id).cloned() 77 | } 78 | 79 | pub async fn put_table_handle( 80 | &self, 81 | table_id: TableIdentifier, 82 | handle: Rc, 83 | ) -> Result<()> { 84 | self.handle_cache.borrow_mut().put(table_id, handle); 85 | 86 | Ok(()) 87 | } 88 | 89 | // todo: use `TimeKey` struct instead. 90 | pub fn get_key(&self, time_key: &(Timestamp, Bytes)) -> KeyCacheResult { 91 | if let Some(value) = self.kv_cache.borrow_mut().get(time_key) { 92 | return KeyCacheResult::Value(value.to_owned()); 93 | } else if let Some(compressed) = self.kc_cache.borrow_mut().get(time_key) { 94 | return KeyCacheResult::Compressed(compressed.to_owned()); 95 | } else if let Some((tid, lid, offset)) = self.kp_cache.borrow_mut().get(time_key) { 96 | return KeyCacheResult::Position(*tid, *lid, *offset); 97 | } 98 | 99 | KeyCacheResult::NotFound 100 | } 101 | 102 | pub fn put_key(&self, key_entry: KeyCacheEntry) { 103 | if let Some(value) = key_entry.value { 104 | if value.len() < self.config.kv_cache_threshold { 105 | self.kv_cache 106 | .borrow_mut() 107 | .put(key_entry.key.to_owned(), value.to_owned()); 108 | } 109 | } else if let Some(compressed) = key_entry.compressed { 110 | if compressed.len() < self.config.kv_cache_threshold { 111 | self.kc_cache 112 | .borrow_mut() 113 | .put(key_entry.key.to_owned(), compressed.to_owned()); 114 | 
} 115 | } else if let Some(position) = key_entry.position { 116 | self.kp_cache 117 | .borrow_mut() 118 | .put(key_entry.key.to_owned(), position); 119 | } 120 | } 121 | } 122 | 123 | pub enum KeyCacheResult { 124 | Value(Bytes), 125 | Compressed(Bytes), 126 | /// Thread id and level id is for constructing rick file's identifier. 127 | /// The third `usize` is offset. 128 | Position(ThreadId, LevelId, usize), 129 | NotFound, 130 | } 131 | 132 | impl Debug for KeyCacheResult { 133 | fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { 134 | let mut f = f.debug_struct("KeyCacheResult"); 135 | match self { 136 | KeyCacheResult::Value(bytes) => f.field("Value", &bytes.len()), 137 | KeyCacheResult::Compressed(bytes) => f.field("Compressed", &bytes.len()), 138 | KeyCacheResult::Position(tid, lid, offset) => f.field("Position", &(tid, lid, offset)), 139 | KeyCacheResult::NotFound => f.field("NotFound", &()), 140 | }; 141 | f.finish() 142 | } 143 | } 144 | 145 | /// For inserting Key into cache. 
146 | pub struct KeyCacheEntry<'a> { 147 | pub key: &'a (Timestamp, Bytes), 148 | pub value: Option<&'a Bytes>, 149 | pub compressed: Option<&'a Bytes>, 150 | pub position: Option<(ThreadId, LevelId, usize)>, 151 | } 152 | 153 | impl<'a> KeyCacheEntry<'a> { 154 | pub fn new(key: &'a (Timestamp, Bytes)) -> Self { 155 | Self { 156 | key, 157 | value: None, 158 | compressed: None, 159 | position: None, 160 | } 161 | } 162 | } 163 | -------------------------------------------------------------------------------- /src/compact_sched.rs: -------------------------------------------------------------------------------- 1 | use std::cell::RefCell; 2 | use std::collections::VecDeque; 3 | use std::mem; 4 | use std::rc::Rc; 5 | use std::time::Duration; 6 | 7 | use async_trait::async_trait; 8 | use glommio::timer::TimerActionRepeat; 9 | use glommio::TaskQueueHandle; 10 | 11 | use crate::error::Result; 12 | use crate::level::Levels; 13 | use crate::types::LevelId; 14 | 15 | #[async_trait(?Send)] 16 | pub(crate) trait CompactScheduler: 'static { 17 | fn enqueue(&self, l_id: LevelId); 18 | 19 | fn finished(&self, l_id: LevelId); 20 | 21 | async fn schedule(self: Rc) -> Option; 22 | 23 | fn install(self: Rc, tq: TaskQueueHandle) -> Result<()> { 24 | let sched = self.clone(); 25 | TimerActionRepeat::repeat_into(move || sched.clone().schedule(), tq)?; 26 | 27 | Ok(()) 28 | } 29 | } 30 | 31 | pub(crate) struct QueueUpCompSched { 32 | is_compacting: RefCell, 33 | interval: Duration, 34 | queue: RefCell>, 35 | delay_num: usize, 36 | levels: Rc>, 37 | tq: TaskQueueHandle, 38 | } 39 | 40 | impl QueueUpCompSched { 41 | /// Create a not fully initialized instance. The return value should be 42 | /// `init()` first. 43 | /// 44 | /// This is expected to create a "memory leak" manifests as cyclic reference 45 | /// ([Self] and [Levels]) after `init()`. 
46 | pub(crate) unsafe fn new_zeroed( 47 | interval: Duration, 48 | delay_num: usize, 49 | tq: TaskQueueHandle, 50 | ) -> Rc { 51 | Rc::new(Self { 52 | is_compacting: RefCell::new(false), 53 | interval, 54 | queue: RefCell::new(VecDeque::new()), 55 | delay_num, 56 | levels: mem::transmute::, Rc>>(Rc::new(())), 57 | tq, 58 | }) 59 | } 60 | 61 | /// Initialize this with given levels. 62 | pub(crate) fn init(self: Rc, levels: Rc>) { 63 | unsafe { 64 | let empty_rc = mem::replace( 65 | &mut (*(Rc::as_ptr(&self) as *mut QueueUpCompSched)).levels, 66 | levels.clone(), 67 | ); 68 | let _ = mem::transmute::>, Rc<()>>(empty_rc); 69 | } 70 | } 71 | 72 | fn enqueue(&self, l_id: LevelId) { 73 | self.queue.borrow_mut().push_back(l_id); 74 | } 75 | 76 | fn finished(&self, l_id: LevelId) { 77 | *self.is_compacting.borrow_mut() = false; 78 | } 79 | 80 | async fn schedule(self: Rc) -> Option { 81 | if *self.is_compacting.borrow() || self.queue.borrow().len() < self.delay_num { 82 | return Some(self.interval); 83 | } 84 | 85 | let level_id = self.queue.borrow_mut().pop_front().unwrap(); 86 | *self.is_compacting.borrow_mut() = true; 87 | 88 | let levels = self.levels.clone(); 89 | glommio::spawn_local_into( 90 | async move { 91 | // todo: propagate Error? 92 | let _ = levels.compact_level(level_id).await; 93 | }, 94 | self.tq, 95 | ) 96 | .unwrap() 97 | .detach(); 98 | 99 | Some(self.interval) 100 | } 101 | 102 | /// For writing mock test. 103 | /// 104 | /// # Panic 105 | /// `levels` in the returning object is not initialize (an empty `Weak`). 106 | /// Any operations make this to call `levels` will panic due to 107 | /// the attempt of trying to upgrade that empty weak pointer. 
108 | #[cfg(test)] 109 | pub(crate) fn default() -> (Rc, TaskQueueHandle) { 110 | let tq = glommio::executor().create_task_queue( 111 | glommio::Shares::default(), 112 | glommio::Latency::NotImportant, 113 | "test_comp_tq", 114 | ); 115 | let this = Self { 116 | is_compacting: RefCell::new(false), 117 | interval: Duration::from_secs(1), 118 | queue: RefCell::new(VecDeque::new()), 119 | delay_num: 3, 120 | levels: unsafe { 121 | std::mem::transmute::, Rc>>(Rc::new(())) 122 | }, 123 | tq, 124 | }; 125 | 126 | (Rc::new(this), tq) 127 | } 128 | } 129 | 130 | #[async_trait(?Send)] 131 | impl CompactScheduler for QueueUpCompSched { 132 | fn enqueue(&self, l_id: LevelId) { 133 | self.enqueue(l_id) 134 | } 135 | 136 | fn finished(&self, l_id: LevelId) { 137 | self.finished(l_id) 138 | } 139 | 140 | async fn schedule(self: Rc) -> Option { 141 | self.schedule().await 142 | } 143 | } 144 | -------------------------------------------------------------------------------- /src/context.rs: -------------------------------------------------------------------------------- 1 | use crate::file::FileManager; 2 | use crate::fn_registry::FnRegistry; 3 | 4 | pub struct Context { 5 | pub fn_registry: FnRegistry, 6 | pub(crate) file_manager: FileManager, 7 | } 8 | -------------------------------------------------------------------------------- /src/db.rs: -------------------------------------------------------------------------------- 1 | use std::collections::HashMap; 2 | use std::intrinsics::unlikely; 3 | use std::path::Path; 4 | use std::sync::atomic::{AtomicBool, Ordering}; 5 | use std::sync::Arc; 6 | 7 | use futures_util::future::{join_all, try_join_all}; 8 | use glommio::channels::channel_mesh::MeshBuilder; 9 | use glommio::{ExecutorJoinHandle, LocalExecutor, LocalExecutorBuilder}; 10 | use tokio::sync::mpsc::{channel as bounded_channel, Sender}; 11 | use tokio::sync::oneshot::channel as oneshot; 12 | use tokio::sync::Mutex; 13 | use tracing::info; 14 | 15 | use 
crate::context::Context; 16 | use crate::error::{HelixError, Result}; 17 | use crate::file::FileManager; 18 | use crate::io_worker::{IOWorker, Task}; 19 | use crate::iterator::{Iterator, ShardMuxTimeIterator, ShardTimeIterator, TimeIterator}; 20 | use crate::option::{Options, ReadOption, ScanOption}; 21 | use crate::types::{Bytes, Entry, TimeRange}; 22 | use crate::util::Comparator; 23 | 24 | /// Size of channels that used to do IPC between shards. 25 | const CHANNEL_MESH_SIZE: usize = 128; 26 | 27 | #[derive(Clone)] 28 | pub struct HelixDB { 29 | core: Arc, 30 | } 31 | 32 | impl HelixDB { 33 | pub fn open>(path: P, opts: Options) -> Self { 34 | Self { 35 | core: Arc::new(HelixCore::new(path, opts)), 36 | } 37 | } 38 | 39 | /// Open HelixDB with default [Options] 40 | pub fn open_default>(path: P) -> Self { 41 | let opts = Options::default(); 42 | Self::open(path, opts) 43 | } 44 | 45 | pub async fn put(&self, write_batch: Vec) -> Result<()> { 46 | self.core.sharding_put(write_batch).await 47 | } 48 | 49 | pub async fn direct_put(&self, shard_id: usize, write_batch: Vec) -> Result<()> { 50 | self.core.put_unchecked(shard_id, write_batch).await 51 | } 52 | 53 | pub async fn get(&self, timestamp: i64, key: Bytes, opt: ReadOption) -> Result> { 54 | self.core.sharding_get(timestamp, key, opt).await 55 | } 56 | 57 | pub async fn direct_get( 58 | &self, 59 | shard_id: usize, 60 | timestamp: i64, 61 | key: Bytes, 62 | opt: ReadOption, 63 | ) -> Result> { 64 | self.core.get_unchecked(shard_id, timestamp, key, opt).await 65 | } 66 | 67 | pub async fn scan( 68 | &self, 69 | time_range: TimeRange, 70 | key_range: (Bytes, Bytes), 71 | opt: ScanOption, 72 | ) -> Result { 73 | self.core.scan::(time_range, key_range, opt).await 74 | } 75 | 76 | pub async fn close(self) { 77 | info!("Closing HelixDB"); 78 | self.core.close().await; 79 | } 80 | } 81 | 82 | unsafe impl Send for HelixDB {} 83 | unsafe impl Sync for HelixDB {} 84 | 85 | pub(crate) struct HelixCore { 86 | /// Join 
handles of shards' working threads. 87 | worker_handle: Vec>, 88 | task_txs: Vec>, 89 | ctx: Arc, 90 | is_closed: AtomicBool, 91 | } 92 | 93 | impl HelixCore { 94 | fn new>(path: P, mut opts: Options) -> Self { 95 | let file_manager = FileManager::with_base_dir(path, opts.num_shard).unwrap(); 96 | let ctx = Arc::new(Context { 97 | file_manager, 98 | fn_registry: opts.fn_registry.take().unwrap(), 99 | }); 100 | let tsr = Arc::new(Mutex::new(opts.tsr.take().unwrap())); 101 | let level_info = LocalExecutor::default().run(async { 102 | // initialize components requiring runtime. 103 | Arc::new(Mutex::new( 104 | ctx.file_manager.open_level_info().await.unwrap(), 105 | )) 106 | }); 107 | 108 | let mut worker_handle = Vec::with_capacity(opts.num_shard); 109 | let mut task_txs = Vec::with_capacity(opts.num_shard); 110 | let mesh_builder = MeshBuilder::full(opts.num_shard, CHANNEL_MESH_SIZE); 111 | 112 | for tid in 0..opts.num_shard as u64 { 113 | let tsr = tsr.clone(); 114 | let ctx = ctx.clone(); 115 | let opts = opts.clone_partial(); 116 | let (tx, rx) = bounded_channel(opts.task_buffer_size); 117 | let mesh_builder = mesh_builder.clone(); 118 | let level_info = level_info.clone(); 119 | 120 | let handle = LocalExecutorBuilder::new(glommio::Placement::Fixed(tid as usize)) 121 | .spawn(move || async move { 122 | let (sender, receiver) = mesh_builder.join().await.unwrap(); 123 | let worker = IOWorker::try_new(tid, opts, tsr, level_info, ctx, sender) 124 | .await 125 | .unwrap(); 126 | worker.run(rx, receiver).await 127 | }) 128 | .unwrap(); 129 | 130 | worker_handle.push(handle); 131 | task_txs.push(tx); 132 | } 133 | 134 | Self { 135 | worker_handle, 136 | task_txs, 137 | ctx, 138 | is_closed: AtomicBool::new(false), 139 | } 140 | } 141 | 142 | /// Dispatch entries in write batch to corresponding shards. 
143 | async fn sharding_put(&self, write_batch: Vec) -> Result<()> { 144 | self.check_closed()?; 145 | 146 | let mut tasks = HashMap::>::new(); 147 | 148 | for entry in write_batch { 149 | let shard_id = self.ctx.fn_registry.sharding_fn()(&entry.key); 150 | tasks.entry(shard_id).or_default().push(entry); 151 | } 152 | 153 | let mut futures = Vec::with_capacity(tasks.len()); 154 | for (shard_id, write_batch) in tasks { 155 | futures.push(self.put_unchecked(shard_id, write_batch)); 156 | } 157 | 158 | try_join_all(futures).await?; 159 | Ok(()) 160 | } 161 | 162 | /// Put on specified shard without routing. 163 | async fn put_unchecked(&self, worker: usize, write_batch: Vec) -> Result<()> { 164 | self.check_closed()?; 165 | 166 | let (tx, rx) = oneshot(); 167 | let task = Task::Put(write_batch, tx); 168 | 169 | self.task_txs[worker].send(task).await?; 170 | 171 | rx.await? 172 | } 173 | 174 | async fn sharding_get( 175 | &self, 176 | timestamp: i64, 177 | key: Bytes, 178 | opt: ReadOption, 179 | ) -> Result> { 180 | self.check_closed()?; 181 | 182 | let shard_id = self.ctx.fn_registry.sharding_fn()(&key); 183 | self.get_unchecked(shard_id, timestamp, key, opt).await 184 | } 185 | 186 | /// Get on specified shard without routing. 187 | async fn get_unchecked( 188 | &self, 189 | worker: usize, 190 | timestamp: i64, 191 | key: Bytes, 192 | opt: ReadOption, 193 | ) -> Result> { 194 | self.check_closed()?; 195 | 196 | let (tx, rx) = oneshot(); 197 | let task = Task::Get(timestamp, key, tx, opt); 198 | 199 | self.task_txs[worker].send(task).await?; 200 | 201 | rx.await? 
202 | } 203 | 204 | async fn scan( 205 | &self, 206 | time_range: TimeRange, 207 | key_range: (Bytes, Bytes), 208 | opt: ScanOption, 209 | ) -> Result { 210 | self.check_closed()?; 211 | 212 | let iters: Vec<_> = (0..self.shards()) 213 | .map(|worker| (worker, key_range.clone())) 214 | .map(async |(worker, key_range)| -> Result<_> { 215 | let (tx, rx) = bounded_channel(opt.prefetch_buf_size); 216 | 217 | self.task_txs[worker] 218 | .send(Task::Scan( 219 | time_range, 220 | key_range.0, 221 | key_range.1, 222 | tx, 223 | Arc::new(C::cmp), 224 | )) 225 | .await?; 226 | Ok(ShardTimeIterator::new(rx).await) 227 | }) 228 | .collect(); 229 | 230 | let iters = try_join_all(iters).await?; 231 | let mux_iter = ShardMuxTimeIterator::::new(iters, opt.prefetch_buf_size).await; 232 | let iter = TimeIterator::new(mux_iter); 233 | 234 | Ok(iter) 235 | } 236 | 237 | async fn close(&self) { 238 | self.is_closed.store(true, Ordering::SeqCst); 239 | 240 | for index in 0..self.shards() { 241 | let _ = self.task_txs[index].send(Task::Shutdown).await; 242 | } 243 | 244 | join_all(self.task_txs.iter().map(|sender| sender.closed())).await; 245 | } 246 | 247 | fn shards(&self) -> usize { 248 | self.worker_handle.len() 249 | } 250 | 251 | fn check_closed(&self) -> Result<()> { 252 | // false positive 253 | #[allow(unused_unsafe)] 254 | if unsafe { unlikely(self.is_closed.load(Ordering::SeqCst)) } { 255 | return Err(HelixError::Closed); 256 | } 257 | 258 | Ok(()) 259 | } 260 | } 261 | 262 | impl Drop for HelixCore { 263 | fn drop(&mut self) { 264 | drop(std::mem::take(&mut self.task_txs)); 265 | 266 | for handle in std::mem::take(&mut self.worker_handle) { 267 | let _ = handle.join(); 268 | } 269 | } 270 | } 271 | 272 | #[cfg(test)] 273 | mod test { 274 | use std::convert::TryInto; 275 | 276 | use tempfile::tempdir; 277 | 278 | use super::*; 279 | use crate::{FnRegistry, LexicalComparator, SimpleTimestampReviewer}; 280 | 281 | #[tokio::test] 282 | async fn example() { 283 | let base_dir = 
tempdir().unwrap(); 284 | let db = HelixDB::open_default(base_dir.path()); 285 | 286 | let entry = Entry { 287 | timestamp: 0, 288 | key: b"key".to_vec(), 289 | value: b"value".to_vec(), 290 | }; 291 | db.put(vec![entry.clone()]).await.unwrap(); 292 | 293 | let result = db 294 | .get(0, b"key".to_vec(), ReadOption::default()) 295 | .await 296 | .unwrap(); 297 | assert_eq!(result.unwrap(), entry); 298 | } 299 | 300 | async fn scan_test_scaffold( 301 | num_shard: usize, 302 | num_timestamp: i64, 303 | num_key: u64, 304 | compact_interval: i64, 305 | ) { 306 | assert!(num_timestamp > 0, "timestamp number should be positive"); 307 | 308 | let mut fn_registry = FnRegistry::new_noop(); 309 | fn_registry.register_sharding_key_fn(Arc::new(move |key| { 310 | u64::from_le_bytes(key.to_owned().try_into().unwrap()) as usize % num_shard 311 | })); 312 | let simple_tsr = SimpleTimestampReviewer::new(compact_interval, i64::MAX); 313 | let opts = Options::default() 314 | .shards(num_shard) 315 | .set_fn_registry(fn_registry) 316 | .set_timestamp_reviewer(Box::new(simple_tsr)); 317 | let base_dir = tempdir().unwrap(); 318 | let db = HelixDB::open(base_dir.path(), opts); 319 | 320 | // write 321 | for timestamp in 0..num_timestamp { 322 | let entries = (0..num_key) 323 | .into_iter() 324 | .map(|key| Entry { 325 | timestamp, 326 | key: key.to_le_bytes().to_vec(), 327 | value: b"value".to_vec(), 328 | }) 329 | .collect(); 330 | db.put(entries).await.unwrap(); 331 | } 332 | 333 | println!("write finished"); 334 | 335 | // scan 336 | let mut iter = db 337 | .scan::( 338 | (0, num_timestamp).into(), 339 | (0u64.to_le_bytes().to_vec(), num_key.to_le_bytes().to_vec()), 340 | ScanOption { 341 | prefetch_buf_size: 1, 342 | }, 343 | ) 344 | .await 345 | .unwrap(); 346 | 347 | let mut count = 0; 348 | while iter.is_valid() { 349 | iter.next().await.unwrap(); 350 | count += 1; 351 | } 352 | 353 | assert_eq!(num_timestamp as u64 * num_key, count); 354 | } 355 | 356 | #[tokio::test] 357 | async 
fn scan_1_shard_without_compaction() { 358 | scan_test_scaffold(1, 10, 128, 1024).await; 359 | } 360 | 361 | #[tokio::test] 362 | async fn scan_many_shards_without_compaction() { 363 | scan_test_scaffold(num_cpus::get(), 10, 128, 1024).await; 364 | } 365 | 366 | #[tokio::test] 367 | async fn scan_many_shards_with_compaction() { 368 | scan_test_scaffold(2, 64, 8, 32).await; 369 | } 370 | 371 | #[tokio::test] 372 | async fn recover_from_restart() { 373 | let base_dir = tempdir().unwrap(); 374 | let opts = Options::default() 375 | .set_timestamp_reviewer(Box::new(SimpleTimestampReviewer::new(5, 100))) 376 | .shards(1); 377 | let db = HelixDB::open(base_dir.path(), opts); 378 | 379 | let tasks = (0..50) 380 | .map(|ts| { 381 | db.put(vec![Entry { 382 | timestamp: ts, 383 | key: b"key".to_vec(), 384 | value: b"value".to_vec(), 385 | }]) 386 | }) 387 | .collect::>(); 388 | try_join_all(tasks).await.unwrap(); 389 | db.close().await; 390 | 391 | let opts = Options::default() 392 | .set_timestamp_reviewer(Box::new(SimpleTimestampReviewer::new(5, 100))) 393 | .shards(1); 394 | let db = HelixDB::open(base_dir.path(), opts); 395 | for ts in 0..50 { 396 | let result = db 397 | .get(ts, b"key".to_vec(), ReadOption::default()) 398 | .await 399 | .unwrap(); 400 | assert_eq!(result.unwrap().value, b"value".to_vec()); 401 | } 402 | } 403 | } 404 | -------------------------------------------------------------------------------- /src/error.rs: -------------------------------------------------------------------------------- 1 | use std::io; 2 | 3 | use thiserror::Error; 4 | 5 | use crate::io_worker::Task; 6 | use crate::types::Entry; 7 | 8 | pub type Result = std::result::Result; 9 | 10 | #[derive(Error, Debug)] 11 | pub enum HelixError { 12 | #[error("IO error {0}")] 13 | IO(#[from] io::Error), 14 | #[error("Glommio error {0}")] 15 | Glommio(#[from] glommio::GlommioError<()>), 16 | #[error("Common HelixDB error")] 17 | Common, 18 | #[error("Element not found")] 19 | NotFound, 20 | 
#[error("Task dropped")] 21 | Dropped(#[from] tokio::sync::oneshot::error::RecvError), 22 | #[error("Failed to send due to Helix is stopped")] 23 | Stopped(#[from] tokio::sync::mpsc::error::SendError), 24 | #[error("Operation {0} is poisoned")] 25 | Poisoned(String), 26 | // todo: review this usage. 27 | #[error("Internal channel disconnected")] 28 | Disconnected(#[from] tokio::sync::mpsc::error::SendError>), 29 | #[error("Incompatible length or size, expect {0}, got {1}")] 30 | IncompatibleLength(usize, usize), 31 | #[error("Helix is closed")] 32 | Closed, 33 | #[error("Running into unreachable situation {0}")] 34 | Unreachable(String), 35 | } 36 | -------------------------------------------------------------------------------- /src/file/file_manager.rs: -------------------------------------------------------------------------------- 1 | use std::collections::HashMap; 2 | use std::fs; 3 | use std::path::{Path, PathBuf}; 4 | use std::rc::Rc; 5 | use std::sync::Arc; 6 | 7 | use tokio::sync::Mutex; 8 | use tracing::warn; 9 | 10 | use crate::error::{HelixError, Result}; 11 | use crate::io::File; 12 | use crate::types::{Bytes, LevelId, LevelInfo, ThreadId}; 13 | use crate::util::{AssertSend, AssertSync}; 14 | 15 | const COMMON_FILE_PREFIX: &str = "helix"; 16 | const COMMON_FILE_EXTENSION: &str = "hlx"; 17 | const BINARY_FILE_EXTENSION: &str = "bin"; 18 | 19 | const LEVEL_INFO_FILENAME: &str = "LEVEL_INFO"; 20 | 21 | pub(crate) enum FileType { 22 | Rick, 23 | VLog, 24 | SSTable, 25 | Manifest, 26 | Others(String), 27 | } 28 | 29 | impl FileType { 30 | fn file_name_desc(&self) -> &str { 31 | match self { 32 | FileType::Rick => "rick", 33 | FileType::VLog => "vlog", 34 | FileType::SSTable => "sst", 35 | FileType::Manifest => "manifest", 36 | FileType::Others(name) => name, 37 | } 38 | } 39 | } 40 | 41 | #[derive(Debug, Clone, Copy, Hash, PartialEq, Eq)] 42 | pub(crate) enum FileNo { 43 | LevelInfo, 44 | Rick(LevelId), 45 | SSTable(LevelId), 46 | } 47 | 48 | impl FileNo { 
49 | fn name(&self) -> String { 50 | match self { 51 | FileNo::LevelInfo => "LEVEL_INFO".to_string(), 52 | FileNo::Rick(l_id) => format!("rick-{}.{}", l_id, BINARY_FILE_EXTENSION), 53 | FileNo::SSTable(l_id) => format!("sst-{}.{}", l_id, BINARY_FILE_EXTENSION), 54 | } 55 | } 56 | } 57 | 58 | pub(crate) enum OtherType { 59 | /// Timestamp range of each level. 60 | LevelInfo, 61 | /// Thread Identifier. 62 | TId, 63 | } 64 | 65 | #[derive(Clone)] 66 | struct RawFilePtr(Rc); 67 | 68 | unsafe impl Send for RawFilePtr {} 69 | unsafe impl Sync for RawFilePtr {} 70 | 71 | /// Proxy for all file open/create operations. 72 | /// 73 | /// It will keep opened file until a explicit garbage collection. So others 74 | /// needn't to close file. 75 | pub(crate) struct FileManager { 76 | base_dir: PathBuf, 77 | // todo: GC. maybe do it when outdating some level. 78 | fd_pool: Arc>>, 79 | } 80 | 81 | impl AssertSync for FileManager {} 82 | impl AssertSend for FileManager {} 83 | 84 | impl FileManager { 85 | pub(crate) fn with_base_dir>(base_dir: P, shards: usize) -> Result { 86 | fs::create_dir_all(base_dir.as_ref())?; 87 | 88 | // check dir 89 | let dir_num = fs::read_dir(base_dir.as_ref())? 90 | .map(|dir| Ok(dir?.file_type()?.is_dir())) 91 | .collect::>>()? 
92 | .len(); 93 | if dir_num != shards { 94 | warn!( 95 | "Detected {} folder in {:?}, which isn't equal to given shard number {}", 96 | dir_num, 97 | base_dir.as_ref().to_str(), 98 | shards 99 | ); 100 | } 101 | 102 | // create sub dir 103 | for id in 0..shards { 104 | match fs::create_dir(base_dir.as_ref().join(id.to_string())) { 105 | Err(e) if e.kind() == std::io::ErrorKind::AlreadyExists => {} 106 | other => other?, 107 | } 108 | } 109 | 110 | Ok(Self { 111 | base_dir: base_dir.as_ref().to_path_buf(), 112 | fd_pool: Arc::default(), 113 | }) 114 | } 115 | 116 | pub(crate) async fn open(&self, tid: ThreadId, file_no: FileNo) -> Result> { 117 | if let Some(file_ptr) = self.fd_pool.lock().await.get(&(tid, file_no)) { 118 | return Ok(file_ptr.0.clone()); 119 | } 120 | 121 | let name = file_no.name(); 122 | let path = self.base_dir.join(tid.to_string()).join(name); 123 | let file = Rc::new(File::open(path).await?); 124 | let cache_file = file.clone(); 125 | self.fd_pool 126 | .lock() 127 | .await 128 | .insert((tid, file_no), RawFilePtr(cache_file)); 129 | 130 | Ok(file) 131 | } 132 | 133 | /// Close files not used by others, i.e., strong count is 1. 134 | /// 135 | /// # Notice 136 | /// As [FileManager] is shared between all shards, it keep all files that 137 | /// should not be visible to other shards. Trying to close with wrong `tid` 138 | /// is undefined behavior. 139 | pub(crate) async fn close_some(&self, tid: ThreadId) -> Result<()> { 140 | let free_list = self 141 | .fd_pool 142 | .lock() 143 | .await 144 | .extract_if(|(thread_id, _), file| *thread_id == tid && Rc::strong_count(&file.0) == 1) 145 | .collect::>(); 146 | 147 | for (_, file) in free_list { 148 | match Rc::try_unwrap(file.0) { 149 | Ok(file) => file.close().await?, 150 | Err(file) => { 151 | return Err(HelixError::Unreachable( 152 | "Going to close a file which is still referenced".to_string(), 153 | )); 154 | } 155 | } 156 | } 157 | 158 | Ok(()) 159 | } 160 | 161 | // todo: deprecate this. 
162 | /// Open or create [LevelInfo]. 163 | pub(crate) async fn open_level_info(&self) -> Result { 164 | let filename = self.base_dir.join(LEVEL_INFO_FILENAME); 165 | let file = File::open(filename).await?; 166 | 167 | // read all 168 | let size = file.size().await?; 169 | let buf = file.read(0, size).await?; 170 | file.close().await?; 171 | 172 | let level_info = LevelInfo::decode(&buf); 173 | Ok(level_info) 174 | } 175 | 176 | // todo: correct this. 177 | /// Refresh (overwrite) level info file. 178 | pub(crate) async fn sync_level_info(&self, bytes: Bytes) -> Result<()> { 179 | let filename = self.base_dir.join(LEVEL_INFO_FILENAME); 180 | let file = File::open(filename).await?; 181 | 182 | file.write(bytes, 0).await?; 183 | file.sync().await?; 184 | file.close().await?; 185 | 186 | Ok(()) 187 | } 188 | } 189 | 190 | #[cfg(test)] 191 | mod test { 192 | use std::os::unix::io::AsRawFd; 193 | 194 | use glommio::LocalExecutor; 195 | use tempfile::tempdir; 196 | 197 | use super::*; 198 | 199 | #[test] 200 | fn new_file_manager() { 201 | let ex = LocalExecutor::default(); 202 | let base_dir = tempdir().unwrap(); 203 | let file_manager = FileManager::with_base_dir(base_dir.path(), 8).unwrap(); 204 | 205 | ex.run(async { 206 | assert_eq!(base_dir.path().read_dir().unwrap().count(), 8); 207 | }); 208 | } 209 | 210 | #[test] 211 | fn reopen_file() { 212 | let ex = LocalExecutor::default(); 213 | let base_dir = tempdir().unwrap(); 214 | let file_manager = FileManager::with_base_dir(base_dir.path(), 1).unwrap(); 215 | 216 | ex.run(async { 217 | let info_file = file_manager.open(0, FileNo::LevelInfo).await.unwrap(); 218 | let first_fd = info_file.as_raw_fd(); 219 | 220 | drop(info_file); 221 | let info_file = file_manager.open(0, FileNo::LevelInfo).await.unwrap(); 222 | let second_fd = info_file.as_raw_fd(); 223 | 224 | assert_eq!(first_fd, second_fd); 225 | }); 226 | } 227 | } 228 | -------------------------------------------------------------------------------- 
/src/file/mod.rs: -------------------------------------------------------------------------------- 1 | mod file_manager; 2 | mod rick; 3 | mod sstable; 4 | 5 | pub(crate) use file_manager::{FileManager, FileNo}; 6 | pub use rick::Rick; 7 | pub use sstable::{IndexBlockBuilder, SSTable, TableBuilder}; 8 | -------------------------------------------------------------------------------- /src/file/rick.rs: -------------------------------------------------------------------------------- 1 | use std::collections::BTreeMap; 2 | use std::rc::Rc; 3 | use std::time::Instant; 4 | 5 | use tracing::trace; 6 | 7 | use crate::error::Result; 8 | use crate::index::MemIndex; 9 | use crate::io::File; 10 | use crate::types::{ 11 | Bytes, 12 | Entry, 13 | EntryMeta, 14 | Offset, 15 | RickSuperBlock, 16 | TimeRange, 17 | Timestamp, 18 | ValueFormat, 19 | }; 20 | use crate::util::check_bytes_length; 21 | 22 | /// Handles to entries in rick (level 0). 23 | /// 24 | /// Every shard will only have up to one rick file at any time. 25 | /// 26 | /// (above is out-of-date) 27 | /// 28 | /// Rick file may contains "hole" due to garbage collection. 29 | /// It will have a [RickSuperBlock] at the very beginning (offset 0) 30 | /// contains two pointers "start" and "end" (start < end) 31 | /// which can tell where the legal range is. The start pointer should 32 | /// points to a record's beginning. The start pointer is pushed by GC procedure 33 | /// and end pointer is pushed by both `append()` method and GC procedure. 34 | /// 35 | /// Rick can be either ordered or disordered, dependents on which level 36 | /// it sites. 37 | pub struct Rick { 38 | file: Rc, 39 | sb: RickSuperBlock, 40 | } 41 | 42 | impl Rick { 43 | /// Open a `Rick` from given file. 44 | /// 45 | /// Optional parameter `value_format` will be used to initialize a rick 46 | /// file. If the rick file is not empty it will be ignored. 
If `None` is 47 | /// provided, the `value_format` field in super block will be set to 48 | /// default value, which is `RawValue`. 49 | pub async fn open(file: Rc, value_format: Option) -> Result { 50 | let sb = Self::read_super_block(&file, value_format).await?; 51 | 52 | Ok(Self { file, sb }) 53 | } 54 | 55 | /// Returns vector of (timestamp, key, entry's offset) to update index. 56 | /// 57 | /// `sync()` will be called before return. 58 | /// 59 | /// Once this method return, this `append` operation is considered finished 60 | /// on rick file. Even if it crashed before returned indices are 61 | /// persist. 62 | /// 63 | /// Encoding format: | payload length (u64) | payload | 64 | // todo: is it necessary to return inserted timestamp and key? 65 | pub async fn append(&mut self, entries: Vec) -> Result> { 66 | let mut positions = Vec::with_capacity(entries.len()); 67 | let file_length = self.sb.legal_offset_end; 68 | 69 | // construct binary buffer. 70 | let mut buf = vec![]; 71 | for entry in entries { 72 | let bytes = entry.encode(); 73 | let length = EntryMeta::new(bytes.len() as u64).encode(); 74 | let buf_len_before = buf.len() as u64; 75 | buf.extend_from_slice(&length); 76 | buf.extend_from_slice(&bytes); 77 | positions.push((entry.timestamp, entry.key, file_length + buf_len_before)); 78 | } 79 | 80 | // write to file 81 | let new_file_length = file_length + buf.len() as u64; 82 | self.file.write(buf, file_length).await?; 83 | 84 | // update super block and sync 85 | self.sb.legal_offset_end = new_file_length; 86 | self.sync_super_block().await?; 87 | self.sync().await?; 88 | 89 | Ok(positions) 90 | } 91 | 92 | /// Read from a offset. 93 | /// 94 | /// Entry not found will be return as a error. 95 | /// 96 | /// Maybe verify key here? 
97 | pub async fn read(&self, offset: u64) -> Result { 98 | let meta_buf = self 99 | .file 100 | .read(offset, EntryMeta::meta_size() as u64) 101 | .await?; 102 | check_bytes_length(&meta_buf, EntryMeta::meta_size())?; 103 | let meta = EntryMeta::decode(&meta_buf); 104 | 105 | let offload_buf = self 106 | .file 107 | .read(offset + EntryMeta::meta_size() as u64, meta.length) 108 | .await?; 109 | check_bytes_length(&offload_buf, meta.length as usize)?; 110 | 111 | Ok(Entry::decode(&offload_buf)) 112 | } 113 | 114 | /// Reads offsets. 115 | // todo: this might be refined by batching io. 116 | pub async fn reads(&mut self, mut offsets: Vec) -> Result> { 117 | // fast pass 118 | if offsets.len() < 2 { 119 | return match offsets.first() { 120 | Some(offset) => Ok(vec![self.read(*offset).await?]), 121 | None => Ok(vec![]), 122 | }; 123 | } 124 | 125 | let now = Instant::now(); 126 | trace!("[rick] start reads {} entries", offsets.len()); 127 | 128 | offsets.sort_unstable(); 129 | let min = *offsets.first().unwrap(); 130 | let max = offsets.remove(offsets.len() - 1); 131 | let bytes = self.file.read(min, max - min).await?; 132 | let mut entries_iter = Self::decode_entries(&bytes)?.into_iter().peekable(); 133 | let mut entries = Vec::with_capacity(offsets.len() + 1); 134 | 135 | trace!( 136 | "[rick] read and decode takes {:?} ms", 137 | now.elapsed().as_millis() 138 | ); 139 | 140 | // filter decoded entries via given offsets 141 | for offset in &offsets { 142 | while entries_iter.peek().unwrap().1 + min != *offset { 143 | entries_iter.next(); 144 | } 145 | entries.push(entries_iter.next().unwrap().0); 146 | } 147 | 148 | trace!("[rick] filter takes {:?} ms", now.elapsed().as_millis()); 149 | 150 | // read the last offset 151 | entries.push(self.read(max).await?); 152 | 153 | Ok(entries) 154 | } 155 | 156 | pub fn is_compressed(&self) -> bool { 157 | self.sb.value_format == ValueFormat::CompressedValue 158 | } 159 | 160 | /// Scan this rick file and construct its memindex 
161 | /// 162 | /// Generally, Rick file will couple with a persisted index file SSTable. 163 | /// Except those new ricks that memindex is not flushed to disk yet. 164 | pub async fn construct_index(&self) -> Result { 165 | let contents = self 166 | .file 167 | .read(self.start(), self.end() - self.start()) 168 | .await?; 169 | let mut index = 0; 170 | 171 | let mut indices = BTreeMap::new(); 172 | let mut offset = RickSuperBlock::LENGTH; 173 | 174 | while index < contents.len() { 175 | let prefix_buf = &contents[index..index + EntryMeta::meta_size()]; 176 | index += EntryMeta::meta_size(); 177 | let meta = EntryMeta::decode(prefix_buf); 178 | let offload_length = meta.length as usize; 179 | let offload_buf = &contents[index..index + offload_length]; 180 | index += offload_length; 181 | let entry = Entry::decode(offload_buf); 182 | 183 | indices.insert((entry.timestamp, entry.key), offset as u64); 184 | offset += EntryMeta::meta_size() + offload_length; 185 | } 186 | 187 | let mem_index = MemIndex::from_existing(indices); 188 | Ok(mem_index) 189 | } 190 | 191 | pub async fn sync(&self) -> Result<()> { 192 | self.file.sync().await?; 193 | 194 | Ok(()) 195 | } 196 | 197 | /// Recycle entries in given `range` by free them using `fallocate` syscall. 198 | /// 199 | /// The general procedure would be like: 200 | /// - Traverse some entries from "start", for each entry 201 | /// - suit in `range`, should be recycle. 202 | /// - not suit, query index (if have) whether it is legal. Put it into 203 | /// "need rewrite" buffer if is, and discard if not. 204 | /// - Acquire write lock and write those "need rewrite" to the end of file. 205 | /// Then update index (if have) and sync index (if need). 206 | /// - Sync super block to update "start" and "end" pointer to make above 207 | /// change visible. After this the write l ock can be released. 208 | /// - Recycle space occupied by those offset is smaller than "start" 209 | /// pointer. 
210 | pub async fn garbage_collect(&self, range: TimeRange) -> Result<()> { 211 | // yield control to executor. 212 | glommio::yield_if_needed().await; 213 | 214 | todo!() 215 | } 216 | 217 | // This is a temporary work around. Should be replaced by `garbage_collect()` 218 | // above. 219 | pub async fn clean(&mut self) -> Result<()> { 220 | // mark as illegal 221 | self.sb.legal_offset_start = self.sb.legal_offset_end; 222 | self.sync_super_block().await 223 | } 224 | 225 | pub async fn push_legal_offset_start(&mut self, new_offset_start: Offset) -> Result<()> { 226 | if new_offset_start <= self.sb.legal_offset_start { 227 | return Ok(()); 228 | } 229 | 230 | self.sb.legal_offset_start = new_offset_start; 231 | self.sync_super_block().await 232 | } 233 | 234 | pub fn get_legal_offset_end(&self) -> Offset { 235 | self.sb.legal_offset_end 236 | } 237 | 238 | /// Read super block from the first 4KB block of file. 239 | /// And if file is empty a new super block will be created. 240 | /// 241 | /// `value_format` only works when initializing rick file. 242 | /// Default value is `RawValue`. 243 | async fn read_super_block( 244 | file: &File, 245 | value_format: Option, 246 | ) -> Result { 247 | // check whether super block exist. 248 | let file_length = file.size().await?; 249 | if file_length == 0 { 250 | let value_format = value_format.unwrap_or(ValueFormat::RawValue); 251 | // create super block and write it to file. 252 | let sb = RickSuperBlock { 253 | // todo: make it a parameter. 254 | is_ordered: false, 255 | legal_offset_start: RickSuperBlock::LENGTH as u64, 256 | legal_offset_end: RickSuperBlock::LENGTH as u64, 257 | // todo: make it a parameter. 258 | value_format, 259 | align_timestamp: 0, 260 | }; 261 | 262 | let buf = sb.encode(); 263 | file.write(buf, 0).await?; 264 | 265 | Ok(sb) 266 | } else { 267 | // otherwise read from head. 
268 | let buf = file.read(0, RickSuperBlock::LENGTH as u64).await?; 269 | let sb = RickSuperBlock::decode(&buf); 270 | 271 | Ok(sb) 272 | } 273 | } 274 | 275 | // todo: check crash consistency. 276 | async fn sync_super_block(&self) -> Result<()> { 277 | let buf = self.sb.encode(); 278 | self.file.write(buf, 0).await?; 279 | 280 | Ok(()) 281 | } 282 | 283 | /// Decode to entries and the offset over input bytes. 284 | // todo: let `construct_index()` use this 285 | fn decode_entries(contents: &[u8]) -> Result> { 286 | let mut index = 0; 287 | let mut offset = 0; 288 | let mut entries = vec![]; 289 | 290 | while index < contents.len() { 291 | let prefix_buf = &contents[index..index + EntryMeta::meta_size()]; 292 | index += EntryMeta::meta_size(); 293 | let meta = EntryMeta::decode(prefix_buf); 294 | let offload_length = meta.length as usize; 295 | let offload_buf = &contents[index..index + offload_length]; 296 | index += offload_length; 297 | let entry = Entry::decode(offload_buf); 298 | entries.push((entry, offset as u64)); 299 | 300 | offset += EntryMeta::meta_size() + offload_length; 301 | } 302 | 303 | Ok(entries) 304 | } 305 | 306 | /// Get rick's start offset 307 | #[inline] 308 | pub fn start(&self) -> Offset { 309 | self.sb.legal_offset_start 310 | } 311 | 312 | /// Get rick's end offset. 
313 | #[inline] 314 | pub fn end(&self) -> Offset { 315 | self.sb.legal_offset_end 316 | } 317 | 318 | pub fn get_align_ts(&self) -> Timestamp { 319 | self.sb.align_timestamp 320 | } 321 | 322 | pub async fn set_align_ts(&mut self, ts: Timestamp) -> Result<()> { 323 | self.sb.align_timestamp = ts; 324 | self.sync_super_block().await 325 | } 326 | } 327 | 328 | #[cfg(test)] 329 | mod test { 330 | use glommio::LocalExecutor; 331 | use tempfile::tempdir; 332 | 333 | use super::*; 334 | use crate::file::file_manager::FileManager; 335 | use crate::file::FileNo; 336 | 337 | #[test] 338 | fn new_super_block() { 339 | let ex = LocalExecutor::default(); 340 | 341 | ex.run(async { 342 | let base_dir = tempdir().unwrap(); 343 | let file_manager = FileManager::with_base_dir(base_dir.path(), 1).unwrap(); 344 | let rick_file = file_manager.open(0, FileNo::Rick(0)).await.unwrap(); 345 | let mut rick = Rick::open(rick_file, None).await.unwrap(); 346 | 347 | assert_eq!(RickSuperBlock::LENGTH, rick.start() as usize); 348 | assert_eq!(RickSuperBlock::LENGTH, rick.end() as usize); 349 | 350 | // write something 351 | let entry = Entry { 352 | timestamp: 1, 353 | key: b"key".to_vec(), 354 | value: b"value".to_vec(), 355 | }; 356 | rick.append(vec![entry.clone()]).await.unwrap(); 357 | let new_rick_end = rick.end(); 358 | assert_ne!(RickSuperBlock::LENGTH, rick.end() as usize); 359 | 360 | // close and open again 361 | drop(rick); 362 | file_manager.close_some(0).await.unwrap(); 363 | let rick_file = file_manager.open(0, FileNo::Rick(0)).await.unwrap(); 364 | let rick = Rick::open(rick_file, None).await.unwrap(); 365 | 366 | assert_eq!(RickSuperBlock::LENGTH, rick.start() as usize); 367 | assert_eq!(new_rick_end, rick.end()); 368 | }); 369 | } 370 | 371 | #[test] 372 | fn read_write_one_entry() { 373 | let ex = LocalExecutor::default(); 374 | 375 | ex.run(async { 376 | let base_dir = tempdir().unwrap(); 377 | let file_manager = FileManager::with_base_dir(base_dir.path(), 1).unwrap(); 
378 | let rick_file = file_manager.open(0, FileNo::Rick(0)).await.unwrap(); 379 | let mut rick = Rick::open(rick_file, None).await.unwrap(); 380 | 381 | let entry = Entry { 382 | timestamp: 1, 383 | key: b"key".to_vec(), 384 | value: b"value".to_vec(), 385 | }; 386 | rick.append(vec![entry.clone()]).await.unwrap(); 387 | 388 | let read_entry = rick.read(RickSuperBlock::LENGTH as u64).await.unwrap(); 389 | assert_eq!(entry, read_entry); 390 | }); 391 | } 392 | 393 | #[test] 394 | fn reconstruct_memindex() { 395 | let ex = LocalExecutor::default(); 396 | 397 | ex.run(async { 398 | let base_dir = tempdir().unwrap(); 399 | let file_manager = FileManager::with_base_dir(base_dir.path(), 1).unwrap(); 400 | let rick_file = file_manager.open(0, FileNo::Rick(0)).await.unwrap(); 401 | let mut rick = Rick::open(rick_file, None).await.unwrap(); 402 | 403 | let entries = vec![ 404 | // one key with three timestamps. 405 | (1, b"key1".to_vec(), b"value".to_vec()).into(), 406 | (2, b"key1".to_vec(), b"value".to_vec()).into(), 407 | (3, b"key1".to_vec(), b"value".to_vec()).into(), 408 | // overwrite 409 | (1, b"key2".to_vec(), b"value1".to_vec()).into(), 410 | (1, b"key2".to_vec(), b"value2".to_vec()).into(), 411 | ]; 412 | rick.append(entries.clone()).await.unwrap(); 413 | 414 | let memindex = rick.construct_index().await.unwrap(); 415 | 416 | assert_eq!(3, *memindex.user_keys.get(&b"key1".to_vec()).unwrap()); 417 | assert_eq!(1, *memindex.user_keys.get(&b"key2".to_vec()).unwrap()); 418 | 419 | for index in memindex.into_iter() { 420 | rick.read(index.1).await.unwrap(); 421 | } 422 | }); 423 | } 424 | 425 | #[test] 426 | fn rick_reads_method() { 427 | let ex = LocalExecutor::default(); 428 | 429 | ex.run(async { 430 | let base_dir = tempdir().unwrap(); 431 | let file_manager = FileManager::with_base_dir(base_dir.path(), 1).unwrap(); 432 | let rick_file = file_manager.open(0, FileNo::Rick(0)).await.unwrap(); 433 | let mut rick = Rick::open(rick_file, None).await.unwrap(); 434 | 435 
| let mut entries = vec![ 436 | (1, b"key1".to_vec(), b"value".to_vec()).into(), 437 | (2, b"key1".to_vec(), b"value".to_vec()).into(), 438 | (3, b"key1".to_vec(), b"value".to_vec()).into(), 439 | (1, b"key2".to_vec(), b"value".to_vec()).into(), 440 | (2, b"key2".to_vec(), b"value".to_vec()).into(), 441 | (3, b"key2".to_vec(), b"value".to_vec()).into(), 442 | (1, b"key3".to_vec(), b"value".to_vec()).into(), 443 | (2, b"key3".to_vec(), b"value".to_vec()).into(), 444 | (3, b"key3".to_vec(), b"value".to_vec()).into(), 445 | ]; 446 | let mut offsets: Vec = rick 447 | .append(entries.clone()) 448 | .await 449 | .unwrap() 450 | .into_iter() 451 | .map(|item| item.2) 452 | .collect(); 453 | 454 | // all entries 455 | let reads_result = rick.reads(offsets.clone()).await.unwrap(); 456 | assert_eq!(entries, reads_result); 457 | 458 | // eliminate some in the middle 459 | entries.remove(entries.len() / 2); 460 | offsets.remove(offsets.len() / 2); 461 | let reads_result = rick.reads(offsets.clone()).await.unwrap(); 462 | assert_eq!(entries, reads_result); 463 | }); 464 | } 465 | } 466 | -------------------------------------------------------------------------------- /src/file/sstable.rs: -------------------------------------------------------------------------------- 1 | use std::rc::Rc; 2 | use std::sync::Arc; 3 | 4 | use tracing::error; 5 | 6 | use crate::context::Context; 7 | use crate::error::{HelixError, Result}; 8 | use crate::file::{FileNo, Rick}; 9 | use crate::index::MemIndex; 10 | use crate::io::File; 11 | use crate::table::TableReadHandle; 12 | use crate::types::sstable::{BlockInfo, BlockType, IndexBlockEntry, SSTableSuperBlock}; 13 | use crate::types::{Bytes, LevelId, Offset, ThreadId, Timestamp}; 14 | use crate::util::{check_bytes_length, decode_u64, encode_u64}; 15 | 16 | pub struct SSTable { 17 | file: Rc, 18 | sb: SSTableSuperBlock, 19 | } 20 | 21 | impl SSTable { 22 | pub async fn open(file: Rc) -> Result { 23 | let sb = Self::read_super_block(&file).await?; 
24 | 25 | Ok(Self { file, sb }) 26 | } 27 | 28 | pub async fn into_read_handle(self, ctx: Arc) -> Result { 29 | // read index block 30 | let index_blocks = self.sb.get_block_info(BlockType::IndexBlock); 31 | if index_blocks.is_empty() { 32 | error!("index block is empty"); 33 | return Err(HelixError::NotFound); 34 | } 35 | let mut indices = vec![]; 36 | for block in index_blocks { 37 | let block_buf = self.file.read(block.offset, block.length).await?; 38 | check_bytes_length(&block_buf, block.length as usize)?; 39 | let memindex = IndexBlockReader::read(block_buf)?; 40 | indices.push(memindex); 41 | } 42 | // todo: merge multi mem-indices 43 | let memindex = indices.pop().unwrap(); 44 | 45 | // open rick file 46 | let rick_file = ctx 47 | .file_manager 48 | .open(self.sb.thread_id, FileNo::Rick(self.sb.level_id)) 49 | .await?; 50 | let rick = Rick::open(rick_file, None).await?; 51 | 52 | let handle = TableReadHandle::new(memindex, self, rick, ctx); 53 | Ok(handle) 54 | } 55 | 56 | /// Read super block from the first 4KB block of file. 57 | /// And if file is empty a new super block will be created. 58 | // todo: duplicate code with `Rick::read_super_block()` 59 | async fn read_super_block(file: &File) -> Result { 60 | // check whether super block exist. 61 | let file_length = file.size().await?; 62 | if file_length == 0 { 63 | // create super block and write it to file. 64 | let sb = SSTableSuperBlock { 65 | // todo: which default value? 66 | thread_id: 0, 67 | level_id: 0, 68 | blocks: vec![], 69 | }; 70 | 71 | let buf = sb.encode(); 72 | file.write(buf, 0).await?; 73 | 74 | Ok(sb) 75 | } else { 76 | // otherwise read from head. 
77 | let buf = file.read(0, SSTableSuperBlock::LENGTH as u64).await?; 78 | let sb = SSTableSuperBlock::decode(&buf); 79 | 80 | Ok(sb) 81 | } 82 | } 83 | } 84 | 85 | pub struct TableBuilder { 86 | thread_id: ThreadId, 87 | level_id: LevelId, 88 | file: Rc, 89 | block_buffer: Bytes, 90 | blocks: Vec, 91 | tail_offset: Offset, 92 | } 93 | 94 | impl TableBuilder { 95 | /// Start to build table. 96 | pub fn begin(thread_id: ThreadId, level_id: LevelId, file: Rc) -> Self { 97 | Self { 98 | thread_id, 99 | level_id, 100 | file, 101 | block_buffer: vec![], 102 | blocks: vec![], 103 | tail_offset: SSTableSuperBlock::LENGTH as u64, 104 | } 105 | } 106 | 107 | pub fn add_block(&mut self, block_builder: impl BlockBuilder) { 108 | let (block_type, mut block_data) = block_builder.finish(); 109 | 110 | let block_size = block_data.len() as u64; 111 | self.blocks.push(BlockInfo { 112 | block_type, 113 | offset: self.tail_offset, 114 | length: block_size, 115 | }); 116 | self.block_buffer.append(&mut block_data); 117 | self.tail_offset += block_size; 118 | } 119 | 120 | /// Consume this builder to build a SSTable. 
121 | pub async fn finish(self) -> Result<()> { 122 | // write super block 123 | let sb = SSTableSuperBlock { 124 | thread_id: self.thread_id, 125 | level_id: self.level_id, 126 | blocks: self.blocks, 127 | }; 128 | self.file.write(sb.encode(), 0).await?; 129 | 130 | debug_assert_eq!( 131 | self.block_buffer.len(), 132 | self.tail_offset as usize - SSTableSuperBlock::LENGTH 133 | ); 134 | // write other blocks 135 | // todo: finish this in one write req 136 | self.file 137 | .write(self.block_buffer, SSTableSuperBlock::LENGTH as u64) 138 | .await?; 139 | 140 | self.file.sync().await?; 141 | 142 | Ok(()) 143 | } 144 | } 145 | 146 | pub trait BlockBuilder { 147 | fn finish(self) -> (BlockType, Bytes); 148 | } 149 | 150 | pub struct IndexBlockBuilder { 151 | entry_buffer: Bytes, 152 | } 153 | 154 | impl IndexBlockBuilder { 155 | pub fn new() -> Self { 156 | Self { 157 | entry_buffer: vec![], 158 | } 159 | } 160 | 161 | pub fn from_memindex() -> Self { 162 | todo!() 163 | } 164 | 165 | pub fn add_entry(&mut self, key: &[u8], timestamp: Timestamp, offset: Offset) { 166 | let index_entry = IndexBlockEntry { 167 | value_offset: offset, 168 | timestamp, 169 | key: key.to_owned(), 170 | }; 171 | let mut entry_bytes = index_entry.encode(); 172 | let bytes_len = entry_bytes.len() as u64; 173 | 174 | self.entry_buffer.append(&mut encode_u64(bytes_len)); 175 | self.entry_buffer.append(&mut entry_bytes); 176 | } 177 | } 178 | 179 | impl BlockBuilder for IndexBlockBuilder { 180 | fn finish(self) -> (BlockType, Bytes) { 181 | (BlockType::IndexBlock, self.entry_buffer) 182 | } 183 | } 184 | 185 | pub trait BlockReader { 186 | type Output; 187 | 188 | fn read(_: Bytes) -> Result; 189 | } 190 | 191 | pub struct IndexBlockReader {} 192 | 193 | impl BlockReader for IndexBlockReader { 194 | type Output = MemIndex; 195 | 196 | fn read(mut data: Bytes) -> Result { 197 | let mut memindex = MemIndex::default(); 198 | 199 | // todo: benchmark this 200 | while !data.is_empty() { 201 | // read 
length 202 | let length_buf: Vec<_> = data.drain(..std::mem::size_of::()).collect(); 203 | check_bytes_length(&length_buf, std::mem::size_of::())?; 204 | let length = decode_u64(&length_buf) as usize; 205 | 206 | // read index entry 207 | let data_buf: Vec<_> = data.drain(..length).collect(); 208 | check_bytes_length(&data_buf, length)?; 209 | let index_entry = IndexBlockEntry::decode(&data_buf); 210 | 211 | memindex.insert(( 212 | index_entry.timestamp, 213 | index_entry.key, 214 | index_entry.value_offset, 215 | ))?; 216 | } 217 | 218 | Ok(memindex) 219 | } 220 | } 221 | 222 | #[cfg(test)] 223 | mod test { 224 | use glommio::LocalExecutor; 225 | use tempfile::tempdir; 226 | 227 | use super::*; 228 | use crate::file::FileManager; 229 | use crate::fn_registry::FnRegistry; 230 | 231 | #[test] 232 | fn index_block_builder_and_reader() { 233 | let ex = LocalExecutor::default(); 234 | ex.run(async { 235 | let mut builder = IndexBlockBuilder::new(); 236 | 237 | builder.add_entry(&b"key1".to_vec(), 1, 3); 238 | builder.add_entry(&b"key2".to_vec(), 1, 10); 239 | 240 | let (block_type, bytes) = builder.finish(); 241 | assert_eq!(BlockType::IndexBlock, block_type); 242 | 243 | let memindex = IndexBlockReader::read(bytes).unwrap(); 244 | assert_eq!(memindex.get(&(1, b"key1".to_vec())).unwrap(), Some(3)); 245 | assert_eq!(memindex.get(&(1, b"key2".to_vec())).unwrap(), Some(10)); 246 | }); 247 | } 248 | 249 | #[test] 250 | fn simple_table_builder() { 251 | let ex = LocalExecutor::default(); 252 | ex.run(async { 253 | let base_dir = tempdir().unwrap(); 254 | let file_manager = FileManager::with_base_dir(base_dir.path(), 1).unwrap(); 255 | let ctx = Arc::new(Context { 256 | file_manager, 257 | fn_registry: FnRegistry::new_noop(), 258 | }); 259 | let mut table_builder = TableBuilder::begin( 260 | 0, 261 | 1, 262 | ctx.file_manager.open(0, FileNo::SSTable(1)).await.unwrap(), 263 | ); 264 | let mut index_bb = IndexBlockBuilder::new(); 265 | 266 | let indices = vec![ 267 | 
(b"key1".to_vec(), 1, 1), 268 | (b"key2key2".to_vec(), 2, 2), 269 | (b"key333".to_vec(), 3, 3), 270 | ]; 271 | 272 | for index in &indices { 273 | index_bb.add_entry(&index.0, index.1, index.2); 274 | } 275 | table_builder.add_block(index_bb); 276 | table_builder.finish().await.unwrap(); 277 | 278 | let table_handle = 279 | SSTable::open(ctx.file_manager.open(0, FileNo::SSTable(1)).await.unwrap()) 280 | .await 281 | .unwrap() 282 | .into_read_handle(ctx) 283 | .await 284 | .unwrap(); 285 | 286 | for index in indices { 287 | assert_eq!( 288 | table_handle.get_offset(&(index.1, index.0)).unwrap(), 289 | Some(index.2) 290 | ); 291 | } 292 | assert_eq!( 293 | table_handle 294 | .get_offset(&(233, b"not exist".to_vec())) 295 | .unwrap(), 296 | None 297 | ); 298 | }); 299 | } 300 | } 301 | -------------------------------------------------------------------------------- /src/fn_registry.rs: -------------------------------------------------------------------------------- 1 | use std::collections::{HashMap, VecDeque}; 2 | use std::convert::TryInto; 3 | use std::sync::Arc; 4 | 5 | use crate::error::{HelixError, Result}; 6 | use crate::types::{Bytes, Timestamp}; 7 | 8 | /// Custom compaction function. This will be called when compacting L0 9 | /// files to L1. 10 | /// 11 | /// The inputs are key, [(timestamp, values),]. 12 | pub type CompressFn = Arc) -> Bytes + Send + Sync>; 13 | 14 | /// The inputs are key and compressed bytes. 15 | /// Output is [(timestamp, values),] 16 | pub type DecompressFn = Arc Vec<(Timestamp, Bytes)> + Send + Sync>; 17 | 18 | /// `UDCF` stands for "User Defined Compress Function". 19 | /// Includes compress and decompress implementation. 
20 | #[derive(Clone)] 21 | #[allow(clippy::upper_case_acronyms)] 22 | pub struct UDCF { 23 | name: String, 24 | compress_fn: CompressFn, 25 | decompress_fn: DecompressFn, 26 | } 27 | 28 | impl UDCF { 29 | pub fn new(name: String, compress_fn: CompressFn, decompress_fn: DecompressFn) -> Self { 30 | Self { 31 | name, 32 | compress_fn, 33 | decompress_fn, 34 | } 35 | } 36 | 37 | pub fn compress(&self) -> CompressFn { 38 | self.compress_fn.clone() 39 | } 40 | 41 | pub fn decompress(&self) -> DecompressFn { 42 | self.decompress_fn.clone() 43 | } 44 | } 45 | 46 | /// Determine compress function based on key. 47 | /// 48 | /// This will be called on each key that going to be compressed. 49 | pub type CompressDispatchFn = Arc &str + Send + Sync>; 50 | 51 | /// Dispatch key to different shards. Called "sharding a key". 52 | /// 53 | /// Input type is a reference to a key in `Bytes` and output is which shard this 54 | /// key belongs to. 55 | pub type ShardingKeyFn = Arc usize + Send + Sync>; 56 | 57 | pub struct FnRegistry { 58 | sharding_key_fn: ShardingKeyFn, 59 | dispatch_fn: CompressDispatchFn, 60 | compress_functions: HashMap, 61 | } 62 | 63 | impl FnRegistry { 64 | // #[cfg(test)] 65 | pub fn new_noop() -> Self { 66 | let mut compress_functions = HashMap::new(); 67 | compress_functions.insert("noop".to_string(), noop_udcf()); 68 | Self { 69 | sharding_key_fn: noop_sharding_key_fn(), 70 | dispatch_fn: noop_dispatch_fn(), 71 | compress_functions, 72 | } 73 | } 74 | 75 | pub fn register_udcf(&mut self, udcf: UDCF) { 76 | self.compress_functions.insert(udcf.name.clone(), udcf); 77 | } 78 | 79 | pub fn register_dispatch_fn(&mut self, dispatch_fn: CompressDispatchFn) { 80 | self.dispatch_fn = dispatch_fn; 81 | } 82 | 83 | pub fn register_sharding_key_fn(&mut self, sharding_key_fn: ShardingKeyFn) { 84 | self.sharding_key_fn = sharding_key_fn; 85 | } 86 | 87 | pub fn dispatch_fn(&self) -> CompressDispatchFn { 88 | self.dispatch_fn.clone() 89 | } 90 | 91 | pub fn 
sharding_fn(&self) -> &ShardingKeyFn {
        &self.sharding_key_fn
    }

    /// Look up a user-defined compress function (UDCF) by name.
    ///
    /// Returns [HelixError::NotFound] when no UDCF is registered under `name`.
    pub fn udcf(&self, name: &str) -> Result<UDCF> {
        self.compress_functions
            .get(name)
            .cloned()
            .ok_or(HelixError::NotFound)
    }

    /// Compress `data` (timestamp/value pairs belonging to `key`) with the
    /// UDCF selected by the dispatch function.
    pub fn compress_entries(&self, key: Bytes, data: Vec<(i64, Vec<u8>)>) -> Result<Bytes> {
        let compress_fn_name = self.dispatch_fn()(&key);
        let compress_fn = self.udcf(compress_fn_name)?.compress();
        Ok(compress_fn(key.clone(), data))
    }

    /// Decompress `data` previously produced by `compress_entries` for `key`.
    pub fn decompress_entries(&self, key: &[u8], data: &[u8]) -> Result<Vec<(Timestamp, Bytes)>> {
        let name = self.dispatch_fn()(key);
        let udcf = self.udcf(name)?;
        Ok(udcf.decompress()(key.to_owned(), data))
    }
}

pub fn noop_sharding_key_fn() -> ShardingKeyFn {
    Arc::new(|_| 0)
}

/// Dispatch all keys to [noop_udcf].
pub fn noop_dispatch_fn() -> CompressDispatchFn {
    Arc::new(|_| "noop")
}

/// A No-Op compress function.
///
/// Compress: first put all entries' bytes together. Then followed a block of
/// bytes records each entry's length in u64. The last 8 bytes is how many
/// entries sited.
/// ```text
/// | N var-length bytes | N * u64 for length | N as u64 |
/// ```
pub fn noop_udcf() -> UDCF {
    let compress_fn: CompressFn = Arc::new(|_key, ts_values| {
        let value_num = ts_values.len() as u64;

        // concat timestamp and value together.
        let ts_values: Vec<Bytes> = ts_values
            .into_iter()
            .map(|(ts, mut value)| {
                let mut ts_bytes = ts.to_le_bytes().to_vec();
                ts_bytes.append(&mut value);
                ts_bytes
            })
            .collect();

        // calculate length for every ts_value's bytes and put them together.
        // Note: the buffer holds `value_num` *u64* lengths, i.e.
        // `value_num * size_of::<u64>()` bytes — not `value_num` bytes.
        let mut value_length =
            Vec::with_capacity(value_num as usize * std::mem::size_of::<u64>());
        for bytes in &ts_values {
            value_length.extend_from_slice(&(bytes.len() as u64).to_le_bytes());
        }

        // concat all ts_values, lengths, and number of entries
        let mut concated_value = ts_values.concat();
        concated_value.append(&mut value_length);
        concated_value.extend_from_slice(&value_num.to_le_bytes());

        concated_value
    });

    const TIMESTAMP_SIZE: usize = std::mem::size_of::<Timestamp>();
    const U64_SIZE: usize = std::mem::size_of::<u64>();
    let decompress_fn: DecompressFn = Arc::new(|_key, raw_values| {
        let mut raw_values: VecDeque<u8> = raw_values.iter().cloned().collect();
        // `len` tracks the logical end of the not-yet-consumed prefix; it is
        // only decremented after a whole section is drained.
        let mut len = raw_values.len();

        // decode `N` (the trailing u64 entry count)
        let value_num_bytes: Vec<u8> = raw_values.drain(len - U64_SIZE..).collect();
        len -= U64_SIZE;
        let value_num = u64::from_le_bytes(value_num_bytes.try_into().unwrap()) as usize;

        // decode lengths. Draining from the back means `value_length` ends up
        // in reverse entry order (last entry's length first).
        let mut value_length = Vec::with_capacity(value_num);
        // start from 1
        for i in 1..=value_num {
            let length_bytes: Vec<u8> = raw_values.drain(len - i * U64_SIZE..).collect();
            let length = u64::from_le_bytes(length_bytes.try_into().unwrap()) as usize;
            value_length.push(length);
        }
        len -= U64_SIZE * value_num;

        // slice values back-to-front; push_front restores the original order.
        let mut values = VecDeque::with_capacity(value_num);
        for length in value_length {
            let mut ts_value_bytes: Vec<u8> = raw_values.drain(len - length..).collect();
            len -= length;
            let value_bytes = ts_value_bytes.drain(TIMESTAMP_SIZE..).collect();
            let timestamp = Timestamp::from_le_bytes(ts_value_bytes.try_into().unwrap());
            values.push_front((timestamp, value_bytes));
        }

        // convert VecDeque to Vec.
        // the decompress procedure is in reverse order.
        values.into()
    });

    UDCF::new("noop".to_string(), compress_fn, decompress_fn)
}

#[cfg(test)]
mod test {
    use super::*;

    #[test]
    fn noop_udcf_compress_decompress() {
        let udcf = noop_udcf();

        let key = b"key".to_vec();
        let ts_values = vec![
            (1, b"value1".to_vec()),
            (2, b"value2".to_vec()),
            (3, b"value3".to_vec()),
            (4, b"value1".to_vec()),
            (5, b"value3".to_vec()),
            (6, b"value2".to_vec()),
        ];

        let compressed = udcf.compress()(key.clone(), ts_values.clone());
        let decompressed = udcf.decompress()(key, &compressed);

        assert_eq!(ts_values, decompressed);
    }
}
--------------------------------------------------------------------------------
/src/index.rs:
--------------------------------------------------------------------------------
#[cfg(test)]
use std::collections::btree_map;
use std::collections::{BTreeMap, HashMap};
use std::ops::AddAssign;

use crate::error::Result;
use crate::types::{Bytes, Offset, TimeRange, Timestamp};

#[derive(Default, Debug)]
pub struct MemIndex {
    /// (timestamp, key) => value's position in rick file.
    pub index: BTreeMap<(Timestamp, Bytes), Offset>,
    /// Counting user key.
14 | pub user_keys: HashMap, 15 | } 16 | 17 | impl MemIndex { 18 | pub fn from_existing(index: BTreeMap<(Timestamp, Bytes), u64>) -> Self { 19 | let user_keys = HashMap::new(); 20 | let mut result = Self { 21 | index: BTreeMap::new(), 22 | user_keys, 23 | }; 24 | 25 | for (_, user_key) in index.keys() { 26 | result.update_user_key(user_key); 27 | } 28 | result.index = index; 29 | 30 | result 31 | } 32 | 33 | pub fn insert(&mut self, entry: (Timestamp, Bytes, u64)) -> Result<()> { 34 | let (timestamp, key, value) = entry; 35 | self.update_user_key(&key); 36 | self.index.insert((timestamp, key), value); 37 | 38 | Ok(()) 39 | } 40 | 41 | pub fn insert_entries(&mut self, entries: Vec<(Timestamp, Bytes, u64)>) -> Result<()> { 42 | for entry in entries { 43 | let (timestamp, key, value) = entry; 44 | self.update_user_key(&key); 45 | self.index.insert((timestamp, key), value); 46 | } 47 | 48 | Ok(()) 49 | } 50 | 51 | pub fn get(&self, time_key: &(Timestamp, Bytes)) -> Result> { 52 | Ok(self.index.get(time_key).copied()) 53 | } 54 | 55 | #[cfg(test)] 56 | pub fn into_iter(self) -> btree_map::IntoIter<(i64, std::vec::Vec), u64> { 57 | self.index.into_iter() 58 | } 59 | 60 | /// Get all existing user keys. 
61 | pub fn user_keys(&self) -> Vec { 62 | self.user_keys.keys().cloned().collect() 63 | } 64 | 65 | pub fn load_time_range(&self, range: TimeRange) -> Vec { 66 | let mut offsets = vec![]; 67 | for ((ts, _), offset) in &self.index { 68 | if range.contains(*ts) { 69 | offsets.push(*offset); 70 | } 71 | } 72 | 73 | offsets 74 | } 75 | 76 | pub fn purge_time_range(&mut self, range: TimeRange) { 77 | self.index.retain(|(ts, _), _| !range.contains(*ts)); 78 | } 79 | 80 | fn update_user_key(&mut self, user_key: &[u8]) { 81 | if !self.user_keys.contains_key(user_key) { 82 | self.user_keys.insert(user_key.to_vec(), 1); 83 | } else { 84 | self.user_keys.get_mut(user_key).unwrap().add_assign(1); 85 | } 86 | } 87 | } 88 | -------------------------------------------------------------------------------- /src/io.rs: -------------------------------------------------------------------------------- 1 | use std::os::unix::prelude::{AsRawFd, RawFd}; 2 | use std::path::Path; 3 | 4 | use glommio::io::{DmaFile, OpenOptions}; 5 | use glommio::ByteSliceMutExt; 6 | 7 | use crate::error::Result; 8 | use crate::types::Bytes; 9 | 10 | pub struct File(DmaFile); 11 | 12 | // todo: check these. required by async trait `Iterator` 13 | unsafe impl Send for File {} 14 | unsafe impl Sync for File {} 15 | 16 | impl File { 17 | /// Open or create on given path. 
18 | #[inline] 19 | pub async fn open>(path: P) -> Result { 20 | let file = OpenOptions::new() 21 | .create(true) 22 | .read(true) 23 | .write(true) 24 | .dma_open(path) 25 | .await?; 26 | 27 | Ok(File(file)) 28 | } 29 | 30 | #[inline] 31 | pub async fn read(&self, offset: u64, size: u64) -> Result { 32 | let read_result = self.0.read_at(offset, size as usize).await?; 33 | 34 | // todo: remove this copy 35 | Ok(read_result.to_vec()) 36 | } 37 | 38 | #[inline] 39 | pub async fn write(&self, bytes: Bytes, offset: u64) -> Result<()> { 40 | let mut buf = self.0.alloc_dma_buffer(bytes.len()); 41 | buf.as_bytes_mut().write_at(0, &bytes); 42 | 43 | self.0.write_at(buf, offset).await?; 44 | 45 | Ok(()) 46 | } 47 | 48 | #[inline] 49 | pub async fn sync(&self) -> Result<()> { 50 | self.0.fdatasync().await?; 51 | 52 | Ok(()) 53 | } 54 | 55 | #[inline] 56 | pub async fn size(&self) -> Result { 57 | Ok(self.0.file_size().await?) 58 | } 59 | 60 | /// Synchronous operation. 61 | #[inline] 62 | pub async fn truncate(&self, size: u64) -> Result<()> { 63 | self.0.truncate(size).await?; 64 | 65 | Ok(()) 66 | } 67 | 68 | #[inline] 69 | pub async fn close(self) -> Result<()> { 70 | self.0.close().await?; 71 | 72 | Ok(()) 73 | } 74 | } 75 | 76 | impl AsRawFd for File { 77 | fn as_raw_fd(&self) -> RawFd { 78 | self.0.as_raw_fd() 79 | } 80 | } 81 | -------------------------------------------------------------------------------- /src/io_worker.rs: -------------------------------------------------------------------------------- 1 | use std::cmp::Ordering; 2 | use std::rc::Rc; 3 | use std::sync::Arc; 4 | 5 | use glommio::channels::channel_mesh::{ 6 | Receivers as ChannelMeshReceiver, 7 | Senders as ChannelMeshSender, 8 | }; 9 | use glommio::sync::Gate; 10 | use glommio::{Latency, Shares}; 11 | use tokio::sync::mpsc::{Receiver, Sender}; 12 | use tokio::sync::oneshot::Sender as Notifier; 13 | use tokio::sync::Mutex; 14 | use tracing::trace; 15 | 16 | use 
crate::compact_sched::{CompactScheduler, QueueUpCompSched};
use crate::context::Context;
use crate::error::Result;
use crate::level::{Levels, TimestampReviewer};
use crate::option::{Options, ReadOption};
use crate::types::{Bytes, Entry, LevelInfo, ThreadId, TimeRange, Timestamp};
use crate::TimestampAction;

thread_local!(
    // todo: the api of glommio::Gate seems not very suit for our use case.
    // Expecting for a more ergonomic way to register critical task.
    // It is essentially a counter.
    /// A TLS variable for graceful shutdown.
    ///
    /// It will wait until all tasks spawned via it are finished when closing.
    pub static GATE: Rc<Gate> = Rc::new(Gate::new())
);

/// A un-Send handle to accept and process requests.
pub struct IOWorker {
    tid: ThreadId,
    levels: Rc<Levels>,
    // todo: maybe add channel mesh for scan
}

impl IOWorker {
    /// Build a worker for shard `tid`: create the compact task queue and
    /// scheduler, then the `Levels` storage that backs this shard.
    pub async fn try_new(
        tid: ThreadId,
        opts: Options,
        timestamp_reviewer: Arc<Mutex<Box<dyn TimestampReviewer>>>,
        level_info: Arc<Mutex<LevelInfo>>,
        ctx: Arc<Context>,
        ts_action_sender: ChannelMeshSender<TimestampAction>,
    ) -> Result<Self> {
        let compact_task_queue = glommio::executor().create_task_queue(
            Shares::default(),
            Latency::NotImportant,
            "compact_tq",
        );
        // Safety:
        // this is initialized below.
        let sched = unsafe {
            QueueUpCompSched::new_zeroed(opts.compact_prompt_interval, 2, compact_task_queue)
        };

        let levels = Levels::try_new(
            tid,
            opts.clone(),
            timestamp_reviewer,
            ctx,
            ts_action_sender,
            level_info,
            sched.clone(),
        )
        .await?;

        sched.clone().init(levels.clone());
        sched.install(compact_task_queue)?;

        Ok(Self { tid, levels })
    }

    /// Won't return until shut down.
    pub async fn run(
        self,
        mut rx: Receiver<Task>,
        mut ts_action_receiver: ChannelMeshReceiver<TimestampAction>,
    ) {
        let connected_receivers: Vec<_> = ts_action_receiver
            .streams()
            .into_iter()
            .map(|(_, rx)| rx)
            .collect();

        // One detached task per peer: forward timestamp actions into `Levels`.
        for rx in connected_receivers {
            let levels = self.levels.clone();
            let tid = self.tid;
            glommio::spawn_local(async move {
                while let Some(action) = rx.recv().await {
                    trace!("{} received action {:?}", tid, action);
                    let _ = levels.handle_actions(vec![action]).await;
                }
            })
            .detach();
        }

        // the `Error` case of `Gate::spawn()` is glommio runtime cannot find given task
        // queue which needn't to take into consideration since we don't specify
        // task queue.
        while let Some(task) = rx.recv().await {
            match task {
                Task::Put(entries, tx) => {
                    let levels = self.levels.clone();
                    GATE.with(|gate| {
                        gate.spawn(async move {
                            levels.put(entries, tx).await;
                        })
                        .unwrap()
                        .detach()
                    });
                }
                Task::Get(ts, key, tx, opt) => {
                    let levels = self.levels.clone();
                    GATE.with(|gate| {
                        gate.spawn(async move {
                            let result = levels.get(&(ts, key), opt).await;
                            let _ = tx.send(result);
                        })
                        .unwrap()
                        .detach()
                    });
                }
                Task::Scan(time_range, key_start, key_end, sender, cmp) => {
                    let levels = self.levels.clone();
                    GATE.with(|gate| {
                        gate.spawn(async move {
                            let _ = levels
                                .scan(time_range, key_start, key_end, sender, cmp)
                                .await;
                        })
                        .unwrap()
                        .detach()
                    });
                }
                Task::Shutdown => {
                    trace!("going to close shard {}", self.tid);

                    // Wait for all in-flight gate tasks before leaving.
                    let gate = GATE.with(|gate| gate.clone());
                    let _ = gate.close().await;

                    trace!("shard {} is closed", self.tid);
                    break;
                }
            }
        }
    }
}

pub enum Task {
    // todo: add put option
    Put(Vec<Entry>, Notifier<Result<()>>),
    Get(
        Timestamp,
        Bytes,
        Notifier<Result<Option<Entry>>>,
        ReadOption,
    ),
    /// time range, start key, end key, result sender, comparator
    Scan(
        TimeRange,
        Bytes,
        Bytes,
        Sender<Vec<Entry>>,
        Arc<dyn Fn(&Bytes, &Bytes) -> Ordering + Send + Sync>,
    ),
    Shutdown,
}

impl std::fmt::Debug for Task {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        // Report the variant so traces are actually useful. Payloads are
        // omitted: notifiers and comparators are not `Debug`, and entry
        // batches can be large.
        let variant = match self {
            Task::Put(..) => "Put",
            Task::Get(..) => "Get",
            Task::Scan(..) => "Scan",
            Task::Shutdown => "Shutdown",
        };
        f.debug_struct("Task").field("variant", &variant).finish()
    }
}

#[cfg(test)]
mod test {
    use futures_util::future::select_all;
    use glommio::channels::channel_mesh::{Full, MeshBuilder};
    use glommio::LocalExecutorBuilder;

    // todo: investigate this. The receiver will receive lots of message from "peer
    // 0" without any sender.
    #[test]
    #[ignore]
    fn channel_mesh_select_recv_loop() {
        let mesh_builder: MeshBuilder<(), Full> = MeshBuilder::full(8, 2);

        for _ in 0..7 {
            let mesh_builder = mesh_builder.clone();
            LocalExecutorBuilder::new(glommio::Placement::Unbound)
                .spawn(move || async move {
                    let (_, mut rx) = mesh_builder.join().await.unwrap();

                    let connected_receivers: Vec<_> =
                        rx.streams().into_iter().map(|(_, rx)| rx).collect();

                    loop {
                        let recvs = connected_receivers
                            .iter()
                            .map(|rx| {
                                let fut = rx.recv();
                                Box::pin(fut)
                            })
                            .collect::<Vec<_>>();

                        let action = select_all(recvs).await;
                        let (_, index, _) = action;
                        println!("{} received action from {}", rx.peer_id(), index);
                    }
                })
                .unwrap();
        }

        LocalExecutorBuilder::new(glommio::Placement::Unbound)
            .spawn(move || async move {
                let (tx, _) = mesh_builder.join().await.unwrap();

                for _ in 0..4 {
                    for peer in 0..7 {
                        tx.send_to(peer, ()).await.unwrap();
                    }
                    println!("finished once")
                }
            })
            .unwrap()
232 | .join() 233 | .unwrap(); 234 | } 235 | } 236 | -------------------------------------------------------------------------------- /src/iterator.rs: -------------------------------------------------------------------------------- 1 | use std::collections::BinaryHeap; 2 | 3 | use async_trait::async_trait; 4 | use tokio::sync::mpsc::Receiver; 5 | 6 | use crate::error::Result; 7 | use crate::types::Entry; 8 | use crate::util::{Comparator, KeyExtractor, OrderingHelper}; 9 | 10 | // todo: add type param 11 | #[async_trait] 12 | pub trait Iterator { 13 | async fn next(&mut self) -> Result>; 14 | 15 | fn is_valid(&self) -> bool; 16 | } 17 | 18 | /// Iterate over timestamp. i.e, (ts 0, key 1) -> (ts 1, key 1) -> (ts 2, key 19 | /// 1)... 20 | /// 21 | /// "Scan" is achieved via (lots of) `get()` 22 | pub struct TimeIterator { 23 | inner: ShardMuxTimeIterator, 24 | buf: Vec, 25 | } 26 | 27 | impl TimeIterator { 28 | pub(crate) fn new(mux_iter: ShardMuxTimeIterator) -> Self { 29 | Self { 30 | inner: mux_iter, 31 | buf: vec![], 32 | } 33 | } 34 | 35 | pub(crate) async fn next(&mut self) -> Result> { 36 | if self.buf.is_empty() { 37 | self.buf = ok_unwrap!(self.inner.next().await); 38 | } 39 | 40 | Ok(self.buf.pop()) 41 | } 42 | 43 | /// Valid when inner iterator is valid or its own buffer still contains 44 | /// things. 45 | pub(crate) fn is_valid(&self) -> bool { 46 | self.inner.is_valid() || !self.buf.is_empty() 47 | } 48 | } 49 | 50 | #[async_trait] 51 | impl Iterator for TimeIterator { 52 | async fn next(&mut self) -> Result> { 53 | self.next().await 54 | } 55 | 56 | fn is_valid(&self) -> bool { 57 | self.is_valid() 58 | } 59 | } 60 | 61 | pub(crate) struct ShardTimeIterator { 62 | ready: Option>, 63 | source: Receiver>, 64 | is_finished: bool, 65 | } 66 | 67 | impl ShardTimeIterator { 68 | // will wait source to yield the first element. 
69 | pub(crate) async fn new(mut source: Receiver>) -> Self { 70 | let ready = source.recv().await; 71 | let is_finished = ready.is_none(); 72 | 73 | Self { 74 | ready, 75 | source, 76 | is_finished, 77 | } 78 | } 79 | 80 | // todo: maybe add a trait `PeekableIterator` : `Iterator` 81 | pub fn peek(&self) -> Option<&Vec> { 82 | self.ready.as_ref() 83 | } 84 | 85 | /// Take current value but not step iterator after that. 86 | pub async fn take(&mut self) -> Result>> { 87 | let ready = self.ready.take(); 88 | self.step().await?; 89 | 90 | Ok(ready) 91 | } 92 | 93 | async fn step(&mut self) -> Result<()> { 94 | if self.is_finished { 95 | return Ok(()); 96 | } 97 | 98 | match self.source.recv().await { 99 | Some(item) => self.ready = Some(item), 100 | None => self.is_finished = true, 101 | } 102 | 103 | Ok(()) 104 | } 105 | 106 | pub fn is_valid(&self) -> bool { 107 | !self.is_finished 108 | } 109 | } 110 | 111 | pub struct ShardMuxTimeIterator { 112 | iters: Vec, 113 | entry_buf: BinaryHeap>>, 114 | } 115 | 116 | impl ShardMuxTimeIterator { 117 | pub(crate) async fn new(iters: Vec, buf_size: usize) -> Self { 118 | let mut s = Self { 119 | iters, 120 | entry_buf: BinaryHeap::default(), 121 | }; 122 | s.init(buf_size).await; 123 | 124 | s 125 | } 126 | 127 | async fn next(&mut self) -> Option> { 128 | if self.entry_buf.is_empty() { 129 | return None; 130 | } 131 | 132 | let next = self.entry_buf.pop().unwrap().data; 133 | // todo: check this Result 134 | let _ = self.consume_one().await; 135 | 136 | Some(next) 137 | } 138 | 139 | /// Valid when underlying iters aren't all consumed or `entry_buf` still 140 | /// buffers some entries. 
141 | fn is_valid(&self) -> bool { 142 | !self.iters.is_empty() || !self.entry_buf.is_empty() 143 | } 144 | 145 | async fn init(&mut self, buf_size: usize) { 146 | // sort underlying iterators 147 | self.purge_finished(); 148 | self.iters.sort_by(|lhs, rhs| { 149 | C::cmp( 150 | Vec::::key(lhs.peek().unwrap()), 151 | Vec::::key(rhs.peek().unwrap()), 152 | ) 153 | }); 154 | 155 | // fill `entry_buf` 156 | while !self.iters.is_empty() && self.entry_buf.len() < buf_size { 157 | // todo: check this Result 158 | let _ = self.consume_one().await; 159 | } 160 | } 161 | 162 | /// Remove and deconstruct finished iterator to release source. 163 | fn purge_finished(&mut self) { 164 | self.iters.retain(|iter| iter.is_valid()) 165 | } 166 | 167 | /// Get one element from underlying iterators and put it into `entry_buf`. 168 | /// Then step the iterator which supplies that element and reordering 169 | /// the iterator list to keep them ordered. 170 | async fn consume_one(&mut self) -> Result<()> { 171 | if self.iters.is_empty() { 172 | return Ok(()); 173 | } 174 | 175 | // consume 176 | let mut first_iter = self.iters.pop().unwrap(); 177 | let item = first_iter.take().await?.unwrap(); 178 | self.entry_buf.push(item.into()); 179 | // this iterator is finished 180 | if !first_iter.is_valid() { 181 | return Ok(()); 182 | } 183 | 184 | // insert popped iterator to ordered position 185 | let new_entry = first_iter.peek().unwrap(); 186 | let lhs = Vec::::key(new_entry); 187 | let index = self 188 | .iters 189 | .binary_search_by(|iter| C::cmp(lhs, Vec::::key(iter.peek().unwrap()))) 190 | .unwrap_or_else(|x| x); 191 | self.iters.insert(index, first_iter); 192 | 193 | Ok(()) 194 | } 195 | } 196 | -------------------------------------------------------------------------------- /src/level.rs: -------------------------------------------------------------------------------- 1 | use std::cell::RefCell; 2 | use std::cmp::Ordering; 3 | use std::collections::HashMap; 4 | use std::rc::Rc; 5 | 
use std::sync::Arc; 6 | use std::time::Duration; 7 | 8 | use glommio::channels::channel_mesh::Senders as ChannelMeshSender; 9 | use glommio::sync::RwLock; 10 | use glommio::timer::TimerActionOnce; 11 | use tokio::sync::mpsc::Sender as BoundedSender; 12 | use tokio::sync::oneshot::Sender; 13 | use tokio::sync::Mutex; 14 | use tracing::{debug, instrument, trace}; 15 | 16 | use crate::cache::{Cache, KeyCacheEntry, KeyCacheResult}; 17 | use crate::compact_sched::CompactScheduler; 18 | use crate::context::Context; 19 | use crate::error::{HelixError, Result}; 20 | use crate::file::{FileNo, IndexBlockBuilder, Rick, SSTable, TableBuilder}; 21 | use crate::index::MemIndex; 22 | use crate::io_worker; 23 | use crate::option::{Options, ReadOption}; 24 | use crate::types::{Bytes, Entry, LevelId, LevelInfo, ThreadId, TimeRange, Timestamp, ValueFormat}; 25 | 26 | pub struct LevelConfig { 27 | /// Use one file to store non-Rick (SSTable) entries or not. 28 | pub sharding_sstable: bool, 29 | /// Max levels can hold. This option should be greater than 0. 30 | /// Levels will be L0 to L`max_level` (inclusive). 31 | /// Might be useless due to TimeStamp Reviewer? 32 | pub max_level: usize, 33 | /// The max difference of timestamps inside one level. 34 | /// Might be useless due to TimeStamp Reviewer? 35 | pub level_duration: u64, 36 | } 37 | 38 | /// APIs require unique reference (&mut self) because this `Level` is designed 39 | /// to be used inside one thread (!Send). The fields should also be !Send if 40 | /// possible. 
41 | pub(crate) struct Levels { 42 | tid: ThreadId, 43 | // todo: remove this mutex 44 | timestamp_reviewer: Arc>>, 45 | ctx: Arc, 46 | memindex: Mutex, 47 | // todo: use group of ricks to achieve log-rotate/GC 48 | rick: Mutex, 49 | level_info: Arc>, 50 | cache: Cache, 51 | write_batch: Rc, 52 | ts_action_sender: ChannelMeshSender, 53 | compact_sched: Rc, 54 | } 55 | 56 | impl Levels { 57 | pub async fn try_new( 58 | tid: ThreadId, 59 | opts: Options, 60 | timestamp_reviewer: Arc>>, 61 | ctx: Arc, 62 | ts_action_sender: ChannelMeshSender, 63 | level_info: Arc>, 64 | compact_sched: Rc, 65 | ) -> Result> { 66 | // todo: remove the default rick. the number in `FileNo::Rick` shouldn't be 0. 67 | let rick_file = ctx.file_manager.open(tid, FileNo::Rick(0)).await.unwrap(); 68 | let rick = Rick::open(rick_file, Some(ValueFormat::RawValue)).await?; 69 | let memindex = rick.construct_index().await?; 70 | 71 | let cache = Cache::with_config(opts.cache); 72 | let write_batch = WriteBatch::with_config(opts.write_batch); 73 | 74 | let levels = Self { 75 | tid, 76 | timestamp_reviewer, 77 | ctx, 78 | memindex: Mutex::new(memindex), 79 | rick: Mutex::new(rick), 80 | level_info, 81 | cache, 82 | write_batch: Rc::new(write_batch), 83 | ts_action_sender, 84 | compact_sched, 85 | }; 86 | 87 | let levels = Rc::new(levels); 88 | 89 | Ok(levels) 90 | } 91 | 92 | pub async fn put(self: Rc, entries: Vec, notifier: Sender>) { 93 | self.write_batch 94 | .clone() 95 | .enqueue(entries, notifier, self.clone()) 96 | .await; 97 | } 98 | 99 | /// Put entries without batching them. 
100 | pub async fn put_internal(&self, entries: Vec) -> Result<()> { 101 | if entries.is_empty() { 102 | return Ok(()); 103 | } 104 | 105 | let max_timestamp = entries 106 | .iter() 107 | .max_by_key(|entry| entry.timestamp) 108 | .unwrap() 109 | .timestamp; 110 | 111 | let indices = self.rick.lock().await.append(entries).await?; 112 | self.memindex.lock().await.insert_entries(indices)?; 113 | 114 | // review timestamp and handle actions. 115 | let review_actions = self.timestamp_reviewer.lock().await.observe(max_timestamp); 116 | self.handle_actions(review_actions.clone()).await?; 117 | 118 | glommio::yield_if_needed().await; 119 | 120 | Ok(()) 121 | } 122 | 123 | pub async fn get( 124 | &self, 125 | time_key: &(Timestamp, Bytes), 126 | opt: ReadOption, 127 | ) -> Result> { 128 | let level = self.level_info.lock().await.get_level_id(time_key.0); 129 | match level { 130 | None => Ok(None), 131 | Some(0) => self.get_from_rick(time_key).await, 132 | Some(l) => self.get_from_table(time_key, l, opt).await, 133 | } 134 | } 135 | 136 | // todo: handle multi level scan 137 | pub async fn scan( 138 | &self, 139 | time_range: TimeRange, 140 | key_start: Bytes, 141 | key_end: Bytes, 142 | sender: BoundedSender>, 143 | cmp: Arc Ordering>, 144 | ) -> Result<()> { 145 | let mut user_keys = self.memindex.lock().await.user_keys(); 146 | // filter 147 | user_keys.retain(|key| { 148 | cmp(key, &key_start) != Ordering::Less && cmp(key, &key_end) != Ordering::Greater 149 | }); 150 | // sort 151 | user_keys.sort_by(|lhs, rhs| cmp(lhs, rhs)); 152 | 153 | // todo: refine this 154 | for user_key in user_keys { 155 | let mut time_key = (0, user_key); 156 | for ts in time_range.range() { 157 | time_key.0 = ts; 158 | if let Some(entry) = self.get(&time_key, ReadOption::default()).await? 
{ 159 | sender.send(vec![entry]).await?; 160 | } 161 | } 162 | } 163 | 164 | Ok(()) 165 | } 166 | 167 | #[inline] 168 | async fn get_from_rick(&self, time_key: &(Timestamp, Bytes)) -> Result> { 169 | if let Some(offset) = self.memindex.lock().await.get(time_key)? { 170 | let entry = self.rick.lock().await.read(offset).await?; 171 | 172 | return Ok(Some(entry)); 173 | } 174 | 175 | Ok(None) 176 | } 177 | 178 | // todo: refine, split 179 | #[inline] 180 | async fn get_from_table( 181 | &self, 182 | time_key: &(Timestamp, Bytes), 183 | level_id: LevelId, 184 | opt: ReadOption, 185 | ) -> Result> { 186 | let mut key_cache_entry = KeyCacheEntry::new(time_key); 187 | 188 | let cache_result = self.cache.get_key(time_key); 189 | trace!("cache result of {:?} : {:?}", time_key, cache_result); 190 | match cache_result { 191 | KeyCacheResult::Value(value) => Ok(Some(Entry { 192 | timestamp: time_key.0, 193 | key: time_key.1.to_owned(), 194 | value, 195 | })), 196 | KeyCacheResult::Compressed(compressed) => { 197 | let value = 198 | ok_unwrap!(self.decompress_and_find(time_key, &compressed, opt.decompress)?); 199 | 200 | key_cache_entry.value = Some(&value); 201 | key_cache_entry.compressed = Some(&compressed); 202 | self.cache.put_key(key_cache_entry); 203 | 204 | Ok(Some(Entry { 205 | timestamp: time_key.0, 206 | key: time_key.1.clone(), 207 | value: value.clone(), 208 | })) 209 | } 210 | KeyCacheResult::Position(tid, level_id, offset) => { 211 | let rick_file = self 212 | .ctx 213 | .file_manager 214 | .open(tid, FileNo::Rick(level_id)) 215 | .await?; 216 | let rick = Rick::open(rick_file, None).await?; 217 | let raw_bytes = rick.read(offset as u64).await?; 218 | 219 | let value = ok_unwrap!(self.decompress_and_find( 220 | time_key, 221 | &raw_bytes.value, 222 | opt.decompress 223 | )?); 224 | 225 | key_cache_entry.value = Some(&value); 226 | key_cache_entry.compressed = Some(&raw_bytes.value); 227 | self.cache.put_key(key_cache_entry); 228 | 229 | Ok(Some(Entry { 230 | 
timestamp: time_key.0, 231 | key: time_key.1.clone(), 232 | value: value.clone(), 233 | })) 234 | } 235 | KeyCacheResult::NotFound => { 236 | let handle = if let Some(handle) = 237 | self.cache.get_table_handle(&(self.tid, level_id).into()) 238 | { 239 | handle 240 | } else { 241 | let table_file = self 242 | .ctx 243 | .file_manager 244 | .open(self.tid, FileNo::SSTable(level_id)) 245 | .await?; 246 | // table file is empty, means this level haven't finished it compaction. Need to 247 | // read value from L0 rick. 248 | // But this check (via file's size) is not good. the write operation may not 249 | // guarantee to be atomic. todo: add a flag to indicate 250 | // whether a compact is finished. 251 | if table_file.size().await? == 0 { 252 | return self.get_from_rick(time_key).await; 253 | } 254 | let handle = SSTable::open(table_file) 255 | .await? 256 | .into_read_handle(self.ctx.clone()) 257 | .await?; 258 | 259 | let handle = Rc::new(handle); 260 | self.cache 261 | .put_table_handle((self.tid, level_id).into(), handle.clone()) 262 | .await?; 263 | handle 264 | }; 265 | 266 | let entry = handle.get(time_key).await?; 267 | let is_compressed = handle.is_compressed(); 268 | // update cache 269 | if let Some(mut entry) = entry { 270 | if is_compressed { 271 | let value = ok_unwrap!(self.decompress_and_find( 272 | time_key, 273 | &entry.value, 274 | opt.decompress, 275 | )?); 276 | key_cache_entry.compressed = Some(&entry.value); 277 | self.cache.put_key(key_cache_entry); 278 | entry.timestamp = time_key.0; 279 | entry.value = value; 280 | } else { 281 | key_cache_entry.value = Some(&entry.value); 282 | self.cache.put_key(key_cache_entry); 283 | } 284 | Ok(Some(entry)) 285 | } else { 286 | Ok(None) 287 | } 288 | } 289 | } 290 | } 291 | 292 | /// Propagate action to other peers. 
293 | async fn propagate_action(&self, action: TimestampAction) -> Result<()> { 294 | for consumer_id in 0..self.ts_action_sender.nr_consumers() { 295 | if consumer_id != self.ts_action_sender.peer_id() { 296 | self.ts_action_sender 297 | .send_to(consumer_id, action) 298 | .await 299 | // todo: check this unwrap 300 | .unwrap(); 301 | } 302 | } 303 | 304 | Ok(()) 305 | } 306 | 307 | pub(crate) async fn handle_actions(&self, actions: Vec) -> Result<()> { 308 | for action in actions { 309 | debug!("tid: {}, action: {:?}", self.tid, action); 310 | match action { 311 | TimestampAction::Compact(start_ts, end_ts, level_id) => { 312 | let level_id = match level_id { 313 | Some(id) => id, 314 | None => { 315 | // fetch new level id and update level info 316 | let level_id = self 317 | .level_info 318 | .lock() 319 | .await 320 | .add_level(start_ts, end_ts, &self.ctx.file_manager) 321 | .await?; 322 | // propagate 323 | self.propagate_action(TimestampAction::Compact( 324 | start_ts, 325 | end_ts, 326 | Some(level_id), 327 | )) 328 | .await?; 329 | 330 | level_id 331 | } 332 | }; 333 | self.compact(TimeRange::from((start_ts, end_ts)), level_id) 334 | .await?; 335 | // todo: enable this 336 | // self.compact_sched.enqueue(level_id); 337 | } 338 | TimestampAction::Outdate(_) => { 339 | self.propagate_action(action).await?; 340 | self.outdate().await? 341 | } 342 | } 343 | } 344 | 345 | Ok(()) 346 | } 347 | 348 | /// Compact entries from rick in [start_ts, end_ts] to next level. 349 | /// 350 | /// This function is wrapped by `Gate` which means HelixCore will wait it to 351 | /// finish before close and shutdown. Whereas compactions that are invoked 352 | /// after the gate is closing or closed will be ignored. 353 | /// 354 | /// todo: how to handle rick file is not fully covered by given time range?. 355 | #[instrument] 356 | pub(crate) async fn compact(&self, range: TimeRange, level_id: LevelId) -> Result<()> { 357 | // Keep the gate open until compact finished. 
The question mark (try) indicates 358 | // a early return once it's failed to spawn to the gate. 359 | let (tx, rx) = glommio::channels::local_channel::new_bounded(1); 360 | io_worker::GATE 361 | .with(|gate| gate.spawn(async move { rx.recv().await }))? 362 | .detach(); 363 | debug!( 364 | "[compact] start compact. range {:?}, level {}", 365 | range, level_id 366 | ); 367 | 368 | let mut table_builder = TableBuilder::begin( 369 | self.tid, 370 | level_id, 371 | self.ctx 372 | .file_manager 373 | .open(self.tid, FileNo::SSTable(level_id)) 374 | .await?, 375 | ); 376 | 377 | // make entry_map (from memindex) and purge 378 | let memindex = self.memindex.lock().await; 379 | let offsets = memindex.load_time_range(range); 380 | drop(memindex); 381 | let mut rick = self.rick.lock().await; 382 | let entries = rick.reads(offsets).await?; 383 | let offset_end = rick.get_legal_offset_end(); 384 | drop(rick); 385 | trace!("[compact] level {}, rick reads", level_id); 386 | 387 | let mut entry_map = HashMap::new(); 388 | for entry in entries { 389 | let Entry { 390 | timestamp, 391 | key, 392 | value, 393 | } = entry; 394 | 395 | let pair_list: &mut Vec<_> = entry_map.entry(key).or_default(); 396 | pair_list.push((timestamp, value)); 397 | } 398 | 399 | trace!("[compact] level {}, make entry map", level_id); 400 | 401 | // prepare output files. 402 | let mut index_bb = IndexBlockBuilder::new(); 403 | let rick_file = self 404 | .ctx 405 | .file_manager 406 | .open(self.tid, FileNo::Rick(level_id)) 407 | .await?; 408 | let mut rick = Rick::open(rick_file, Some(ValueFormat::CompressedValue)).await?; 409 | rick.set_align_ts(range.start()).await?; 410 | 411 | // call compress_fn to compact points, build rick file and index block. 
412 | for (key, ts_value) in entry_map { 413 | debug_assert!(!ts_value.is_empty()); 414 | let first_ts = ts_value[0].0; 415 | 416 | let compressed_data = self 417 | .ctx 418 | .fn_registry 419 | .compress_entries(key.clone(), ts_value)?; 420 | 421 | // todo: add rick builder 422 | let mut position = rick 423 | .append(vec![Entry { 424 | timestamp: first_ts, 425 | key, 426 | value: compressed_data, 427 | }]) 428 | .await?; 429 | let (timestamp, key, offset) = position.pop().unwrap(); 430 | index_bb.add_entry(&key, timestamp, offset); 431 | } 432 | 433 | trace!("[compact] level {}, build rick", level_id); 434 | 435 | // make sstable 436 | // table_builder.add_entries(keys, value_positions); 437 | table_builder.add_block(index_bb); 438 | table_builder.finish().await?; 439 | 440 | trace!("[compact] level {}, build table", level_id); 441 | 442 | // todo: gc rick 443 | // self.rick.lock().await.clean().await?; 444 | // todo: gc memindex 445 | // self.memindex.lock().await.purge_time_range(range); 446 | let mut memindex = self.memindex.lock().await; 447 | memindex.purge_time_range(range); 448 | drop(memindex); 449 | trace!("[compact] level {}, purge memindex", level_id); 450 | let mut rick = self.rick.lock().await; 451 | rick.push_legal_offset_start(offset_end).await?; 452 | drop(rick); 453 | trace!("[compact] level {}, clean rick", level_id); 454 | 455 | debug!("compact {} finish", level_id); 456 | let _ = tx.send(()).await; 457 | 458 | Ok(()) 459 | } 460 | 461 | /// Perform compaction on the given level id. 462 | /// 463 | /// This procedure assume the level going to compact is inactive, which has 464 | /// the level id assigned, has corresponding rick file, and may serving read 465 | /// requests. 466 | /// 467 | /// It's not this procedure's response to switch active level. And it also 468 | /// has nothing to do with memindex. 
469 | #[instrument] 470 | pub(crate) async fn compact_level(&self, level_id: LevelId) -> Result<()> { 471 | self.compact_sched.finished(level_id); 472 | 473 | Ok(()) 474 | } 475 | 476 | async fn outdate(&self) -> Result<()> { 477 | self.level_info 478 | .lock() 479 | .await 480 | .remove_last_level(&self.ctx.file_manager) 481 | .await?; 482 | 483 | todo!() 484 | } 485 | 486 | fn decompress_and_find( 487 | &self, 488 | time_key: &(Timestamp, Bytes), 489 | raw_bytes: &[u8], 490 | decompress: bool, 491 | ) -> Result> { 492 | if !decompress { 493 | return Ok(Some(raw_bytes.to_owned())); 494 | } 495 | 496 | let mut entries = self 497 | .ctx 498 | .fn_registry 499 | .decompress_entries(&time_key.1, raw_bytes)?; 500 | 501 | // todo: move this logic to UDCF 502 | entries.sort_by_key(|e| e.0); 503 | let index = ok_unwrap!(entries 504 | .binary_search_by_key(&time_key.0, |(ts, _)| *ts) 505 | .ok()); 506 | let (_, value) = &entries[index]; 507 | 508 | Ok(Some(value.clone())) 509 | } 510 | } 511 | 512 | impl std::fmt::Debug for Levels { 513 | fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { 514 | f.debug_struct("Levels") 515 | .field("thread id", &self.tid) 516 | .finish() 517 | } 518 | } 519 | 520 | /// "Timestamp" in `HelixDB` is a logical concept. It is not bound with the real 521 | /// time. [TimestampReviewer] defines how timestamp should be considered. 522 | /// Including when to do a compaction, when to outdate a part of data etc. 523 | pub trait TimestampReviewer: Send + Sync { 524 | fn observe(&mut self, timestamp: Timestamp) -> Vec; 525 | } 526 | 527 | /// Actions given by [TimestampReviewer]. 528 | #[derive(Debug, PartialEq, Eq, Clone, Copy)] 529 | pub enum TimestampAction { 530 | /// Compact data between two timestamps (both inclusive). 531 | /// The third parameter is the id of new level. This field is filled by the 532 | /// peer who observed this original "compact action" (sent by 533 | /// `TimestampReviewer`). 
534 | Compact(Timestamp, Timestamp, Option), 535 | /// Outdate data which timestamp is smaller than given. 536 | Outdate(Timestamp), 537 | } 538 | 539 | /// A simple timestamp review implementation. It has two config entries 540 | /// `rick_range` and `outdate_range`. `rick_range` defines the range of 541 | /// rick and sstable files. `outdate_range` defines how much data should 542 | /// be kept. `outdate_range` should be integer times of `rick_range` even 543 | /// if it is unchecked. 544 | /// 545 | /// This implementation is not bound with real world time. It assume the 546 | /// timestamp comes from `observe()` call is the newest. And just triggers 547 | /// compaction and outdate only based on this. In real scenario 548 | /// when timestamp has more meaning or restriction, more complex logic can 549 | /// be achieved. 550 | pub struct SimpleTimestampReviewer { 551 | // config part 552 | rick_range: Timestamp, 553 | outdate_range: Timestamp, 554 | 555 | // status part 556 | last_compacted: Timestamp, 557 | last_outdated: Timestamp, 558 | } 559 | 560 | impl SimpleTimestampReviewer { 561 | pub fn new(rick_range: Timestamp, outdate_range: Timestamp) -> Self { 562 | Self { 563 | rick_range, 564 | outdate_range, 565 | last_compacted: 0, 566 | last_outdated: 0, 567 | } 568 | } 569 | } 570 | 571 | impl TimestampReviewer for SimpleTimestampReviewer { 572 | fn observe(&mut self, timestamp: Timestamp) -> Vec { 573 | let mut actions = vec![]; 574 | if timestamp - self.last_compacted + 1 >= self.rick_range { 575 | actions.push(TimestampAction::Compact( 576 | self.last_compacted, 577 | timestamp, 578 | None, 579 | )); 580 | self.last_compacted = timestamp + 1; 581 | } 582 | if timestamp - self.last_outdated + 1 >= self.outdate_range { 583 | actions.push(TimestampAction::Outdate( 584 | self.last_outdated + self.rick_range - 1, 585 | )); 586 | self.last_outdated += self.rick_range; 587 | } 588 | 589 | actions 590 | } 591 | } 592 | 593 | #[derive(Debug, Clone, Copy)] 594 | 
pub struct WriteBatchConfig {
    /// The maximum number of entries that can be held in one batch.
    ///
    /// A batch is consumed as soon as the buffered entry count reaches this
    /// limit; the default of `0` therefore makes every enqueue flush
    /// immediately (batching effectively disabled).
    pub batch_size: usize,
    /// The longest time duration between two batch consumptions.
    pub timeout: Duration,
}

impl Default for WriteBatchConfig {
    /// Batching is disabled by default: a zero `batch_size` makes every
    /// `enqueue` consume the buffer right away.
    fn default() -> Self {
        Self {
            batch_size: 0,
            timeout: Duration::from_millis(0),
        }
    }
}

/// Batching write request
struct WriteBatch {
    /// Reply channels of the requests currently buffered.
    notifier: RefCell<Vec<Sender<Result<()>>>>,
    /// Entries waiting to be written as one batch.
    buf: RefCell<Vec<Entry>>,
    timeout: Duration,
    batch_size: usize,
    /// Lock on two vectors `notifier` and `buf`.
    lock: Mutex<()>,
    /// Generated by `TimerActionOnce::do_in()` with the purpose of
    /// consuming batched entries after some duration.
    action: RwLock<Option<TimerActionOnce<()>>>,
    // level: Rc<Levels>,
}

// Implementing the `Default` trait (instead of an inherent `fn default()`)
// is the idiomatic form; `WriteBatch::default()` still resolves via the
// prelude for existing callers.
impl Default for WriteBatch {
    fn default() -> Self {
        Self::with_config(WriteBatchConfig::default())
    }
}

impl WriteBatch {
    /// Build a batch with the given configuration and empty buffers.
    pub fn with_config(config: WriteBatchConfig) -> Self {
        Self {
            notifier: RefCell::new(vec![]),
            buf: RefCell::new(vec![]),
            timeout: config.timeout,
            batch_size: config.batch_size,
            lock: Mutex::new(()),
            action: RwLock::new(None),
        }
    }

    /// Enqueue some write requests. Then check the size limit.
    /// This will reset the timeout timer.
    #[allow(clippy::branches_sharing_code)]
    pub async fn enqueue(
        self: Rc<Self>,
        mut reqs: Vec<Entry>,
        tx: Sender<Result<()>>,
        level: Rc<Levels>,
    ) {
        // enqueue under the lock so notifier and buf stay in sync
        let guard = self.lock.lock().await;
        self.notifier.borrow_mut().push(tx);
        self.buf.borrow_mut().append(&mut reqs);

        // check size limit; the guard is released before either await point
        if self.buf.borrow().len() >= self.batch_size {
            drop(guard);
            self.consume(level).await;
        } else {
            drop(guard);
            self.set_or_rearm(level).await;
        }
    }

    /// Consume all batched entries.
665 | pub async fn consume(self: Rc, level: Rc>) { 666 | // let mut action_guard = self.action.write().await.unwrap(); 667 | // take contents 668 | let guard = self.lock.lock().await; 669 | let notifier = self.notifier.take(); 670 | let buf = self.buf.take(); 671 | drop(guard); 672 | 673 | // write and reply 674 | let result = io_worker::GATE 675 | .with(|gate| { 676 | gate.spawn(async move { level.put_internal(buf).await }) 677 | .unwrap() 678 | }) 679 | .await; 680 | if result.is_ok() { 681 | for tx in notifier { 682 | let _ = tx.send(Ok(())); 683 | } 684 | } else { 685 | for tx in notifier { 686 | let _ = tx.send(Err(HelixError::Poisoned("Put".to_string()))); 687 | } 688 | } 689 | 690 | // todo: finish cancellation 691 | // destroy action timer as this "consume action" is already triggered 692 | // (regardless of it is triggered by timer or `Levels`'). 693 | // if let Some(action) = action_guard.take() { 694 | // action.cancel().await; 695 | // } 696 | } 697 | 698 | async fn destroy_action(&self) { 699 | let mut action_guard = self.action.write().await.unwrap(); 700 | if let Some(action) = &*action_guard { 701 | action.destroy(); 702 | } 703 | drop(action_guard.take()); 704 | } 705 | 706 | async fn set_or_rearm(self: Rc, level: Rc>) { 707 | let mut action = self.action.write().await.unwrap(); 708 | 709 | // rearm timer 710 | if let Some(action) = &*action { 711 | action.rearm_in(self.timeout); 712 | return; 713 | } 714 | 715 | // otherwise set the action 716 | *action = Some(TimerActionOnce::do_in( 717 | self.timeout, 718 | self.clone().consume(level), 719 | )); 720 | } 721 | } 722 | 723 | #[cfg(test)] 724 | mod test { 725 | use glommio::channels::channel_mesh::MeshBuilder; 726 | use glommio::LocalExecutor; 727 | use tempfile::tempdir; 728 | 729 | use super::*; 730 | use crate::compact_sched::QueueUpCompSched; 731 | use crate::file::FileManager; 732 | use crate::fn_registry::FnRegistry; 733 | 734 | #[tokio::test] 735 | async fn 
simple_timestamp_reviewer_trigger_compact_and_outdate() {
        let mut reviewer = SimpleTimestampReviewer::new(10, 30);

        let mut observed = vec![];
        let expected = vec![
            TimestampAction::Compact(0, 9, None),
            TimestampAction::Compact(10, 19, None),
            TimestampAction::Compact(20, 29, None),
            TimestampAction::Outdate(9),
            TimestampAction::Compact(30, 39, None),
            TimestampAction::Outdate(19),
        ];

        // Feed 40 consecutive logical timestamps and record every action.
        for ts in 0..40 {
            observed.append(&mut reviewer.observe(ts));
        }

        assert_eq!(observed, expected);
    }

    #[test]
    fn put_get_on_rick() {
        let executor = LocalExecutor::default();
        executor.run(async {
            let base_dir = tempdir().unwrap();
            let file_manager = FileManager::with_base_dir(base_dir.path(), 1).unwrap();
            let fn_registry = FnRegistry::new_noop();
            let ctx = Arc::new(Context {
                file_manager,
                fn_registry,
            });
            let timestamp_reviewer: Arc<Mutex<Box<dyn TimestampReviewer>>> =
                Arc::new(Mutex::new(Box::new(SimpleTimestampReviewer::new(10, 30))));
            let sender = MeshBuilder::full(1, 1).join().await.unwrap().0;
            let level_info = Arc::new(Mutex::new(
                ctx.file_manager.open_level_info().await.unwrap(),
            ));
            let (sched, tq) = QueueUpCompSched::default();
            let levels = Levels::try_new(
                0,
                Options::default(),
                timestamp_reviewer,
                ctx,
                sender,
                level_info,
                sched.clone(),
            )
            .await
            .unwrap();
            sched.clone().init(levels.clone());
            sched.install(tq).unwrap();

            let entries = vec![
                (1, b"key1".to_vec(), b"value1".to_vec()).into(),
                (2, b"key1".to_vec(), b"value1".to_vec()).into(),
                (3, b"key1".to_vec(), b"value1".to_vec()).into(),
                (1, b"key2".to_vec(), b"value2".to_vec()).into(),
                (2, b"key2".to_vec(), b"value2".to_vec()).into(),
                (3, b"key3".to_vec(), b"value1".to_vec()).into(),
            ];

            levels.put_internal(entries.clone()).await.unwrap();

            // Everything written must be readable back unchanged.
            for entry in entries {
                assert_eq!(
                    entry,
                    levels
                        .get(entry.time_key(), ReadOption::default().no_decompress())
                        .await
                        .unwrap()
                        .unwrap()
                );
            }

            // overwrite a key
            let new_entry: Entry = (1, b"key1".to_vec(), b"value3".to_vec()).into();
            levels.put_internal(vec![new_entry.clone()]).await.unwrap();
            assert_eq!(
                new_entry,
                levels
                    .get(new_entry.time_key(), ReadOption::default().no_decompress())
                    .await
                    .unwrap()
                    .unwrap()
            );
        });
    }

    #[test]
    fn put_get_with_compaction() {
        let executor = LocalExecutor::default();
        executor.run(async {
            let base_dir = tempdir().unwrap();
            let file_manager = FileManager::with_base_dir(base_dir.path(), 1).unwrap();
            let fn_registry = FnRegistry::new_noop();
            let ctx = Arc::new(Context {
                file_manager,
                fn_registry,
            });
            let timestamp_reviewer: Arc<Mutex<Box<dyn TimestampReviewer>>> =
                Arc::new(Mutex::new(Box::new(SimpleTimestampReviewer::new(10, 30))));
            let sender = MeshBuilder::full(1, 1).join().await.unwrap().0;
            let level_info = Arc::new(Mutex::new(
                ctx.file_manager.open_level_info().await.unwrap(),
            ));
            let (sched, tq) = QueueUpCompSched::default();
            let levels = Levels::try_new(
                0,
                Options::default(),
                timestamp_reviewer,
                ctx.clone(),
                sender,
                level_info,
                sched.clone(),
            )
            .await
            .unwrap();
            sched.clone().init(levels.clone());
            sched.install(tq).unwrap();

            // Write enough timestamps to cross the rick_range and trigger
            // compaction along the way.
            for timestamp in 0..25 {
                levels
                    .put_internal(vec![(timestamp, b"key".to_vec(), b"value".to_vec()).into()])
                    .await
                    .unwrap();
            }

            for timestamp in 0..25 {
                let result = levels
                    .get(&(timestamp, b"key".to_vec()), ReadOption::default())
                    .await
                    .unwrap()
                    .unwrap();

assert_eq!( 870 | result, 871 | (timestamp, b"key".to_vec(), b"value".to_vec()).into() 872 | ); 873 | } 874 | }); 875 | } 876 | } 877 | -------------------------------------------------------------------------------- /src/lib.rs: -------------------------------------------------------------------------------- 1 | //! HelixDB is a KV-Engine designed for time-series data. 2 | 3 | #![feature(vec_into_raw_parts)] 4 | #![feature(trait_alias)] 5 | #![feature(async_closure)] 6 | #![allow(internal_features)] 7 | #![feature(core_intrinsics)] 8 | #![feature(hash_extract_if)] 9 | // todo: open these lints 10 | #![allow(dead_code)] 11 | #![allow(unused_variables)] 12 | 13 | /// Unwrap `Option` under the `Result>` return type requirement. 14 | /// The `None` case will early return with `Ok(None)`. 15 | /// 16 | /// # Example 17 | /// *Notice this macro is not exported via "`#[macro_export]`" so the following 18 | /// example will not be run as a test case.* 19 | /// ```ignore 20 | /// # #![feature(never_type)] 21 | /// # #[macro_use] extern crate helixdb; 22 | /// fn return_ok_none() -> Result, !> { 23 | /// let val: Option = None; 24 | /// ok_unwrap!(val); 25 | /// panic!("should have returned"); 26 | /// } 27 | /// 28 | /// # fn container() -> Result, !> { 29 | /// let val = ok_unwrap!(Some(0usize)); 30 | /// assert_eq!(val, 0usize); 31 | /// assert_eq!(return_ok_none(), Ok(None)); 32 | /// # Ok(None) 33 | /// # } 34 | /// 35 | /// # let _ = container(); 36 | /// ``` 37 | macro_rules! 
ok_unwrap { 38 | ($e:expr) => { 39 | match $e { 40 | Some(thing) => thing, 41 | None => return Ok(None), 42 | } 43 | }; 44 | } 45 | 46 | #[deprecated] 47 | mod blocks; 48 | mod cache; 49 | mod compact_sched; 50 | mod context; 51 | mod db; 52 | mod error; 53 | mod file; 54 | mod fn_registry; 55 | mod index; 56 | mod io; 57 | mod io_worker; 58 | pub mod iterator; 59 | mod level; 60 | pub mod option; 61 | mod table; 62 | mod types; 63 | mod util; 64 | 65 | pub use db::*; 66 | pub use fn_registry::FnRegistry; 67 | pub use level::{SimpleTimestampReviewer, TimestampAction, TimestampReviewer}; 68 | pub use types::{Entry, TimeRange}; 69 | pub use util::{Comparator, LexicalComparator, NoOrderComparator}; 70 | 71 | #[global_allocator] 72 | static ALLOC: jemallocator::Jemalloc = jemallocator::Jemalloc; 73 | -------------------------------------------------------------------------------- /src/option.rs: -------------------------------------------------------------------------------- 1 | use std::time::Duration; 2 | 3 | use crate::cache::CacheConfig; 4 | use crate::fn_registry::FnRegistry; 5 | use crate::level::{SimpleTimestampReviewer, TimestampReviewer, WriteBatchConfig}; 6 | 7 | /// Options for opening HelixDB 8 | pub struct Options { 9 | // parameters 10 | /// Number of shards. It is recommended to equal to the number of system 11 | /// processors. 12 | pub(crate) num_shard: usize, 13 | /// Queue length of each shard's task receiver. 14 | pub(crate) task_buffer_size: usize, 15 | /// Configurations of cache. 
16 | pub(crate) cache: CacheConfig, 17 | pub(crate) write_batch: WriteBatchConfig, 18 | pub(crate) compact_prompt_interval: Duration, 19 | 20 | // helixdb context 21 | pub(crate) fn_registry: Option, 22 | pub(crate) tsr: Option>, 23 | } 24 | 25 | // todo: remove this 26 | impl Clone for Options { 27 | /// a 28 | fn clone(&self) -> Self { 29 | Self { 30 | num_shard: self.num_shard, 31 | task_buffer_size: self.task_buffer_size, 32 | cache: self.cache, 33 | write_batch: self.write_batch, 34 | compact_prompt_interval: self.compact_prompt_interval, 35 | 36 | fn_registry: None, 37 | tsr: None, 38 | } 39 | } 40 | } 41 | 42 | impl Options { 43 | pub fn default() -> Self { 44 | Self { 45 | num_shard: num_cpus::get(), 46 | task_buffer_size: 128, 47 | cache: CacheConfig::default(), 48 | write_batch: WriteBatchConfig::default(), 49 | compact_prompt_interval: Duration::from_secs(1), 50 | 51 | fn_registry: Some(FnRegistry::new_noop()), 52 | tsr: Some(Box::new(SimpleTimestampReviewer::new(1024, 1024 * 8))), 53 | } 54 | } 55 | 56 | /// Returns a copy of the value. 57 | /// This function may not works as expected. It is a "partial" clone. 58 | /// 59 | /// Some fields in this [`Options`] struct isn't suit for clone, like 60 | /// `fn_registry` or `tsr`. They are wrapped by a `Option`, and will 61 | /// only leave a `None` after called `clone_partial()`. 62 | /// 63 | /// This is to making [`Options`] more general and unified. Other fields 64 | /// works as what common [`std::marker::Clone`] does. 65 | /// # Example 66 | /// *Just a example and it isn't runnable since `fn_registry` is a private 67 | /// field.* ```compile_fail 68 | /// # use helixdb::option::Options; 69 | /// let options = Options::default(); 70 | /// assert!(options.fn_registry.is_some()); 71 | /// // after calling `clone_partial()` some will be `None` because they 72 | /// won't be cloned actually. let options_cloned = 73 | /// options.clone_partial(); assert!(options_cloned.fn_registry. 
74 | /// is_none()); ``` 75 | pub fn clone_partial(&self) -> Self { 76 | Self { 77 | num_shard: self.num_shard, 78 | task_buffer_size: self.task_buffer_size, 79 | cache: self.cache, 80 | write_batch: self.write_batch, 81 | compact_prompt_interval: self.compact_prompt_interval, 82 | 83 | fn_registry: None, 84 | tsr: None, 85 | } 86 | } 87 | 88 | pub fn shards(mut self, num_shard: usize) -> Self { 89 | self.num_shard = num_shard; 90 | self 91 | } 92 | 93 | pub fn set_fn_registry(mut self, fn_registry: FnRegistry) -> Self { 94 | self.fn_registry = Some(fn_registry); 95 | self 96 | } 97 | 98 | pub fn cache(mut self, f: F) -> Self 99 | where 100 | F: FnOnce(CacheConfig) -> CacheConfig, 101 | { 102 | self.cache = f(self.cache); 103 | self 104 | } 105 | 106 | pub fn write_batch(mut self, f: F) -> Self 107 | where 108 | F: FnOnce(WriteBatchConfig) -> WriteBatchConfig, 109 | { 110 | self.write_batch = f(self.write_batch); 111 | self 112 | } 113 | 114 | pub fn set_timestamp_reviewer(mut self, tsr: Box) -> Self { 115 | self.tsr = Some(tsr); 116 | self 117 | } 118 | 119 | pub fn set_task_buffer_size(mut self, buffer_size: usize) -> Self { 120 | self.task_buffer_size = buffer_size; 121 | self 122 | } 123 | 124 | pub fn set_compact_prompt_interval(mut self, interval: Duration) -> Self { 125 | self.compact_prompt_interval = interval; 126 | self 127 | } 128 | } 129 | 130 | #[derive(Clone, Copy)] 131 | pub struct ReadOption { 132 | /// Read request will decompress a compressed value then try to find 133 | /// requested timestamp if true. Default value: true. 
134 | pub(crate) decompress: bool, 135 | } 136 | 137 | impl Default for ReadOption { 138 | fn default() -> Self { 139 | Self { decompress: true } 140 | } 141 | } 142 | 143 | impl ReadOption { 144 | pub fn no_decompress(mut self) -> Self { 145 | self.decompress = false; 146 | self 147 | } 148 | } 149 | 150 | #[derive(Copy, Clone, Debug)] 151 | pub struct ScanOption { 152 | pub prefetch_buf_size: usize, 153 | } 154 | 155 | #[cfg(test)] 156 | mod test { 157 | use super::*; 158 | 159 | #[test] 160 | fn partial_clone() { 161 | let options = Options::default(); 162 | assert!(options.fn_registry.is_some()); 163 | assert!(options.tsr.is_some()); 164 | 165 | // after calling `clone()` some will be `None` because they won't be cloned 166 | // actually. 167 | let options_cloned = options.clone_partial(); 168 | assert!(options_cloned.fn_registry.is_none()); 169 | assert!(options_cloned.tsr.is_none()); 170 | } 171 | } 172 | -------------------------------------------------------------------------------- /src/table.rs: -------------------------------------------------------------------------------- 1 | use std::sync::Arc; 2 | 3 | use crate::context::Context; 4 | use crate::error::Result; 5 | use crate::file::{Rick, SSTable}; 6 | use crate::index::MemIndex; 7 | #[cfg(test)] 8 | use crate::types::Offset; 9 | use crate::types::{Bytes, Entry, LevelId, ThreadId, Timestamp}; 10 | 11 | #[derive(Hash, PartialEq, Eq)] 12 | pub struct TableIdentifier { 13 | pub tid: ThreadId, 14 | pub lid: LevelId, 15 | } 16 | 17 | impl From<(ThreadId, LevelId)> for TableIdentifier { 18 | fn from(ids: (ThreadId, LevelId)) -> Self { 19 | Self { 20 | tid: ids.0, 21 | lid: ids.1, 22 | } 23 | } 24 | } 25 | 26 | /// Provides read methods to SSTable. 27 | /// 28 | /// If wants to modify a sstable should upgrade to a writable handle 29 | /// (unimplemented). 
30 | pub struct TableReadHandle { 31 | memindex: MemIndex, 32 | sstable: SSTable, 33 | rick: Rick, 34 | ctx: Arc, 35 | } 36 | 37 | impl TableReadHandle { 38 | pub fn new(memindex: MemIndex, sstable: SSTable, rick: Rick, ctx: Arc) -> Self { 39 | Self { 40 | memindex, 41 | sstable, 42 | rick, 43 | ctx, 44 | } 45 | } 46 | 47 | pub async fn get(&self, time_key: &(Timestamp, Bytes)) -> Result> { 48 | let offset = if self.is_compressed() { 49 | let mut align_time_key = time_key.clone(); 50 | align_time_key.0 = self.rick.get_align_ts(); 51 | self.memindex.get(&align_time_key)? 52 | } else { 53 | self.memindex.get(time_key)? 54 | }; 55 | if let Some(offset) = offset { 56 | Ok(Some(self.rick.read(offset).await?)) 57 | } else { 58 | Ok(None) 59 | } 60 | } 61 | 62 | /// Upgrade to writeable handle. 63 | pub fn upgrade() -> ! { 64 | todo!() 65 | } 66 | 67 | pub fn is_compressed(&self) -> bool { 68 | self.rick.is_compressed() 69 | } 70 | 71 | // For test case. 72 | /// Get value's offset in rick file. 73 | #[cfg(test)] 74 | pub fn get_offset(&self, time_key: &(Timestamp, Bytes)) -> Result> { 75 | self.memindex.get(time_key) 76 | } 77 | 78 | fn decompress_entry(&self, key: &[u8], data: &[u8]) -> Result> { 79 | self.ctx.fn_registry.decompress_entries(key, data) 80 | } 81 | } 82 | -------------------------------------------------------------------------------- /src/types/entry.rs: -------------------------------------------------------------------------------- 1 | use std::convert::TryInto; 2 | use std::mem; 3 | use std::ops::Range; 4 | 5 | use flatbuffers::FlatBufferBuilder; 6 | 7 | pub type Bytes = Vec; 8 | pub type Timestamp = i64; 9 | pub type ThreadId = u64; 10 | /// Mono-increase identifier to level files. Starts from 1. 11 | /// Level id `0` stands for Rick level. 12 | pub type LevelId = u64; 13 | 14 | /// Wrapper struct over protos::Entry. 15 | /// 16 | /// C representation is needed to converting `(&ts, &key)` to `&(ts, key)`. 
17 | #[derive(Debug, PartialEq, Eq, Clone)] 18 | #[repr(C)] 19 | pub struct Entry { 20 | pub timestamp: Timestamp, 21 | pub key: Bytes, 22 | pub value: Bytes, 23 | } 24 | 25 | impl Entry { 26 | pub fn encode(&self) -> Bytes { 27 | let mut fbb = FlatBufferBuilder::new(); 28 | 29 | let timestamp = protos::Timestamp::new(self.timestamp); 30 | let key_bytes = fbb.create_vector(&self.key); 31 | let value_bytes = fbb.create_vector(&self.value); 32 | 33 | let entry = protos::Entry::create( 34 | &mut fbb, 35 | &protos::EntryArgs { 36 | timestamp: Some(×tamp), 37 | key: Some(key_bytes), 38 | value: Some(value_bytes), 39 | }, 40 | ); 41 | 42 | fbb.finish(entry, None); 43 | fbb.finished_data().to_vec() 44 | } 45 | 46 | pub fn decode(bytes: &[u8]) -> Self { 47 | // let fb_entry = flatbuffers::get_root::>(bytes); 48 | let fb_entry = flatbuffers::root::>(bytes).unwrap(); 49 | 50 | Self { 51 | timestamp: fb_entry.timestamp().unwrap().timestamp(), 52 | key: fb_entry.key().unwrap().bytes().to_vec(), 53 | value: fb_entry.value().unwrap().bytes().to_vec(), 54 | } 55 | } 56 | 57 | /// # Unsafe 58 | /// - Purpose: make a `&(A, B)` over a `&Self{A, B, C}` 59 | /// - Safety: `Entry` is qualified with `repr(C)`, memory layout is ensured 60 | /// to be the same with `((A, B), C)`. 61 | pub fn time_key(&self) -> &(Timestamp, Bytes) { 62 | unsafe { 63 | let p_entry = self as *const Entry; 64 | let p_ts_key = p_entry as *const (Timestamp, Bytes); 65 | &*p_ts_key 66 | } 67 | } 68 | } 69 | 70 | impl From<(Timestamp, Bytes, Bytes)> for Entry { 71 | fn from(input: (Timestamp, Bytes, Bytes)) -> Entry { 72 | let (timestamp, key, value) = input; 73 | Entry { 74 | timestamp, 75 | key, 76 | value, 77 | } 78 | } 79 | } 80 | 81 | /// Describe a encoded [Entry]'s buffer. 
82 | pub struct EntryMeta { 83 | pub length: u64, 84 | } 85 | 86 | impl EntryMeta { 87 | pub fn new(length: u64) -> Self { 88 | Self { length } 89 | } 90 | 91 | pub const fn meta_size() -> usize { 92 | mem::size_of::() 93 | } 94 | 95 | pub fn encode(&self) -> [u8; 8] { 96 | self.length.to_le_bytes() 97 | } 98 | 99 | pub fn decode(bytes: &[u8]) -> Self { 100 | Self { 101 | length: u64::from_le_bytes(bytes.try_into().unwrap()), 102 | } 103 | } 104 | } 105 | 106 | // todo: replace with std::ops::Range. 107 | #[derive(Debug, Copy, Clone)] 108 | pub struct TimeRange { 109 | start: Timestamp, 110 | end: Timestamp, 111 | } 112 | 113 | impl TimeRange { 114 | /// Is `self` containing given timestamp. 115 | pub fn contains(&self, ts: Timestamp) -> bool { 116 | self.start <= ts && self.end >= ts 117 | } 118 | 119 | pub fn range(&self) -> Range { 120 | Range { 121 | start: self.start, 122 | end: self.end, 123 | } 124 | } 125 | 126 | pub fn start(&self) -> Timestamp { 127 | self.start 128 | } 129 | 130 | pub fn end(&self) -> Timestamp { 131 | self.end 132 | } 133 | } 134 | 135 | impl From<(Timestamp, Timestamp)> for TimeRange { 136 | fn from(tuple: (Timestamp, Timestamp)) -> TimeRange { 137 | Self { 138 | start: tuple.0, 139 | end: tuple.1, 140 | } 141 | } 142 | } 143 | 144 | #[cfg(test)] 145 | mod test { 146 | 147 | use super::*; 148 | 149 | #[test] 150 | fn entry_codec() { 151 | let entry = Entry { 152 | timestamp: 1000, 153 | key: b"key".to_vec(), 154 | value: b"value".to_vec(), 155 | }; 156 | 157 | let bytes = entry.encode(); 158 | 159 | assert_eq!(entry, Entry::decode(&bytes)); 160 | } 161 | 162 | #[test] 163 | fn time_range_contains() { 164 | let range = TimeRange::from((0, 10)); 165 | 166 | assert!(!range.contains(-1)); 167 | assert!(range.contains(0)); 168 | assert!(range.contains(5)); 169 | assert!(range.contains(10)); 170 | assert!(!range.contains(101)); 171 | } 172 | } 173 | -------------------------------------------------------------------------------- 
/src/types/level_info.rs:
--------------------------------------------------------------------------------

use std::collections::VecDeque;

use flatbuffers::FlatBufferBuilder;

use super::{Bytes, LevelId, Timestamp};
use crate::error::Result;
use crate::file::FileManager;

/// Descriptor of one level: the (inclusive) time range it covers and its id.
#[derive(Default, PartialEq, Eq, Debug, Clone, Copy)]
pub struct LevelDesc {
    start: Timestamp,
    end: Timestamp,
    id: LevelId,
}

impl From<protos::LevelDesc> for LevelDesc {
    fn from(fb_desc: protos::LevelDesc) -> LevelDesc {
        let time_range = fb_desc.time_range();
        Self {
            start: time_range.start().timestamp(),
            end: time_range.end().timestamp(),
            id: fb_desc.id().id(),
        }
    }
}

impl LevelDesc {
    /// Convert into the flatbuffers-generated representation.
    pub fn as_generated_type(&self) -> protos::LevelDesc {
        let start = protos::Timestamp::new(self.start);
        let end = protos::Timestamp::new(self.end);
        let time_range = protos::TimeRange::new(&start, &end);

        let id = protos::LevelId::new(self.id);

        protos::LevelDesc::new(&time_range, &id)
    }

    /// Whether `timestamp` falls inside this level's inclusive range.
    #[inline]
    pub fn is_timestamp_match(&self, timestamp: Timestamp) -> bool {
        (self.start..=self.end).contains(&timestamp)
    }
}

/// Metadata of every levels. Is a array-like container of [LevelDesc].
///
/// [LevelDesc] is arranged from old (smaller timestamp) to new
/// (larger timestamp).
#[derive(Debug, PartialEq, Eq)]
pub struct LevelInfo {
    // todo: remove RwLock
    infos: VecDeque<LevelDesc>,
}

impl LevelInfo {
    /// Serialize every descriptor into a flatbuffers byte buffer.
    pub fn encode(&self) -> Bytes {
        let mut fbb = FlatBufferBuilder::new();

        fbb.start_vector::<protos::LevelDesc>(self.infos.len());
        for desc in &self.infos {
            fbb.push(desc.as_generated_type());
        }
        let batch = fbb.end_vector::<protos::LevelDesc>(self.infos.len());

        let infos =
            protos::LevelInfo::create(&mut fbb, &protos::LevelInfoArgs { infos: Some(batch) });

        fbb.finish(infos, None);
        fbb.finished_data().to_vec()
    }

    pub fn decode(bytes: &[u8]) -> Self {
        // An empty level-info file decodes to an empty container.
        if bytes.is_empty() {
            return Self {
                infos: VecDeque::default(),
            };
        }

        let fb_info = flatbuffers::root::<protos::LevelInfo>(bytes).unwrap();
        let infos = fb_info
            .infos()
            .unwrap()
            .into_iter()
            .rev() // `fbb.push()` in encode reversed the order
            .cloned()
            .map(LevelDesc::from)
            .collect();

        Self { infos }
    }

    /// Give a timestamp and find the level suits it.
    ///
    /// Rick entries' timestamp will not present in level-info.
    /// Thus if given timestamp is larger than the biggest timestamp recorded by
    /// this level-info, `Some(0)` will be returned. `0` is a special [LevelId]
    /// stands for Rick level.
    pub fn get_level_id(&self, timestamp: Timestamp) -> Option<LevelId> {
        // timestamp covered by rick will not present in level-info
        if self.infos.is_empty() || timestamp > self.infos.back().unwrap().end {
            return Some(0);
        }

        self.infos
            .iter()
            .find(|desc| desc.is_timestamp_match(timestamp))
            .map(|desc| desc.id)
    }

    /// Return new level id.
114 | pub(crate) async fn add_level( 115 | &mut self, 116 | start: Timestamp, 117 | end: Timestamp, 118 | file_manager: &FileManager, 119 | ) -> Result { 120 | let mut new_desc = LevelDesc { start, end, id: 0 }; 121 | 122 | let next_id = self.infos.back().map_or_else(|| 1, |desc| desc.id + 1); 123 | new_desc.id = next_id; 124 | self.infos.push_back(new_desc); 125 | self.sync(file_manager).await?; 126 | 127 | Ok(next_id) 128 | } 129 | 130 | pub(crate) async fn remove_last_level(&mut self, file_manager: &FileManager) -> Result<()> { 131 | self.infos.pop_front(); 132 | 133 | self.sync(file_manager).await 134 | } 135 | 136 | /// Sync file infos to disk. Requires read lock. 137 | async fn sync(&self, file_manager: &FileManager) -> Result<()> { 138 | let bytes = self.encode(); 139 | file_manager.sync_level_info(bytes).await?; 140 | 141 | Ok(()) 142 | } 143 | 144 | #[cfg(test)] 145 | fn new(descriptions: Vec) -> Self { 146 | let infos = VecDeque::from(descriptions); 147 | 148 | Self { infos } 149 | } 150 | } 151 | 152 | #[cfg(test)] 153 | mod test { 154 | 155 | use glommio::LocalExecutor; 156 | use tempfile::tempdir; 157 | 158 | use super::*; 159 | 160 | #[test] 161 | fn level_desc_codec() { 162 | let infos = LevelInfo::new(vec![ 163 | LevelDesc { 164 | start: 21, 165 | end: 40, 166 | id: 4, 167 | }, 168 | LevelDesc { 169 | start: 100, 170 | end: 200, 171 | id: 8, 172 | }, 173 | ]); 174 | 175 | let bytes = infos.encode(); 176 | let decoded = LevelInfo::decode(&bytes); 177 | 178 | assert_eq!(decoded, infos); 179 | } 180 | 181 | #[test] 182 | fn add_level() { 183 | let ex = LocalExecutor::default(); 184 | ex.run(async { 185 | let base_dir = tempdir().unwrap(); 186 | let file_manager = FileManager::with_base_dir(base_dir.path(), 1).unwrap(); 187 | 188 | let mut info = LevelInfo::new(vec![]); 189 | info.add_level(0, 9, &file_manager).await.unwrap(); 190 | info.add_level(10, 19, &file_manager).await.unwrap(); 191 | info.add_level(20, 29, &file_manager).await.unwrap(); 192 | 
drop(info); 193 | 194 | let info = file_manager.open_level_info().await.unwrap(); 195 | let infos: Vec<_> = info.infos.iter().copied().collect(); 196 | let expected = vec![ 197 | LevelDesc { 198 | start: 0, 199 | end: 9, 200 | id: 1, 201 | }, 202 | LevelDesc { 203 | start: 10, 204 | end: 19, 205 | id: 2, 206 | }, 207 | LevelDesc { 208 | start: 20, 209 | end: 29, 210 | id: 3, 211 | }, 212 | ]; 213 | 214 | assert_eq!(infos, expected); 215 | }); 216 | } 217 | } 218 | -------------------------------------------------------------------------------- /src/types/mod.rs: -------------------------------------------------------------------------------- 1 | //! Wrapper over all generated types / structs. And implements 2 | //! their utilities. 3 | //! 4 | //! `protos` dependency should only present in this mod. 5 | 6 | mod entry; 7 | mod level_info; 8 | mod rick; 9 | pub mod sstable; 10 | 11 | pub use entry::{Bytes, Entry, EntryMeta, LevelId, ThreadId, TimeRange, Timestamp}; 12 | pub use level_info::LevelInfo; 13 | pub(crate) use rick::{Offset, RickSuperBlock, ValueFormat}; 14 | 15 | // todo: maybe make a trait `Generated` or sth. 16 | // contains `encode()`, `decode()`, `to_generated_type()`. 17 | -------------------------------------------------------------------------------- /src/types/rick.rs: -------------------------------------------------------------------------------- 1 | use flatbuffers::FlatBufferBuilder; 2 | 3 | use super::{Bytes, Timestamp}; 4 | 5 | pub(crate) type Offset = u64; 6 | 7 | pub(crate) type ValueFormat = protos::ValueFormat; 8 | 9 | /// [Rick] file's super block. 10 | /// 11 | /// The binary representation will be padded to 4KB. 
#[derive(Debug, PartialEq, Eq)]
pub(crate) struct RickSuperBlock {
    pub is_ordered: bool,
    pub legal_offset_start: Offset,
    pub legal_offset_end: Offset,
    // todo: add `version` and `crc` fields
    pub value_format: ValueFormat,
    /// Only valid when value format is `CompressedValue`
    pub align_timestamp: Timestamp,
}

impl RickSuperBlock {
    /// Fixed on-disk size of the encoded super block.
    pub const LENGTH: usize = 4096;

    /// Serialize into a flatbuffers buffer padded to [`Self::LENGTH`] bytes.
    pub fn encode(&self) -> Bytes {
        let mut builder = FlatBufferBuilder::new();

        let legal_offset_start = protos::Offset::new(self.legal_offset_start);
        let legal_offset_end = protos::Offset::new(self.legal_offset_end);
        let align_timestamp = protos::Timestamp::new(self.align_timestamp);

        let super_block = protos::RickSuperBlock::create(
            &mut builder,
            &protos::RickSuperBlockArgs {
                is_ordered: self.is_ordered,
                legal_offset_start: Some(&legal_offset_start),
                legal_offset_end: Some(&legal_offset_end),
                value_format: self.value_format,
                align_timestamp: Some(&align_timestamp),
            },
        );

        builder.finish(super_block, None);
        let mut padding_bytes = builder.finished_data().to_vec();

        // The un-padded bytes must fit in LENGTH, otherwise they would be
        // truncated by the resize below.
        debug_assert!(padding_bytes.len() <= Self::LENGTH);
        // Pad to LENGTH. Flatbuffers tracks the payload's own length, so
        // trailing zeros are harmless.
51 | padding_bytes.resize(Self::LENGTH, 0); 52 | padding_bytes 53 | } 54 | 55 | pub fn decode(bytes: &[u8]) -> Self { 56 | let fb_sb = flatbuffers::root::>(bytes).unwrap(); 57 | // let fb_sb = flatbuffers::get_root::>(bytes); 58 | 59 | Self { 60 | is_ordered: fb_sb.is_ordered(), 61 | legal_offset_start: fb_sb.legal_offset_start().unwrap().offset(), 62 | legal_offset_end: fb_sb.legal_offset_end().unwrap().offset(), 63 | value_format: fb_sb.value_format(), 64 | align_timestamp: fb_sb.align_timestamp().unwrap().timestamp(), 65 | } 66 | } 67 | } 68 | 69 | #[cfg(test)] 70 | mod test { 71 | use super::*; 72 | 73 | #[test] 74 | fn rick_super_block_codec() { 75 | let sb = RickSuperBlock { 76 | is_ordered: true, 77 | legal_offset_start: 4096, 78 | legal_offset_end: 8192, 79 | value_format: ValueFormat::RawValue, 80 | align_timestamp: 10086, 81 | }; 82 | 83 | let bytes = sb.encode(); 84 | assert_eq!(bytes.len(), RickSuperBlock::LENGTH); 85 | assert_eq!(sb, RickSuperBlock::decode(&bytes)); 86 | } 87 | } 88 | -------------------------------------------------------------------------------- /src/types/sstable.rs: -------------------------------------------------------------------------------- 1 | use flatbuffers::FlatBufferBuilder; 2 | 3 | use super::{Bytes, LevelId, Offset, ThreadId, Timestamp}; 4 | 5 | /// Enumeration of blocks' type 6 | pub type BlockType = protos::BlockType; 7 | 8 | #[derive(Debug, PartialEq, Eq, Clone, Copy)] 9 | pub(crate) struct BlockInfo { 10 | pub block_type: BlockType, 11 | pub offset: Offset, 12 | pub length: u64, 13 | } 14 | 15 | impl BlockInfo { 16 | pub fn as_generated_type(&self) -> protos::BlockInfo { 17 | let offset = protos::Offset::new(self.offset); 18 | 19 | protos::BlockInfo::new(self.block_type, &offset, self.length) 20 | } 21 | } 22 | 23 | impl From for BlockInfo { 24 | fn from(fb_info: protos::BlockInfo) -> BlockInfo { 25 | Self { 26 | block_type: fb_info.block_type(), 27 | offset: fb_info.offset().offset(), 28 | length: fb_info.length(), 
29 | } 30 | } 31 | } 32 | 33 | /// Will be padded to 4096 bytes. 34 | #[derive(Debug, PartialEq, Eq)] 35 | pub(crate) struct SSTableSuperBlock { 36 | pub thread_id: ThreadId, 37 | pub level_id: LevelId, 38 | pub blocks: Vec, 39 | } 40 | 41 | impl SSTableSuperBlock { 42 | pub const LENGTH: usize = 4096; 43 | 44 | pub fn encode(&self) -> Bytes { 45 | let mut fbb = FlatBufferBuilder::new(); 46 | 47 | let thread_id = protos::ThreadId::new(self.thread_id); 48 | let level_id = protos::LevelId::new(self.level_id); 49 | fbb.start_vector::(self.blocks.len()); 50 | for info in &self.blocks { 51 | fbb.push(info.as_generated_type()); 52 | } 53 | let blocks = fbb.end_vector::(self.blocks.len()); 54 | let blocks = protos::SSTableSuperBlock::create( 55 | &mut fbb, 56 | &protos::SSTableSuperBlockArgs { 57 | thread_id: Some(&thread_id), 58 | level_id: Some(&level_id), 59 | blocks: Some(blocks), 60 | }, 61 | ); 62 | 63 | fbb.finish(blocks, None); 64 | let mut padding_bytes = fbb.finished_data().to_vec(); 65 | 66 | // the un-padding bytes should shorter than 4096 otherwise it will be truncated. 67 | debug_assert!(padding_bytes.len() <= Self::LENGTH); 68 | // padding it. Flatbuffers has the information about payload's length, so 69 | // tailing zero doesn't matter. 70 | padding_bytes.resize(Self::LENGTH, 0); 71 | padding_bytes 72 | } 73 | 74 | pub fn decode(bytes: &[u8]) -> Self { 75 | if bytes.is_empty() { 76 | return Self { 77 | blocks: vec![], 78 | thread_id: 0, 79 | level_id: 0, 80 | }; 81 | } 82 | 83 | let fb_blocks = flatbuffers::root::>(bytes).unwrap(); 84 | let thread_id = fb_blocks.thread_id().unwrap().id(); 85 | let level_id = fb_blocks.level_id().unwrap().id(); 86 | let blocks = fb_blocks 87 | .blocks() 88 | .unwrap() 89 | .into_iter() 90 | .cloned() 91 | .map(BlockInfo::from) 92 | .collect(); 93 | 94 | Self { 95 | thread_id, 96 | level_id, 97 | blocks, 98 | } 99 | } 100 | 101 | /// Get blocks info of given block type. 
There may have many blocks with 102 | /// the same types (but not tested yet. this is a todo). 103 | pub fn get_block_info(&self, block_type: BlockType) -> Vec { 104 | let mut result = vec![]; 105 | for block in &self.blocks { 106 | if block.block_type == block_type { 107 | result.push(*block); 108 | } 109 | } 110 | 111 | result 112 | } 113 | } 114 | 115 | #[derive(Debug, PartialEq, Eq)] 116 | pub(crate) struct IndexBlockEntry { 117 | pub value_offset: Offset, 118 | pub timestamp: Timestamp, 119 | pub key: Bytes, 120 | } 121 | 122 | impl IndexBlockEntry { 123 | pub fn encode(&self) -> Bytes { 124 | let mut fbb = FlatBufferBuilder::new(); 125 | 126 | let value_offset = protos::Offset::new(self.value_offset); 127 | let timestamp = protos::Timestamp::new(self.timestamp); 128 | let key_bytes = fbb.create_vector(&self.key); 129 | 130 | let entry = protos::IndexBlockEntry::create( 131 | &mut fbb, 132 | &protos::IndexBlockEntryArgs { 133 | value_offset: Some(&value_offset), 134 | timestamp: Some(×tamp), 135 | key: Some(key_bytes), 136 | }, 137 | ); 138 | 139 | fbb.finish(entry, None); 140 | fbb.finished_data().to_vec() 141 | } 142 | 143 | pub fn decode(bytes: &[u8]) -> Self { 144 | let fb_entry = flatbuffers::root::>(bytes).unwrap(); 145 | 146 | Self { 147 | value_offset: fb_entry.value_offset().unwrap().offset(), 148 | timestamp: fb_entry.timestamp().unwrap().timestamp(), 149 | key: fb_entry.key().unwrap().bytes().to_vec(), 150 | } 151 | } 152 | } 153 | 154 | #[cfg(test)] 155 | mod test { 156 | use super::*; 157 | 158 | #[test] 159 | fn sstable_super_block_codec() { 160 | let block_info = BlockInfo { 161 | block_type: BlockType::IndexBlock, 162 | offset: 40960, 163 | length: 10240, 164 | }; 165 | let sb = SSTableSuperBlock { 166 | thread_id: 3, 167 | level_id: 5, 168 | blocks: vec![block_info], 169 | }; 170 | 171 | let bytes = sb.encode(); 172 | assert_eq!(bytes.len(), SSTableSuperBlock::LENGTH); 173 | assert_eq!(sb, SSTableSuperBlock::decode(&bytes)); 174 | } 175 | 176 
| #[test] 177 | fn sstable_index_entry_codec() { 178 | let entry = IndexBlockEntry { 179 | value_offset: 40960, 180 | timestamp: 12345, 181 | key: b"value".to_vec(), 182 | }; 183 | 184 | let bytes = entry.encode(); 185 | assert_eq!(entry, IndexBlockEntry::decode(&bytes)); 186 | } 187 | } 188 | -------------------------------------------------------------------------------- /src/util.rs: -------------------------------------------------------------------------------- 1 | use std::borrow::Borrow; 2 | use std::cmp::Ordering; 3 | use std::convert::TryInto; 4 | use std::marker::PhantomData; 5 | use std::ops::Index; 6 | 7 | use crate::error::{HelixError, Result}; 8 | use crate::types::{Bytes, Entry}; 9 | 10 | pub(crate) trait KeyExtractor>: Eq { 11 | fn key(data: &T) -> &[u8]; 12 | } 13 | 14 | impl + Borrow>> KeyExtractor for Vec { 15 | fn key(data: &T) -> &[u8] { 16 | &data.index(0).key 17 | } 18 | } 19 | 20 | // todo: remove Eq bound? 21 | pub trait Comparator: Send + Sync + Eq { 22 | fn cmp(lhs: &[u8], rhs: &[u8]) -> Ordering 23 | where 24 | Self: Sized; 25 | } 26 | 27 | #[derive(Eq, PartialEq)] 28 | pub(crate) struct OrderingHelper> { 29 | pub data: T, 30 | _c: PhantomData, 31 | } 32 | 33 | impl> Ord for OrderingHelper { 34 | fn cmp(&self, other: &Self) -> Ordering { 35 | C::cmp(T::key(&self.data), T::key(&other.data)) 36 | } 37 | } 38 | 39 | impl> PartialOrd for OrderingHelper { 40 | fn partial_cmp(&self, other: &Self) -> Option { 41 | Some(self.cmp(other)) 42 | } 43 | } 44 | 45 | impl> From for OrderingHelper { 46 | fn from(data: T) -> Self { 47 | Self { 48 | data, 49 | _c: PhantomData, 50 | } 51 | } 52 | } 53 | 54 | #[derive(Eq, PartialEq)] 55 | /// This comparator returns `Ordering::Equal` for every operands. 56 | /// Which will ignore the provided left and right bound and result a full table 57 | /// scan. 
58 | /// 59 | /// # Example 60 | /// ```rust 61 | /// # use std::cmp::Ordering; 62 | /// # use helixdb::NoOrderComparator; 63 | /// # use crate::helixdb::Comparator; 64 | /// assert_eq!( 65 | /// NoOrderComparator::cmp(&[1, 2, 3], &[2, 3, 3]), 66 | /// Ordering::Equal 67 | /// ); 68 | /// assert_eq!(NoOrderComparator::cmp(&[1, 2, 3], &[1, 2]), Ordering::Equal); 69 | /// assert_eq!( 70 | /// NoOrderComparator::cmp(&[1, 2, 3], &[1, 2, 3]), 71 | /// Ordering::Equal 72 | /// ); 73 | /// ``` 74 | pub struct NoOrderComparator {} 75 | 76 | impl Comparator for NoOrderComparator { 77 | fn cmp(_: &[u8], _: &[u8]) -> Ordering { 78 | Ordering::Equal 79 | } 80 | } 81 | 82 | #[derive(PartialEq, Eq)] 83 | /// This comparator describes lexicographical order on `[u8]` 84 | /// 85 | /// # Example 86 | /// ```rust 87 | /// # use std::cmp::Ordering; 88 | /// # use helixdb::LexicalComparator; 89 | /// # use crate::helixdb::Comparator; 90 | /// assert_eq!( 91 | /// LexicalComparator::cmp(&[1, 2, 3], &[2, 3, 3]), 92 | /// Ordering::Less 93 | /// ); 94 | /// assert_eq!( 95 | /// LexicalComparator::cmp(&[1, 2, 3], &[1, 2]), 96 | /// Ordering::Greater 97 | /// ); 98 | /// assert_eq!( 99 | /// LexicalComparator::cmp(&[1, 2, 3], &[1, 2, 3]), 100 | /// Ordering::Equal 101 | /// ); 102 | /// ``` 103 | pub struct LexicalComparator {} 104 | 105 | impl Comparator for LexicalComparator { 106 | fn cmp(lhs: &[u8], rhs: &[u8]) -> Ordering { 107 | lhs.cmp(rhs) 108 | } 109 | } 110 | 111 | pub fn encode_u64(data: u64) -> Bytes { 112 | data.to_le_bytes().to_vec() 113 | } 114 | 115 | pub fn decode_u64(data: &[u8]) -> u64 { 116 | u64::from_le_bytes(data.try_into().unwrap()) 117 | } 118 | 119 | /// Check the length of data. 
Return `HelixError::IncompatibleLength` 120 | pub fn check_bytes_length(data: &[u8], length: usize) -> Result<()> { 121 | if data.len() == length { 122 | Ok(()) 123 | } else { 124 | Err(HelixError::IncompatibleLength(length, data.len())) 125 | } 126 | } 127 | 128 | pub(crate) trait AssertSend: Send {} 129 | 130 | pub(crate) trait AssertSync: Sync {} 131 | --------------------------------------------------------------------------------