├── .github ├── dependabot.yml └── workflows │ └── pages.yml ├── .gitignore ├── Cargo.toml ├── LICENSE ├── README.md ├── book.toml ├── doc ├── .gitignore └── src │ ├── SUMMARY.md │ ├── arch │ ├── compaction.md │ ├── datafile.md │ ├── manifest.md │ ├── memtable.md │ ├── min-hash.md │ ├── separate-value.md │ ├── sharding.md │ ├── sstable.md │ └── virtual-sstable.md │ └── custom.css └── openkv └── Cargo.toml /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | updates: 3 | - package-ecosystem: cargo 4 | directory: "/" 5 | schedule: 6 | interval: daily 7 | time: "17:00" 8 | open-pull-requests-limit: 2 9 | -------------------------------------------------------------------------------- /.github/workflows/pages.yml: -------------------------------------------------------------------------------- 1 | on: 2 | push: 3 | branches: 4 | - main 5 | paths: 6 | - 'doc/**' 7 | - 'book.toml' 8 | - 'README.md' 9 | - '.github/workflows/**' 10 | jobs: 11 | deploy-doc: 12 | runs-on: ubuntu-latest 13 | steps: 14 | - name: Checkout 15 | uses: actions/checkout@v2 16 | 17 | - name: Install mdbook plugins 18 | uses: drmingdrmer/mdbook-full@main 19 | 20 | - name: Build mdbook 21 | run: mdbook build 22 | env: 23 | RUST_LOG: debug 24 | 25 | - name: Deploy to github page 26 | uses: peaceiris/actions-gh-pages@v3 27 | with: 28 | github_token: ${{ secrets.GITHUB_TOKEN }} 29 | # Because in `book.toml` there are two `output` table thus the output location is changed. 
30 | publish_dir: ./doc/book/html 31 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Generated by Cargo 2 | # will have compiled files and executables 3 | /target/ 4 | 5 | # Remove Cargo.lock from gitignore if creating an executable, leave it for libraries 6 | # More information here https://doc.rust-lang.org/cargo/guide/cargo-toml-vs-cargo-lock.html 7 | Cargo.lock 8 | 9 | # These are backup files generated by rustfmt 10 | **/*.rs.bk 11 | -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [workspace] 2 | members = [ 3 | "openkv", 4 | ] 5 | exclude = [ 6 | "example-todo", 7 | ] 8 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 
22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. 
For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. 
If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. 
You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. 
(Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # openkv 2 | 3 | LSM based key-value store in rust, design for cloud. 4 | 5 | This project is in **Alpha** phase. 6 | API and data layout will change rapidly. 7 | 8 | # Goal 9 | 10 | - [ ] Transactional key-value store: 11 | Provide transactional write, and snapshot read. 12 | 13 | - [ ] Pluggable WAL: using local fs, raft log, or Kafka as a WAL provider. 14 | 15 | - [ ] Minimize ser/de cost: 16 | Access to serialized data in SSTable without deserializing. 17 | 18 | - [ ] Design for large datasets: 19 | 20 | Reduce unnecessary IO with an efficient data index. 21 | Reduce in-memory data size by combining sparse-index and bloom filter. 22 | 23 | - [ ] Flexible compaction policy: 24 | 25 | Reduces unnecessary compaction by comparing SSTable cardinality signature. 26 | Reduces IO consumption with partially SSTable merge down. 
27 | Reclaims space quickly with cross-level SSTable merge. 28 | 29 | - [ ] Design for the cloud: 30 | Stores less accessed SSTable on S3. 31 | 32 | - [ ] Internal sharding 33 | A flattened structure reduces write/read amplification. 34 | A span with a heavy load will be pushed down more frequently than other spans. 35 | -------------------------------------------------------------------------------- /book.toml: -------------------------------------------------------------------------------- 1 | [book] 2 | language = "en" 3 | multilingual = false 4 | src = "doc/src" 5 | title = "openkv" 6 | description = "The openkv document" 7 | 8 | [build] 9 | build-dir = "doc/book" 10 | create-missing = false 11 | 12 | [preprocessor.svgbob] 13 | text_width = 8.0 14 | text_height = 16.0 15 | class = "bob" 16 | font_family = "arial" 17 | font_size = 14.0 18 | stroke_width = 2.0 19 | # there's using css-variables from theme: 20 | stroke_color = "var(--fg)" # see default theme / variables.css 21 | background_color = "transparent" # also useful `var(--bg)` 22 | # all properties are optional. 23 | 24 | [preprocessor.katex] 25 | 26 | # required by preprocessor.katex 27 | # https://github.com/lzanini/mdbook-katex 28 | [output.katex] 29 | 30 | # Output intermedia md for debug only 31 | [output.markdown] 32 | 33 | [output.html] 34 | additional-css = ["doc/src/custom.css"] 35 | # mathjax-support = true 36 | 37 | 38 | [output.linkcheck] 39 | # Should we check links on the internet? Enabling this option adds a 40 | # non-negligible performance impact 41 | # TODO: enable this 42 | # follow-web-links = true 43 | 44 | # Are we allowed to link to files outside of the book's root directory? This 45 | # may help prevent linking to sensitive files (e.g. "../../../../etc/shadow") 46 | traverse-parent-directories = false 47 | 48 | # If necessary, you can exclude one or more links from being checked with a 49 | # list of regular expressions. The regex will be applied to the link href (i.e. 
50 | # the `./index.html` in `[some page](./index.html)`) so it can be used to 51 | # ignore both web and filesystem links. 52 | # 53 | # Hint: you can use TOML's raw strings (single quote) to avoid needing to 54 | # escape things twice. 55 | exclude = [ 'google\.com' ] 56 | 57 | # The User-Agent to use when sending web requests 58 | user-agent = "mdbook-linkcheck-0.4.0" 59 | 60 | # The number of seconds a cached result is valid for (12 hrs by default) 61 | cache-timeout = 43200 62 | 63 | # How should warnings be treated? 64 | # 65 | # - "warn" will emit warning messages 66 | # - "error" treats all warnings as errors, failing the linkcheck 67 | # - "ignore" will ignore warnings, suppressing diagnostic messages and allowing 68 | # the linkcheck to continuing 69 | warning-policy = "error" 70 | 71 | # Extra HTTP headers that must be send to certain web sites 72 | # in order to link check to succeed. 73 | # 74 | # This is a dictionary (map), with keys being regexes 75 | # matching a set of web sites, and values being an array of 76 | # the headers. 77 | [output.linkcheck.http-headers] 78 | # Any hyperlink that contains this regexp will be sent 79 | # the "Accept: text/html" header 80 | 'crates\.io' = ["Accept: text/html"] 81 | 82 | # mdbook-linkcheck will interpolate environment variables into your header via 83 | # $IDENT. 84 | # 85 | # If this is not what you want you must escape the `$` symbol, like `\$TOKEN`. 86 | # `\` itself can also be escaped via `\\`. 87 | # 88 | # Note: If interpolation fails, the header will be skipped and the failure will 89 | # be logged. This can be useful if a particular header isn't always necessary, 90 | # but may be helpful (e.g. when working with rate limiting). 
91 | 92 | # 'website\.com' = ["Authorization: Basic $TOKEN"] 93 | -------------------------------------------------------------------------------- /doc/.gitignore: -------------------------------------------------------------------------------- 1 | /book 2 | -------------------------------------------------------------------------------- /doc/src/SUMMARY.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | # Architecture 5 | 6 | - [MemTable](arch/memtable.md) 7 | - [SSTable](arch/sstable.md) 8 | - [Virtual SSTable](arch/virtual-sstable.md) 9 | 10 | - [Separate Value](arch/separate-value.md) 11 | 12 | - [Internal Sharding](arch/sharding.md) 13 | 14 | - [Compaction](arch/compaction.md) 15 | - [min-hash](arch/min-hash.md) 16 | 17 | - [Data File](arch/datafile.md) 18 | 19 | - [Manifest](arch/manifest.md) 20 | 21 | -------------------------------------------------------------------------------- /doc/src/arch/compaction.md: -------------------------------------------------------------------------------- 1 | # Compaction 2 | 3 | Push a SSTable down and merge it with all overlapping SSTable at the next lower 4 | level. 5 | 6 | Leave only the record with the greatest `seq` when merging. 7 | 8 | When reaching the bottom level, i.e, level-0, a tombstone record can be removed 9 | for good. 10 | -------------------------------------------------------------------------------- /doc/src/arch/datafile.md: -------------------------------------------------------------------------------- 1 | # DataFile 2 | 3 | Multiple SSTable are stored in a large file, the DataFile. 4 | 5 | DataFile only allocates or reclaims space by a fixed size: its allocation unit. 6 | There is one DataFile for each allocation unit. 7 | 8 | Assuming in openkv the allocation unit are `4MB` and `16MB`, 9 | then there are two `DataFile`s: `df-4mb` and `df-16mb`. 10 | 11 | Each DataFile has a corresponding bitmap for tracking allocated unit. 
12 | 13 | The bitmap is part of the [manifest][] 14 | 15 | ```bob 16 | 17 | +-+-+-+-+-+ 18 | "bitmap-4mb:" |1|0|1|1|0| 19 | +++-+++++-+ 20 | | | '--------------. 21 | | '---------. | 22 | v v v 23 | +------+------+------+------+------+ 24 | "DataFile-4mb:" |"SST0"| |"SST0"|"SST0"|"..." | 25 | +------+------+------+------+------+ 26 | 27 | 28 | +-+-+-+-+-+ 29 | "bitmap-16mb:" |1|0|1|1|0| 30 | +++-+++++-+ 31 | | | '--------------. 32 | | '---------. | 33 | v v v 34 | 35 | "..." 36 | 37 | ``` 38 | 39 | ## Commit a SSTable 40 | 41 | - Find the first `0` in df-bitmap. 42 | This bitmap has to be indexed to speed up searching for the first `0`. 43 | 44 | - Flush and fsync SSTable data to DataFile. 45 | 46 | - Flush and fsync bitmap in [manifest][]. 47 | 48 | 49 | 50 | [manifest]: manifest.md 51 | 52 | -------------------------------------------------------------------------------- /doc/src/arch/manifest.md: -------------------------------------------------------------------------------- 1 | # Manifest 2 | 3 | Manifest in openkv itself is a tiny db with WAL of operations to the system and a snapshot. 4 | 5 | 6 | Snapshot: 7 | 8 | ``` 9 | version: String 10 | 11 | data_file_bitmaps: { 12 | "4mb": Bitmap 13 | "16mb": Bitmap 14 | } 15 | 16 | spans: [ 17 | { 18 | start: String 19 | end: String 20 | vsstables: { 21 | 0: [VSST1, VSST2, ...] 22 | 1: [VSSTi, VSSTj, ...], 23 | } 24 | }, 25 | ... 26 | ] 27 | 28 | separated_values: [ 29 | { 30 | start_value_id: String, 31 | end_value_id: String 32 | vsstables: { 33 | 0: [VSST1, VSST2, ...] 34 | 1: [VSSTi, VSSTj, ...], 35 | } 36 | 37 | }, 38 | ... 39 | 40 | ] 41 | 42 | ``` 43 | -------------------------------------------------------------------------------- /doc/src/arch/memtable.md: -------------------------------------------------------------------------------- 1 | # MemTable 2 | 3 | MemTable in LSM is the in-memory representation of key-value pairs. 4 | 5 | It can be considered as a `BTreeMap`. 
6 | 7 | 8 | There is only one active `MemTable` for every openkv instance. 9 | The MemTable is shared by all [sub-LSM][]s. 10 | 11 | 12 | # Flush 13 | 14 | Flush MemTable to SSTable. 15 | 16 | If the MemTable becomes too large, flush one continuous portion (span) into a 17 | sub-LSM. 18 | 19 | MemTable always flushes the biggest span, i.e., the one with the most keys, unless 20 | there is pressure to reclaim WAL space. 21 | 22 | 23 | [sub-LSM]: sharding.md#sub-lsm 24 | -------------------------------------------------------------------------------- /doc/src/arch/min-hash.md: -------------------------------------------------------------------------------- 1 | # Min-hash 2 | 3 | [min-hash][] is a signature of all keys in an SSTable and is used to optimize the 4 | compaction. E.g., to find the pair of SSTables with the most keys in common. 5 | 6 | # Jaccard similarity 7 | 8 | Let `U` be a set and `A` and `B` be subsets of `U`, 9 | then the **Jaccard index** is defined to be the ratio of the number of elements of their intersection and 10 | the number of elements of their union: 11 | 12 | $$ 13 | J(A,B)={{|A\cap B|} \over {|A\cup B|}}. 14 | $$ 15 | 16 | 17 | Let `h` be a hash function that maps the members of `U` to distinct integers, e.g., let `h(x) = SHA1(x)`. 
18 | 19 | For any set `S`, define `H_min(S)` to be the minimal integer (hash value) of members in `S`: 20 | 21 | H_min(S) = min({h(x) | x ∈ S}) 22 | 23 | Now, applying `H_min` to both `A` and `B`, 24 | and assuming no hash collisions, 25 | the probability that `H_min(A) == H_min(B)` is true is equal to the similarity `J(A,B)`: 26 | 27 | Pr[ H_min(A) = H_min(B) ] = J(A,B) 28 | 29 | # Estimate the number of common keys 30 | 31 | Given two SSTables `A` and `B` and their similarity `p = J(A,B)`, the number of common keys `c = |A∩B|` can be calculated with: 32 | 33 | $$ 34 | c = \frac{p}{1+p} (|A| + |B|) 35 | $$ 36 | 37 | 38 | # Estimate the Probability 39 | 40 | The probability can be estimated with `y/k`, 41 | where `k` is the number of different hash functions 42 | and `y` is the number of hash functions hᵢ for which hᵢ(A) == hᵢ(B) 43 | 44 | 45 | ### Generate signature for a SSTable 46 | 47 | ```rust 48 | fn gen_sstable_signature(sstable: &SSTable, number_of_hashes: usize) -> Vec{ 49 | 50 | // Result signature 51 | let signature = Vec::with_capacity(number_of_hashes); 52 | 53 | // Distribute hash values to `k` buckets to simulate `k` hash functions. 54 | let buckets = Vec::with_capacity(number_of_hashes); 55 | 56 | for k in sstable.keys() { 57 | let h = SHA1(k) as u64; 58 | buckets[h%number_of_hashes].push(h); 59 | } 60 | 61 | for i in 0..number_of_hashes { 62 | signature[i] = min(buckets[i]); 63 | } 64 | 65 | signature 66 | } 67 | ``` 68 | 69 | ### Calculate similarity of two SSTable 70 | 71 | 72 | ```rust 73 | 74 | fn calc_sig(a: &SSTable, b:&SSTable) -> f64 { 75 | let eq = 0.0; 76 | for i in 0..number_of_hashes: 77 | if a.signature[i] == b.signature[i] { 78 | eq += 1 79 | } 80 | eq / number_of_hashes 81 | 82 | } 83 | ``` 84 | 85 | ## Simulation 86 | 87 | We provide a Python script [min-hash.py][] to estimate the accuracy of this algorithm. 
88 | 89 | | Config | value | 90 | | :-- | --: | 91 | | Number of hash functions(buckets) | 128 | 92 | | Hash value | u64 | 93 | | Space cost | sizeof(u64) * 128 = 1KB | 94 | 95 | Actual vs Estimated: 96 | 97 | | \|A∪B\| | \|A\| | \|B\| | Actual (A∩B)/(A∪B)% | Estimated% | error% | 98 | | --: | --: | --: | --: | --: | --: | 99 | | 1000 | 360 | 840 | 20.00% | 21.88% | 1.87% | 100 | | 1000 | 520 | 880 | 40.00% | 38.28% | -1.72% | 101 | | 1000 | 680 | 920 | 60.00% | 60.94% | 0.94% | 102 | | 1000 | 839 | 959 | 80.16% | 78.91% | -1.25% | 103 | | 1000 | 1000 | 1000 | 100.00% | 100.00% | 0.00% | 104 | | 10000 | 3600 | 8400 | 20.00% | 15.62% | -4.38% | 105 | | 10000 | 5200 | 8800 | 40.00% | 35.16% | -4.84% | 106 | | 10000 | 6800 | 9200 | 60.00% | 60.94% | 0.94% | 107 | | 10000 | 8399 | 9599 | 80.02% | 85.16% | 5.14% | 108 | | 10000 | 10000 | 10000 | 100.00% | 100.00% | 0.00% | 109 | | 100000 | 36000 | 84000 | 20.00% | 21.88% | 1.87% | 110 | | 100000 | 52000 | 88000 | 40.00% | 47.66% | 7.66% | 111 | | 100000 | 68000 | 92000 | 60.00% | 62.50% | 2.50% | 112 | | 100000 | 83999 | 95999 | 80.00% | 80.47% | 0.47% | 113 | | 100000 | 100000 | 100000 | 100.00% | 100.00% | 0.00% | 114 | | 1000000 | 360000 | 840000 | 20.00% | 19.53% | -0.47% | 115 | | 1000000 | 520000 | 880000 | 40.00% | 40.62% | 0.62% | 116 | | 1000000 | 680000 | 920000 | 60.00% | 58.59% | -1.41% | 117 | | 1000000 | 839999 | 959999 | 80.00% | 75.78% | -4.22% | 118 | | 1000000 | 1000000 | 1000000 | 100.00% | 100.00% | 0.00% | 119 | 120 | 121 | # Optimize compaction with min-hash 122 | 123 | With min-hash we can find the best choice to compact. 124 | 125 | For every SSTable, 126 | calculate the Jaccard index of it and the overlapping SSTable at the lower level. 127 | 128 | Push down the one with the max Jaccard index. 129 | 130 | - The space cost is negligible. Only `1KB` more space for every SSTable. 131 | 132 | - The time complexity is `O(n)`, where `n` is the number of SSTable in the 133 | system. 
Because for every SSTable, there are about only `k` SSTable that 134 | have overlapping key range with it.where `k` is the level fanout. 135 | 136 | 137 | [min-hash]: https://en.wikipedia.org/wiki/MinHash 138 | [min-hash.py]: https://drmingdrmer.github.io/post-res/compact/min-hash.py 139 | -------------------------------------------------------------------------------- /doc/src/arch/separate-value.md: -------------------------------------------------------------------------------- 1 | # Separate value 2 | 3 | ```bob 4 | 5 | 6 | "flush to SSTable with separate value" 7 | 8 | |"Separate values:" 9 | .------+------+------. | 10 | |"k1" |"k2" |"k3" | | 11 | +------+------+------+ | 12 | |"bar" |"vid2"|"vid3"| | 13 | `------+--+---+--+---' | 14 | | `-------------------------. 15 | '-------------------------. | 16 | | v v 17 | | .------+------+------. .------+------+------. 18 | | |"vid2"|"vid3"|"..." | |"vidi"|"vidj"|"..." | 19 | | +------+------+------+ +------+------+------+ 20 | | |"v2" |"v3" |"..." | |"vi" |"vj" |"..." | 21 | | `------+------+------' `------+------+------' 22 | 23 | 24 | "k3 is removed at level-0 by compaction:" 25 | 26 | |"Separate values:" 27 | |.-----. 28 | || v 29 | .------+------+------. || .------+------. 30 | |"k1" |"k2" |"k3" | || |"vid3"|"..." | <- "Tombstone" 31 | +------+------+------+ || `------+------' 32 | |"bar" |"vid2"|"vid3"| || 33 | `------+--+---+--+---' || 34 | | `------------' 35 | '-------------------------. 36 | | v 37 | | .------+------+------. .------+------+------. 38 | | |"vid2"|"vid3"|"..." | |"vidi"|"vidj"|"..." | 39 | | +------+------+------+ +------+------+------+ 40 | | |"v2" |"v3" |"..." | |"vi" |"vj" |"..." 
| 41 | | `------+------+------' `------+------+------' 42 | 43 | ``` 44 | -------------------------------------------------------------------------------- /doc/src/arch/sharding.md: -------------------------------------------------------------------------------- 1 | # Sharding 2 | 3 | One of the performance issues of an LSM tree is the write/read amplification. 4 | When a db becomes bigger, the number of levels(`l`) in an LSM increases, in 5 | logarithmic order. 6 | 7 | A record will be rewritten([compaction][]) `l` times to enter the bottom level. 8 | Assuming the fanout of every level is `n`, 9 | every record amplifies write IO by `O(l) * n` times. 10 | 11 | By splitting an LSM into several smaller ones, `l` becomes smaller and the 12 | write/read amplification will be reduced. 13 | 14 | Thus openkv organizes its data in a way that resembles a two-level BTree: 15 | - The btree root node is an array of all sub-LSMs, sorted in key order. 16 | - Each of the leaf nodes is a small LSM. 17 | 18 | 19 | ```bob 20 | 21 | 22 | .------------+----------+------------. 23 | | "(-oo, b)" | "[b, e)" | "[e, +oo)" | 24 | `------------+----------+------------' 25 | | | '---------~~~ 26 | .------. | `----------. 27 | | "L3" | | | 28 | `------' v v 29 | .------. .------. .------. 30 | | "L2" | | "L2" | | "L2" | 31 | `------' ------> `------' `------' 32 | .------. .------. .------. .------. 33 | | "L1" | | "L1" | | "L1" | | "L1" | 34 | `------' `------' `------' `------' 35 | .------. .------. .------. .------. .------. .------. .------. 36 | | "L0" | | "L0" | | "L0" | | "L0" | | "L0" | | "L0" | | "L0" | 37 | `------' `------' `------' `------' `------' `------' `------' 38 | ``` 39 | 40 | 41 | ## Sub-LSM 42 | 43 | Sub-LSM is a small LSM tree, with a limited number of levels. 44 | 45 | 46 | 47 | # Split and Merge 48 | 49 | - A sub-LSM will be split if its number of levels exceeds a threshold (e.g., 3). 
50 | 51 | - Two adjacent sub-LSMs are merged into one if the number of levels of both of them becomes lower than 1/3 of the threshold. 52 | 53 | 54 | 55 | [compaction]: compaction.md 56 | -------------------------------------------------------------------------------- /doc/src/arch/sstable.md: -------------------------------------------------------------------------------- 1 | # SSTable 2 | 3 | An SSTable (sorted string table) is the same concept used in levelDB or rocksDB. 4 | 5 | The size of an SSTable can only be `4MB` (for tombstone-only SSTable) or `16MB` (regular SSTable). 6 | 7 | The data layout inside an SSTable: 8 | 9 | 10 | ```bob 11 | .-----------+---------+---------+-------------+-----------------------. 12 | | "Header" | "Metas" | "Index" | "Records" | "Checksum" | 13 | `-----------+---------+---------+-------------+-----------------------' 14 | ``` 15 | 16 | ## Header 17 | 18 | `Header` is fixed-size and contains an SSTable format version etc. 19 | 20 | ## Metas 21 | 22 | `Metas` stores configs of this SSTable, such as key types, value types, etc. 23 | E.g., a fixed-size key or value may use a type-specific store format. 24 | 25 | A fixed-size key does not need to store key-suffix in the `Records` segment. 26 | TODO: prove it. 27 | 28 | `Metas` also stores the cardinality signature of all keys to optimize SSTable compaction. 29 | 30 | 31 | ## Index 32 | 33 | `Index` is a trie built from all keys stored in this SSTable, without 34 | single-branch tail. 35 | A leaf node in the trie stores the corresponding offset of a record in 36 | `Records`. 37 | 38 | It contains enough info to locate a present key in the `Records` segment. 39 | But there could be a false-positive for a key not in `Records`. 40 | 41 | For a given example of 3 key values: 42 | 43 | ``` 44 | fantastice: 1 45 | foo: 5 46 | food: 3 47 | ``` 48 | 49 | The Index would be like this: 50 | 51 | ```bob 52 | 53 | .-. .-. 54 | |f|-+--->|a| : "offset for fantastice" 55 | '-' | '-' 56 | | 57 | | .-. .-. 
58 | `--->|o|--->|o| : "offset of foo" 59 | '-' '-' 60 | | 61 | | .-. 62 | `--> |d| : "offset of food" 63 | '-' 64 | 65 | ``` 66 | 67 | ### Caching 68 | 69 | The `Index` should preferably reside in memory, while `Records` is left on disk. 70 | 71 | ### Sparse Index 72 | 73 | Not every key in the `Index` has a corresponding offset stored for it. 74 | The optimization should be IO-oriented: reading several bytes may cost 75 | almost the same resources as reading several kilobytes. 76 | Thus the `Index` only needs to store one offset for several adjacent keys. 77 | 78 | And we just need to limit the distance between two adjacent offsets in the 79 | `Index` to be less than an expected value, e.g., 16 KB. 80 | 81 | 82 | 83 | 84 | ## Records 85 | 86 | The `Records` segment stores the key suffixes that are not included in `Index`, and the 87 | values. 88 | 89 | `Records` is indexed by offsets that are stored in `Index`. 90 | 91 | - `key-suffix` may be empty. 92 | - `seq` is a monotonic sequence number. 93 | - `value` is variable-length serialized `bytes`. 94 | 95 | ```bob 96 | .--------------+-----+---------. 97 | | "key-suffix" |"seq"| "value" | 98 | |--------------+-----+---------| 99 | | "key-suffix" |"seq"| "value" | 100 | |--------------+-----+---------| 101 | | "..." |"..."| "..." | 102 | '--------------+-----+---------' 103 | ``` 104 | 105 | 106 | 107 | ## Checksum 108 | 109 | SHA256 of all preceding bytes in the SSTable. 110 | 111 | 112 | # Positive and Negative SSTable 113 | 114 | One [flush][] builds 2 SSTables: 115 | - One of them contains all inserted or updated records, the Positive SSTable. 116 | - The other one contains only deleted records, i.e., tombstones: the Negative SSTable. 117 | 118 | Both have the same **level** and the same key **range**. 119 | In other words, we store updated records and deleted records separately. 120 | 121 | ```bob 122 | "Records part in P-SSTable:" 123 | 124 | .--------------+-----+---------.
125 | | "key-suffix" |"seq"| "value" | 126 | |--------------+-----+---------| 127 | | "key-suffix" |"seq"| "value" | 128 | |--------------+-----+---------| 129 | | "..." |"..."| "..." | 130 | '--------------+-----+---------' 131 | 132 | "Records part in N-SSTable:" 133 | 134 | .--------------+-----. 135 | | "key-suffix" |"seq"| 136 | |--------------+-----| 137 | | "key-suffix" |"seq"| 138 | |--------------+-----| 139 | | "..." |"..."| 140 | '--------------+-----' 141 | ``` 142 | 143 | By separating P/N records, 144 | it is possible to compact an N-SSTable down to reclaim space quickly. 145 | 146 | E.g., a Negative SSTable `N1` can be pushed down and merged with `N2` and `N3`, 147 | without touching `P1`, because to `N1`, `P1` is **transparent**: `N1 ∩ P1 = ∅` 148 | 149 | ```bob 150 | .------. 151 | .---------+ "N1" +---------. 152 | | '------' | 153 | | .------. | 154 | | | "P1" | | 155 | | '------' | 156 | | | 157 | | .------. .------. | 158 | +-->| "N2" | | "N3" |<--+ 159 | '------' '------' 160 | .------. .------. 161 | | "P2" | | "P3" | 162 | '------' '------' 163 | ``` 164 | 165 | 166 | # Reading a record 167 | 168 | - Search for the key in the `Index`. If it matches a path in the trie, 169 | scan `Records` from the offset up to the next offset to find the record. 170 | 171 | - If the key-suffix also matches the search key, return the value. 172 | 173 | 174 | # Performance considerations 175 | 176 | - A trie naturally compresses prefixes, and an SSTable only stores a small range of keys, 177 | thus the space cost is low enough. 178 | 179 | For a static trie, a succinct data structure can be adopted to reduce the size 180 | of the trie. 181 | 182 | Our target budget for a key is only several bytes, so that keeping the `Index` 183 | in memory is possible and unnecessary IO can be reduced. 184 | 185 | - A trie is not cache-friendly, thus a single trie should not be very large. 186 | The expected time cost for a query in the `Index` is less than 200 ns.
187 | 188 | 189 | [flush]: memtable.md#flush 190 | -------------------------------------------------------------------------------- /doc/src/arch/virtual-sstable.md: -------------------------------------------------------------------------------- 1 | # Virtual SSTable 2 | 3 | `VSSTable` is a unit to manage SSTables. 4 | It is needed because an SSTable may be referenced more than once, 5 | e.g., when splitting a span. 6 | 7 | ```bob 8 | 9 | .~~~~~~. .~~~~~~. .~~~~~~. | .~~~~~~. 10 | ! "V4" ! ! "V5" ! ! "V4" ! | ! "V5" ! 11 | `~~+~~~' `~~+~~~' `~~+~~~' | `~~+~~~' 12 | .~~~~~~. | .~~~~~~. | .~~~~~~. split .~~~~~~. | .~~~~~~. | .~~~~~~. | .~~~~~~. 13 | ! "V1" ! | ! "V2" ! | ! "V3" ! --------> ! "V1" ! | ! "V2" ! | ! "V2'"! | ! "V3" ! 14 | `~~+~~~' | `~~+~~~' | `~~+~~~' `~~+~~~' | `~~~~~~' | `~~~~~~' | `~~+~~~' 15 | | | | | | | | | | | | | 16 | | | | | | | | | | | | | 17 | | | | | | | | | | | | | 18 | | | | | | | | | | | | | 19 | | | | | | | | | | | | | 20 | | v | v | | v | | | v | 21 | | .------. | .------. | | .------. '---. .---' .------. | 22 | | | "T4" | | | "T5" | | | | "T4" | | | | "T5" | | 23 | v `------' v `------' v v `------' v v `------' v 24 | .------. .------. .------. .------. .------. .------. 25 | | "T1" | | "T2" | | "T3" | | "T1" | | "T2" | | "T3" | 26 | '------' `------' `------' '------' `------' '------' 27 | ``` 28 | 29 | An SSTable is removed when there is no VSSTable referencing it. 30 | 31 | A VSSTable has a key range that is a subset of the key range of the SSTable it references.
32 | 33 | 34 | ``` 35 | struct VSSTable { 36 | key_start: String, 37 | key_end: String, 38 | sstable: SSTableId, 39 | } 40 | ``` 41 | -------------------------------------------------------------------------------- /doc/src/custom.css: -------------------------------------------------------------------------------- 1 | svg.bob line { 2 | stroke: #888; 3 | } 4 | 5 | svg.bob path { 6 | stroke: #888; 7 | } 8 | 9 | svg.bob line.dashed { 10 | stroke: #ccc; 11 | } 12 | 13 | svg.bob circle { 14 | stroke: #bbb; 15 | } 16 | -------------------------------------------------------------------------------- /openkv/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "openkv" 3 | version = "0.1.0" 4 | edition = "2021" 5 | license = "Apache-2.0" 6 | readme = "../README.md" 7 | 8 | [dependencies] 9 | anyerror = { version = "0.1.4", features = ["anyhow"]} 10 | async-trait = "0.1.36" 11 | byte-unit = "4.0.12" 12 | bytes = "1.0" 13 | derive_more = { version="0.99.9" } 14 | futures = "0.3" 15 | maplit = "1.0.2" 16 | rand = "0.8" 17 | serde = { version="1", features=["derive"] } 18 | clap = { version = "3.0.7", features = ["derive", "env"] } 19 | thiserror = "1.0.29" 20 | tokio = { version="1.8", default-features=false, features=["fs", "io-util", "macros", "rt", "rt-multi-thread", "sync", "time"] } 21 | tracing = "0.1.29" 22 | tracing-futures = "0.2.4" 23 | 24 | [dev-dependencies] 25 | anyhow = "1.0.32" 26 | lazy_static = "1.4.0" 27 | memstore = { version="0.2.0", path="../memstore" } 28 | pretty_assertions = "1.0.0" 29 | tracing-appender = "0.2.0" 30 | tracing-subscriber = { version = "0.3.3", features=["env-filter"] } 31 | 32 | 33 | 34 | [features] 35 | docinclude = [] # Used only for activating `doc(include="...")` on nightly. 36 | 37 | [package.metadata.docs.rs] 38 | features = ["docinclude"] # Activate `docinclude` during docs.rs build. 
39 | --------------------------------------------------------------------------------