├── mini-lsm-book ├── .gitignore ├── .licensesnip ├── src │ ├── mini-lsm-logo.png │ ├── 09-whats-next.md │ ├── copyright.md │ ├── 08-key-compression.md │ ├── 07-bloom-filter.md │ ├── 06-recovery.md │ ├── 05-compaction.md │ ├── 00-get-started.md │ ├── 00-v1.md │ ├── week1-overview.md │ ├── week4-overview.md │ ├── sitemap.txt │ ├── SUMMARY.md │ ├── discord-badge.svg │ ├── lsm-tutorial │ │ ├── week1-01-single.svg │ │ ├── week2-00-two-extremes-1.svg │ │ ├── week1-01-frozen.svg │ │ ├── week1-01-overview.svg │ │ └── week1-02-overview.svg │ ├── 00-overview.md │ ├── week3-07-compaction-filter.md │ ├── week3-04-watermark.md │ └── week2-06-wal.md ├── README.md ├── theme │ └── head.hbs._ ├── sitemap.sh ├── licensesnip.config.jsonc ├── custom.css └── book.toml ├── mini-lsm-starter ├── src │ ├── tests │ │ └── .gitkeep │ ├── tests.rs │ ├── bin │ │ └── wrapper.rs │ ├── lib.rs │ ├── debug.rs │ ├── iterators.rs │ ├── mvcc │ │ ├── watermark.rs │ │ └── txn.rs │ ├── block.rs │ ├── compact │ │ ├── tiered.rs │ │ ├── leveled.rs │ │ └── simple_leveled.rs │ ├── wal.rs │ ├── manifest.rs │ ├── block │ │ ├── builder.rs │ │ └── iterator.rs │ ├── iterators │ │ ├── two_merge_iterator.rs │ │ ├── concat_iterator.rs │ │ └── merge_iterator.rs │ ├── mvcc.rs │ ├── table │ │ ├── builder.rs │ │ ├── iterator.rs │ │ └── bloom.rs │ └── lsm_iterator.rs ├── README.md └── Cargo.toml ├── mini-lsm ├── src │ ├── debug.rs │ ├── mvcc.rs │ ├── mvcc │ │ ├── txn.rs │ │ └── watermark.rs │ ├── bin │ │ ├── mini-lsm-cli.rs │ │ ├── compaction-simulator.rs │ │ └── wrapper.rs │ ├── tests.rs │ ├── lib.rs │ ├── iterators.rs │ ├── tests │ │ ├── week2_day2.rs │ │ ├── week2_day4.rs │ │ ├── week2_day3.rs │ │ ├── week2_day6.rs │ │ └── week1_day7.rs │ ├── block.rs │ ├── iterators │ │ ├── two_merge_iterator.rs │ │ └── concat_iterator.rs │ ├── manifest.rs │ ├── block │ │ └── builder.rs │ ├── wal.rs │ └── table │ │ ├── iterator.rs │ │ ├── bloom.rs │ │ └── builder.rs ├── README.md └── Cargo.toml ├── mini-lsm-mvcc ├── src │ ├── debug.rs 
│ ├── tests │ │ ├── harness.rs │ │ ├── week1_day1.rs │ │ ├── week1_day2.rs │ │ ├── week1_day3.rs │ │ ├── week1_day4.rs │ │ ├── week1_day5.rs │ │ ├── week1_day6.rs │ │ ├── week1_day7.rs │ │ ├── week2_day1.rs │ │ ├── week2_day2.rs │ │ ├── week2_day3.rs │ │ ├── week2_day4.rs │ │ ├── week2_day5.rs │ │ ├── week2_day6.rs │ │ ├── week3_day1.rs │ │ ├── week3_day2.rs │ │ ├── week3_day7.rs │ │ └── week3_day5.rs │ ├── bin │ │ ├── mini-lsm-cli.rs │ │ ├── compaction-simulator.rs │ │ └── wrapper.rs │ ├── lib.rs │ ├── tests.rs │ ├── iterators.rs │ ├── mvcc │ │ └── watermark.rs │ ├── block.rs │ ├── mvcc.rs │ ├── iterators │ │ ├── two_merge_iterator.rs │ │ └── concat_iterator.rs │ ├── manifest.rs │ ├── block │ │ └── builder.rs │ └── table │ │ ├── iterator.rs │ │ └── bloom.rs ├── README.md └── Cargo.toml ├── .gitignore ├── .config └── nextest.toml ├── .cargo └── config.toml ├── rust-toolchain.toml ├── licensesnip.config.jsonc ├── rustfmt.toml.nightly ├── xtask └── Cargo.toml ├── Cargo.toml ├── .github └── workflows │ ├── pr.yml │ └── main.yml ├── .licensesnip └── SOLUTIONS.md /mini-lsm-book/.gitignore: -------------------------------------------------------------------------------- 1 | book 2 | -------------------------------------------------------------------------------- /mini-lsm-starter/src/tests/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /mini-lsm/src/debug.rs: -------------------------------------------------------------------------------- 1 | ../../mini-lsm-starter/src/debug.rs -------------------------------------------------------------------------------- /mini-lsm/src/mvcc.rs: -------------------------------------------------------------------------------- 1 | ../../mini-lsm-starter/src/mvcc.rs -------------------------------------------------------------------------------- /mini-lsm-mvcc/src/debug.rs: 
-------------------------------------------------------------------------------- 1 | ../../mini-lsm-starter/src/debug.rs -------------------------------------------------------------------------------- /mini-lsm/src/mvcc/txn.rs: -------------------------------------------------------------------------------- 1 | ../../../mini-lsm-starter/src/mvcc/txn.rs -------------------------------------------------------------------------------- /mini-lsm-mvcc/src/tests/harness.rs: -------------------------------------------------------------------------------- 1 | ../../../mini-lsm/src/tests/harness.rs -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | /target 2 | .vscode/ 3 | sync-tmp/ 4 | mini-lsm.db/ 5 | lsm.db/ 6 | -------------------------------------------------------------------------------- /mini-lsm-mvcc/src/tests/week1_day1.rs: -------------------------------------------------------------------------------- 1 | ../../../mini-lsm/src/tests/week1_day1.rs -------------------------------------------------------------------------------- /mini-lsm-mvcc/src/tests/week1_day2.rs: -------------------------------------------------------------------------------- 1 | ../../../mini-lsm/src/tests/week1_day2.rs -------------------------------------------------------------------------------- /mini-lsm-mvcc/src/tests/week1_day3.rs: -------------------------------------------------------------------------------- 1 | ../../../mini-lsm/src/tests/week1_day3.rs -------------------------------------------------------------------------------- /mini-lsm-mvcc/src/tests/week1_day4.rs: -------------------------------------------------------------------------------- 1 | ../../../mini-lsm/src/tests/week1_day4.rs -------------------------------------------------------------------------------- /mini-lsm-mvcc/src/tests/week1_day5.rs: 
-------------------------------------------------------------------------------- 1 | ../../../mini-lsm/src/tests/week1_day5.rs -------------------------------------------------------------------------------- /mini-lsm-mvcc/src/tests/week1_day6.rs: -------------------------------------------------------------------------------- 1 | ../../../mini-lsm/src/tests/week1_day6.rs -------------------------------------------------------------------------------- /mini-lsm-mvcc/src/tests/week1_day7.rs: -------------------------------------------------------------------------------- 1 | ../../../mini-lsm/src/tests/week1_day7.rs -------------------------------------------------------------------------------- /mini-lsm-mvcc/src/tests/week2_day1.rs: -------------------------------------------------------------------------------- 1 | ../../../mini-lsm/src/tests/week2_day1.rs -------------------------------------------------------------------------------- /mini-lsm-mvcc/src/tests/week2_day2.rs: -------------------------------------------------------------------------------- 1 | ../../../mini-lsm/src/tests/week2_day2.rs -------------------------------------------------------------------------------- /mini-lsm-mvcc/src/tests/week2_day3.rs: -------------------------------------------------------------------------------- 1 | ../../../mini-lsm/src/tests/week2_day3.rs -------------------------------------------------------------------------------- /mini-lsm-mvcc/src/tests/week2_day4.rs: -------------------------------------------------------------------------------- 1 | ../../../mini-lsm/src/tests/week2_day4.rs -------------------------------------------------------------------------------- /mini-lsm-mvcc/src/tests/week2_day5.rs: -------------------------------------------------------------------------------- 1 | ../../../mini-lsm/src/tests/week2_day5.rs -------------------------------------------------------------------------------- /mini-lsm-mvcc/src/tests/week2_day6.rs: 
-------------------------------------------------------------------------------- 1 | ../../../mini-lsm/src/tests/week2_day6.rs -------------------------------------------------------------------------------- /mini-lsm/src/mvcc/watermark.rs: -------------------------------------------------------------------------------- 1 | ../../../mini-lsm-starter/src/mvcc/watermark.rs -------------------------------------------------------------------------------- /mini-lsm/src/bin/mini-lsm-cli.rs: -------------------------------------------------------------------------------- 1 | ../../../mini-lsm-starter/src/bin/mini-lsm-cli.rs -------------------------------------------------------------------------------- /mini-lsm-mvcc/src/bin/mini-lsm-cli.rs: -------------------------------------------------------------------------------- 1 | ../../../mini-lsm-starter/src/bin/mini-lsm-cli.rs -------------------------------------------------------------------------------- /mini-lsm-starter/README.md: -------------------------------------------------------------------------------- 1 | # mini-lsm-starter 2 | 3 | Starter code for Mini-LSM. 
4 | -------------------------------------------------------------------------------- /mini-lsm/src/bin/compaction-simulator.rs: -------------------------------------------------------------------------------- 1 | ../../../mini-lsm-starter/src/bin/compaction-simulator.rs -------------------------------------------------------------------------------- /.config/nextest.toml: -------------------------------------------------------------------------------- 1 | [profile.default] 2 | slow-timeout = { period = "10s", terminate-after = 3 } 3 | -------------------------------------------------------------------------------- /mini-lsm-mvcc/src/bin/compaction-simulator.rs: -------------------------------------------------------------------------------- 1 | ../../../mini-lsm-starter/src/bin/compaction-simulator.rs -------------------------------------------------------------------------------- /mini-lsm/README.md: -------------------------------------------------------------------------------- 1 | # Week 2 Solution 2 | 3 | This is the solution for Mini-LSM week 1 + week 2. 
4 | -------------------------------------------------------------------------------- /mini-lsm-book/.licensesnip: -------------------------------------------------------------------------------- 1 | mini-lsm-book © 2022-2025 by Alex Chi Z is licensed under CC BY-NC-SA 4.0 2 | -------------------------------------------------------------------------------- /.cargo/config.toml: -------------------------------------------------------------------------------- 1 | [alias] 2 | xtask = "run --package mini-lsm-xtask --" 3 | x = "run --package mini-lsm-xtask --" 4 | -------------------------------------------------------------------------------- /mini-lsm-book/src/mini-lsm-logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/skyzh/mini-lsm/HEAD/mini-lsm-book/src/mini-lsm-logo.png -------------------------------------------------------------------------------- /rust-toolchain.toml: -------------------------------------------------------------------------------- 1 | [toolchain] 2 | channel = "stable" 3 | components = [ "rustfmt", "clippy" ] 4 | profile = "minimal" 5 | -------------------------------------------------------------------------------- /mini-lsm-book/README.md: -------------------------------------------------------------------------------- 1 | # mini-lsm-book 2 | 3 | The mini-lsm course in mdbook. This part of the repo is licensed under CC BY-NC-SA 4.0. 
4 | -------------------------------------------------------------------------------- /licensesnip.config.jsonc: -------------------------------------------------------------------------------- 1 | { 2 | "use_gitignore": true, 3 | "file_types": { 4 | "rs": { 5 | "before_line": "// " 6 | } 7 | } 8 | } -------------------------------------------------------------------------------- /mini-lsm-book/theme/head.hbs._: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /mini-lsm-book/sitemap.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | mdbook build 4 | sscli -b https://skyzh.github.io/mini-lsm -r book -f xml -o > src/sitemap.xml 5 | sscli -b https://skyzh.github.io/mini-lsm -r book -f txt -o > src/sitemap.txt 6 | -------------------------------------------------------------------------------- /mini-lsm-book/licensesnip.config.jsonc: -------------------------------------------------------------------------------- 1 | { 2 | "use_gitignore": true, 3 | "file_types": { 4 | "md": { 5 | "before_block": "" 8 | } 9 | } 10 | } -------------------------------------------------------------------------------- /mini-lsm-book/custom.css: -------------------------------------------------------------------------------- 1 | .content img { 2 | margin-left: auto; 3 | margin-right: auto; 4 | display: block; 5 | } 6 | 7 | .caption { 8 | text-align: center; 9 | font-size: smaller; 10 | color: gray; 11 | } 12 | -------------------------------------------------------------------------------- /mini-lsm-book/src/09-whats-next.md: -------------------------------------------------------------------------------- 1 | 4 | 5 | # What's Next 6 | 7 | We did not finish this chapter as part of Mini-LSM v1. 
8 | 9 | {{#include copyright.md}} 10 | -------------------------------------------------------------------------------- /mini-lsm-book/book.toml: -------------------------------------------------------------------------------- 1 | [book] 2 | authors = ["Alex Chi Z"] 3 | language = "en" 4 | multilingual = false 5 | src = "src" 6 | title = "LSM in a Week" 7 | 8 | [preprocessor.toc] 9 | command = "mdbook-toc" 10 | renderer = ["html"] 11 | 12 | [output.html] 13 | additional-css = ["custom.css"] 14 | git-repository-url = "https://github.com/skyzh/mini-lsm" 15 | -------------------------------------------------------------------------------- /rustfmt.toml.nightly: -------------------------------------------------------------------------------- 1 | comment_width = 120 2 | format_code_in_doc_comments = true 3 | format_macro_bodies = true 4 | format_macro_matchers = true 5 | normalize_comments = true 6 | normalize_doc_attributes = true 7 | imports_granularity = "Module" 8 | group_imports = "StdExternalCrate" 9 | reorder_impl_items = true 10 | reorder_imports = true 11 | tab_spaces = 4 12 | wrap_comments = true -------------------------------------------------------------------------------- /xtask/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "mini-lsm-xtask" 3 | version = { workspace = true } 4 | edition = { workspace = true } 5 | homepage = { workspace = true } 6 | keywords = { workspace = true } 7 | license = { workspace = true } 8 | repository = { workspace = true } 9 | publish = false 10 | 11 | [dependencies] 12 | anyhow = "1" 13 | console = "0.15" 14 | clap = { version = "4", features = ["derive"] } 15 | duct = "0.13" 16 | -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [workspace] 2 | members = ["mini-lsm", "xtask", "mini-lsm-starter", "mini-lsm-mvcc"] 3 | resolver = 
"3" 4 | 5 | [workspace.package] 6 | version = "0.2.0" 7 | edition = "2024" 8 | homepage = "https://github.com/skyzh/mini-lsm" 9 | keywords = ["storage", "database", "tutorial", "course"] 10 | license = "Apache-2.0" 11 | repository = "https://github.com/skyzh/mini-lsm" 12 | 13 | [workspace.dependencies] 14 | anyhow = "1" 15 | bytes = "1" 16 | -------------------------------------------------------------------------------- /mini-lsm-book/src/copyright.md: -------------------------------------------------------------------------------- 1 | 4 | 5 |

Your feedback is greatly appreciated. You are welcome to join our Discord Community.
Found an issue? Create an issue / pull request on github.com/skyzh/mini-lsm.
mini-lsm-book © 2022-2025 by Alex Chi Z is licensed under CC BY-NC-SA 4.0.

6 | -------------------------------------------------------------------------------- /.github/workflows/pr.yml: -------------------------------------------------------------------------------- 1 | name: CI (pull request) 2 | 3 | on: 4 | pull_request: 5 | branches: 6 | - main 7 | 8 | env: 9 | CARGO_TERM_COLOR: always 10 | 11 | jobs: 12 | build: 13 | runs-on: ubuntu-22.04 14 | steps: 15 | - uses: actions/checkout@v2 16 | - name: setup rust toolchain 17 | run: rustup update && rustup toolchain install 18 | - uses: taiki-e/install-action@nextest 19 | - uses: taiki-e/install-action@mdbook 20 | - name: check and build 21 | run: cargo x ci 22 | -------------------------------------------------------------------------------- /mini-lsm-book/src/08-key-compression.md: -------------------------------------------------------------------------------- 1 | 4 | 5 | # Key Compression 6 | 7 |
8 | 9 | This is a legacy version of the Mini-LSM course and we will not maintain it anymore. We now have a better version of this course 10 | and this chapter is now part of [Mini LSM Week 1 Day 7: SST Optimizations](./week1-07-sst-optimizations.md). 11 | 12 |
13 | 14 | We did not finish this chapter as part of Mini-LSM v1. 15 | 16 | {{#include copyright.md}} 17 | -------------------------------------------------------------------------------- /mini-lsm-book/src/07-bloom-filter.md: -------------------------------------------------------------------------------- 1 | 4 | 5 | # Bloom Filters 6 | 7 | 8 |
9 | 10 | This is a legacy version of the Mini-LSM course and we will not maintain it anymore. We now have a better version of this course 11 | and this chapter is now part of [Mini LSM Week 1 Day 7: SST Optimizations](./week1-07-sst-optimizations.md). 12 | 13 |
14 | 15 | We did not finish this chapter as part of Mini-LSM v1. 16 | 17 | {{#include copyright.md}} 18 | -------------------------------------------------------------------------------- /.licensesnip: -------------------------------------------------------------------------------- 1 | Copyright (c) 2022-%YEAR% Alex Chi Z 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | http://www.apache.org/licenses/LICENSE-2.0 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 14 | -------------------------------------------------------------------------------- /mini-lsm-book/src/06-recovery.md: -------------------------------------------------------------------------------- 1 | 4 | 5 | # Write-Ahead Log for Recovery 6 | 7 |
8 | 9 | This is a legacy version of the Mini-LSM course and we will not maintain it anymore. We now have a better version of this course 10 | and this chapter is now part of: 11 | 12 | - [Mini-LSM Week 2 Day 5: Manifest](./week2-05-manifest.md) 13 | - [Mini-LSM Week 2 Day 6: Write-Ahead Log (WAL)](./week2-06-wal.md) 14 | 15 |
16 | 17 | We did not finish this chapter as part of Mini-LSM v1. 18 | 19 | {{#include copyright.md}} 20 | -------------------------------------------------------------------------------- /mini-lsm-starter/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "mini-lsm-starter" 3 | version = "0.2.0" 4 | edition = "2024" 5 | publish = false 6 | 7 | [dependencies] 8 | anyhow = "1" 9 | arc-swap = "1" 10 | bytes = "1" 11 | crossbeam-epoch = "0.9" 12 | crossbeam-skiplist = "0.1" 13 | parking_lot = "0.12" 14 | ouroboros = "0.18" 15 | moka = "0.9" 16 | clap = { version = "4.4.17", features = ["derive"] } 17 | rand = "0.8.5" 18 | crossbeam-channel = "0.5.11" 19 | serde_json = { version = "1.0" } 20 | serde = { version = "1.0", features = ["derive"] } 21 | farmhash = "1" 22 | nom = "7.1.3" 23 | rustyline = "13.0.0" 24 | 25 | [dev-dependencies] 26 | tempfile = "3" 27 | -------------------------------------------------------------------------------- /mini-lsm-mvcc/README.md: -------------------------------------------------------------------------------- 1 | 16 | 17 | # Week 3 Solution 18 | 19 | This is the solution of Mini-LSM week 3 with MVCC implementation. 20 | -------------------------------------------------------------------------------- /mini-lsm/src/bin/wrapper.rs: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2022-2025 Alex Chi Z 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | pub mod mini_lsm_wrapper { 16 | pub use mini_lsm::*; 17 | } 18 | 19 | #[allow(dead_code)] 20 | fn main() {} 21 | -------------------------------------------------------------------------------- /mini-lsm-mvcc/src/bin/wrapper.rs: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2022-2025 Alex Chi Z 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | pub mod mini_lsm_wrapper { 16 | pub use mini_lsm_mvcc::*; 17 | } 18 | 19 | #[allow(dead_code)] 20 | fn main() {} 21 | -------------------------------------------------------------------------------- /mini-lsm-starter/src/tests.rs: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2022-2025 Alex Chi Z 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | //! DO NOT MODIFY -- Mini-LSM tests modules 16 | //! This file will be automatically rewritten by the copy-test command. 17 | -------------------------------------------------------------------------------- /mini-lsm-starter/src/bin/wrapper.rs: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2022-2025 Alex Chi Z 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | pub mod mini_lsm_wrapper { 16 | pub use mini_lsm_starter::*; 17 | } 18 | 19 | #[allow(dead_code)] 20 | fn main() {} 21 | -------------------------------------------------------------------------------- /mini-lsm-book/src/05-compaction.md: -------------------------------------------------------------------------------- 1 | 4 | 5 | # Leveled Compaction 6 | 7 | 8 |
9 | 10 | This is a legacy version of the Mini-LSM course and we will not maintain it anymore. We now have a better version of this course 11 | and this chapter is now part of: 12 | 13 | - [Mini-LSM Week 2 Day 1: Compaction Implementation](./week2-01-compaction.md) 14 | - [Mini-LSM Week 2 Day 2: Simple Compaction Strategy](./week2-02-simple.md) 15 | - [Mini-LSM Week 2 Day 3: Tiered Compaction Strategy](./week2-03-tiered.md) 16 | - [Mini-LSM Week 2 Day 4: Leveled Compaction Strategy](./week2-04-leveled.md) 17 | 18 |
19 | 20 | We did not finish this chapter as part of Mini-LSM v1. 21 | 22 | {{#include copyright.md}} 23 | -------------------------------------------------------------------------------- /mini-lsm/src/tests.rs: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2022-2025 Alex Chi Z 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | mod harness; 16 | mod week1_day1; 17 | mod week1_day2; 18 | mod week1_day3; 19 | mod week1_day4; 20 | mod week1_day5; 21 | mod week1_day6; 22 | mod week1_day7; 23 | mod week2_day1; 24 | mod week2_day2; 25 | mod week2_day3; 26 | mod week2_day4; 27 | mod week2_day5; 28 | mod week2_day6; 29 | -------------------------------------------------------------------------------- /mini-lsm/src/lib.rs: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2022-2025 Alex Chi Z 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | pub mod block; 16 | pub mod compact; 17 | pub mod debug; 18 | pub mod iterators; 19 | pub mod key; 20 | pub mod lsm_iterator; 21 | pub mod lsm_storage; 22 | pub mod manifest; 23 | pub mod mem_table; 24 | pub mod mvcc; 25 | pub mod table; 26 | pub mod wal; 27 | 28 | #[cfg(test)] 29 | mod tests; 30 | -------------------------------------------------------------------------------- /mini-lsm-mvcc/src/lib.rs: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2022-2025 Alex Chi Z 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | pub mod block; 16 | pub mod compact; 17 | pub mod debug; 18 | pub mod iterators; 19 | pub mod key; 20 | pub mod lsm_iterator; 21 | pub mod lsm_storage; 22 | pub mod manifest; 23 | pub mod mem_table; 24 | pub mod mvcc; 25 | pub mod table; 26 | pub mod wal; 27 | 28 | #[cfg(test)] 29 | mod tests; 30 | -------------------------------------------------------------------------------- /mini-lsm-starter/src/lib.rs: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2022-2025 Alex Chi Z 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 
5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | pub mod block; 16 | pub mod compact; 17 | pub mod debug; 18 | pub mod iterators; 19 | pub mod key; 20 | pub mod lsm_iterator; 21 | pub mod lsm_storage; 22 | pub mod manifest; 23 | pub mod mem_table; 24 | pub mod mvcc; 25 | pub mod table; 26 | pub mod wal; 27 | 28 | #[cfg(test)] 29 | mod tests; 30 | -------------------------------------------------------------------------------- /mini-lsm-book/src/00-get-started.md: -------------------------------------------------------------------------------- 1 | 4 | 5 | # Environment Setup 6 | 7 | The starter code and reference solution are available at [https://github.com/skyzh/mini-lsm](https://github.com/skyzh/mini-lsm). 8 | 9 | ## Install Rust 10 | 11 | See [https://rustup.rs](https://rustup.rs) for more information. 12 | 13 | ## Clone the repo 14 | 15 | ``` 16 | git clone https://github.com/skyzh/mini-lsm 17 | ``` 18 | 19 | ## Starter code 20 | 21 | ``` 22 | cd mini-lsm/mini-lsm-starter 23 | code . 24 | ``` 25 | 26 | ## Install Tools 27 | 28 | You will need the latest stable Rust to compile this project. The minimum requirement is `1.85`, because the workspace uses the Rust 2024 edition. 29 | 30 | ``` 31 | cargo x install-tools 32 | ``` 33 | 34 | ## Run tests 35 | 36 | ``` 37 | cargo x copy-test --week 1 --day 1 38 | cargo x scheck 39 | ``` 40 | 41 | Now, you can go ahead and start [Week 1: Mini-LSM](./week1-overview.md). 
42 | 43 | {{#include copyright.md}} 44 | -------------------------------------------------------------------------------- /SOLUTIONS.md: -------------------------------------------------------------------------------- 1 | # Mini-LSM Community Solutions 2 | 3 | You can add your solution to this page once you finish any full week of the course. You may include a one-sentence introduction describing what you have done in your solution and any special functionality you have implemented. 4 | 5 | ## Week 1 6 | * [pj/mini-lsm-simple-solution](https://github.com/pjzhong/mini-lsm-solution): A simple solution of Mini-LSM. 7 | 8 | ## Week 2 9 | * [7143192/mini-lsm](https://github.com/7143192/mini-lsm): A solution of mini-lsm that finishes all tasks except the bonus tasks in week 1 and week 2. 10 | 11 | ## Week 3 12 | 13 | * [skyzh/mini-lsm-solution-checkpoint](https://github.com/skyzh/mini-lsm-solution-checkpoint): The author's solution of Mini-LSM. 14 | * [fh/solution](https://github.com/Foreverhighness/mini-lsm/tree/solution): A solution which makes it easy to change the MVCC version, implemented in idiomatic Rust. 15 | * [Duckulus/mini-lsm-solution](https://github.com/Duckulus/mini-lsm-solution): Full implementation of mini-lsm with a commit for each day of the course. 16 | -------------------------------------------------------------------------------- /mini-lsm-mvcc/src/tests.rs: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2022-2025 Alex Chi Z 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | mod harness; 16 | mod week1_day1; 17 | mod week1_day2; 18 | mod week1_day3; 19 | mod week1_day4; 20 | mod week1_day5; 21 | mod week1_day6; 22 | mod week1_day7; 23 | mod week2_day1; 24 | mod week2_day2; 25 | mod week2_day3; 26 | mod week2_day4; 27 | mod week2_day5; 28 | mod week2_day6; 29 | mod week3_day1; 30 | mod week3_day2; 31 | mod week3_day3; 32 | mod week3_day4; 33 | mod week3_day5; 34 | mod week3_day6; 35 | mod week3_day7; 36 | -------------------------------------------------------------------------------- /mini-lsm/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "mini-lsm" 3 | version = { workspace = true } 4 | edition = { workspace = true } 5 | homepage = { workspace = true } 6 | keywords = { workspace = true } 7 | license = { workspace = true } 8 | repository = { workspace = true } 9 | description = "A course for building an LSM tree storage engine in a week." 
10 | 11 | 12 | [dependencies] 13 | anyhow = "1" 14 | arc-swap = "1" 15 | bytes = "1" 16 | crossbeam-epoch = "0.9" 17 | crossbeam-skiplist = "0.1" 18 | parking_lot = "0.12" 19 | ouroboros = "0.18" 20 | moka = "0.9" 21 | clap = { version = "4.4.17", features = ["derive"] } 22 | rand = "0.8.5" 23 | crossbeam-channel = "0.5.11" 24 | serde_json = { version = "1.0" } 25 | serde = { version = "1.0", features = ["derive"] } 26 | farmhash = "1" 27 | crc32fast = "1.3.2" 28 | nom = "7.1.3" 29 | rustyline = "13.0.0" 30 | 31 | [dev-dependencies] 32 | tempfile = "3" 33 | 34 | [[bin]] 35 | name = "mini-lsm-cli-ref" 36 | path = "src/bin/mini-lsm-cli.rs" 37 | 38 | [[bin]] 39 | name = "mini-lsm-wrapper-ref" 40 | path = "src/bin/wrapper.rs" 41 | 42 | [[bin]] 43 | name = "compaction-simulator-ref" 44 | path = "src/bin/compaction-simulator.rs" 45 | -------------------------------------------------------------------------------- /.github/workflows/main.yml: -------------------------------------------------------------------------------- 1 | name: CI (main) 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | 8 | env: 9 | CARGO_TERM_COLOR: always 10 | 11 | jobs: 12 | build: 13 | runs-on: ubuntu-22.04 14 | steps: 15 | - uses: actions/checkout@v2 16 | - name: setup rust toolchain 17 | run: rustup update && rustup toolchain install 18 | - uses: taiki-e/install-action@nextest 19 | - uses: taiki-e/install-action@mdbook 20 | - name: patch for gh-pages build 21 | run: mv mini-lsm-book/theme/head.hbs._ mini-lsm-book/theme/head.hbs 22 | - name: check and build 23 | run: cargo x ci 24 | - uses: actions/upload-pages-artifact@v3 25 | with: 26 | path: mini-lsm-book/book 27 | 28 | deploy: 29 | needs: build 30 | permissions: 31 | pages: write 32 | id-token: write 33 | environment: 34 | name: github-pages 35 | url: ${{ steps.deployment.outputs.page_url }} 36 | runs-on: ubuntu-22.04 37 | if: github.repository == 'skyzh/mini-lsm' 38 | steps: 39 | - name: Deploy to GitHub Pages 40 | id: deployment 41 | uses: 
actions/deploy-pages@v4 42 | -------------------------------------------------------------------------------- /mini-lsm-mvcc/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "mini-lsm-mvcc" 3 | version = { workspace = true } 4 | edition = { workspace = true } 5 | homepage = { workspace = true } 6 | keywords = { workspace = true } 7 | license = { workspace = true } 8 | repository = { workspace = true } 9 | description = "A course for building an LSM tree storage engine in a week." 10 | 11 | 12 | [dependencies] 13 | anyhow = "1" 14 | arc-swap = "1" 15 | bytes = "1" 16 | crossbeam-epoch = "0.9" 17 | crossbeam-skiplist = "0.1" 18 | parking_lot = "0.12" 19 | ouroboros = "0.18" 20 | moka = "0.9" 21 | clap = { version = "4.4.17", features = ["derive"] } 22 | rand = "0.8.5" 23 | crossbeam-channel = "0.5.11" 24 | serde_json = { version = "1.0" } 25 | serde = { version = "1.0", features = ["derive"] } 26 | farmhash = "1" 27 | crc32fast = "1.3.2" 28 | nom = "7.1.3" 29 | rustyline = "13.0.0" 30 | 31 | [dev-dependencies] 32 | tempfile = "3" 33 | 34 | [[bin]] 35 | name = "mini-lsm-cli-mvcc-ref" 36 | path = "src/bin/mini-lsm-cli.rs" 37 | 38 | [[bin]] 39 | name = "mini-lsm-wrapper-mvcc-ref" 40 | path = "src/bin/wrapper.rs" 41 | 42 | [[bin]] 43 | name = "compaction-simulator-mvcc-ref" 44 | path = "src/bin/compaction-simulator.rs" 45 | -------------------------------------------------------------------------------- /mini-lsm-starter/src/debug.rs: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2022-2025 Alex Chi Z 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 
5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | use crate::lsm_storage::{LsmStorageInner, MiniLsm}; 16 | 17 | impl LsmStorageInner { 18 | pub fn dump_structure(&self) { 19 | let snapshot = self.state.read(); 20 | if !snapshot.l0_sstables.is_empty() { 21 | println!( 22 | "L0 ({}): {:?}", 23 | snapshot.l0_sstables.len(), 24 | snapshot.l0_sstables, 25 | ); 26 | } 27 | for (level, files) in &snapshot.levels { 28 | println!("L{level} ({}): {:?}", files.len(), files); 29 | } 30 | } 31 | } 32 | 33 | impl MiniLsm { 34 | pub fn dump_structure(&self) { 35 | self.inner.dump_structure() 36 | } 37 | } 38 | -------------------------------------------------------------------------------- /mini-lsm/src/iterators.rs: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2022-2025 Alex Chi Z 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | 15 | pub mod concat_iterator; 16 | pub mod merge_iterator; 17 | pub mod two_merge_iterator; 18 | 19 | pub trait StorageIterator { 20 | type KeyType<'a>: PartialEq + Eq + PartialOrd + Ord 21 | where 22 | Self: 'a; 23 | 24 | /// Get the current value. 25 | fn value(&self) -> &[u8]; 26 | 27 | /// Get the current key. 28 | fn key(&self) -> Self::KeyType<'_>; 29 | 30 | /// Check if the current iterator is valid. 31 | fn is_valid(&self) -> bool; 32 | 33 | /// Move to the next position. 34 | fn next(&mut self) -> anyhow::Result<()>; 35 | 36 | /// Number of underlying active iterators for this iterator. 37 | fn num_active_iterators(&self) -> usize { 38 | 1 39 | } 40 | } 41 | -------------------------------------------------------------------------------- /mini-lsm-mvcc/src/iterators.rs: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2022-2025 Alex Chi Z 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | pub mod concat_iterator; 16 | pub mod merge_iterator; 17 | pub mod two_merge_iterator; 18 | 19 | pub trait StorageIterator { 20 | type KeyType<'a>: PartialEq + Eq + PartialOrd + Ord 21 | where 22 | Self: 'a; 23 | 24 | /// Get the current value. 25 | fn value(&self) -> &[u8]; 26 | 27 | /// Get the current key. 28 | fn key(&self) -> Self::KeyType<'_>; 29 | 30 | /// Check if the current iterator is valid. 
31 | fn is_valid(&self) -> bool; 32 | 33 | /// Move to the next position. 34 | fn next(&mut self) -> anyhow::Result<()>; 35 | 36 | /// Number of underlying active iterators for this iterator. 37 | fn num_active_iterators(&self) -> usize { 38 | 1 39 | } 40 | } 41 | -------------------------------------------------------------------------------- /mini-lsm-starter/src/iterators.rs: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2022-2025 Alex Chi Z 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | pub mod concat_iterator; 16 | pub mod merge_iterator; 17 | pub mod two_merge_iterator; 18 | 19 | pub trait StorageIterator { 20 | type KeyType<'a>: PartialEq + Eq + PartialOrd + Ord 21 | where 22 | Self: 'a; 23 | 24 | /// Get the current value. 25 | fn value(&self) -> &[u8]; 26 | 27 | /// Get the current key. 28 | fn key(&self) -> Self::KeyType<'_>; 29 | 30 | /// Check if the current iterator is valid. 31 | fn is_valid(&self) -> bool; 32 | 33 | /// Move to the next position. 34 | fn next(&mut self) -> anyhow::Result<()>; 35 | 36 | /// Number of underlying active iterators for this iterator. 
37 | fn num_active_iterators(&self) -> usize { 38 | 1 39 | } 40 | } 41 | -------------------------------------------------------------------------------- /mini-lsm/src/tests/week2_day2.rs: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2022-2025 Alex Chi Z 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | use tempfile::tempdir; 16 | 17 | use crate::{ 18 | compact::{CompactionOptions, SimpleLeveledCompactionOptions}, 19 | lsm_storage::{LsmStorageOptions, MiniLsm}, 20 | }; 21 | 22 | use super::harness::{check_compaction_ratio, compaction_bench}; 23 | 24 | #[test] 25 | fn test_integration() { 26 | let dir = tempdir().unwrap(); 27 | let storage = MiniLsm::open( 28 | &dir, 29 | LsmStorageOptions::default_for_week2_test(CompactionOptions::Simple( 30 | SimpleLeveledCompactionOptions { 31 | level0_file_num_compaction_trigger: 2, 32 | max_levels: 3, 33 | size_ratio_percent: 200, 34 | }, 35 | )), 36 | ) 37 | .unwrap(); 38 | 39 | compaction_bench(storage.clone()); 40 | check_compaction_ratio(storage.clone()); 41 | } 42 | -------------------------------------------------------------------------------- /mini-lsm-starter/src/mvcc/watermark.rs: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2022-2025 Alex Chi Z 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this 
file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | #![allow(unused_variables)] // TODO(you): remove this lint after implementing this mod 16 | #![allow(dead_code)] // TODO(you): remove this lint after implementing this mod 17 | 18 | use std::collections::BTreeMap; 19 | 20 | pub struct Watermark { 21 | readers: BTreeMap<u64, usize>, 22 | } 23 | 24 | impl Default for Watermark { 25 | fn default() -> Self { 26 | Self::new() 27 | } 28 | } 29 | 30 | impl Watermark { 31 | pub fn new() -> Self { 32 | Self { 33 | readers: BTreeMap::new(), 34 | } 35 | } 36 | 37 | pub fn add_reader(&mut self, ts: u64) {} 38 | 39 | pub fn remove_reader(&mut self, ts: u64) {} 40 | 41 | pub fn num_retained_snapshots(&self) -> usize { 42 | self.readers.len() 43 | } 44 | 45 | pub fn watermark(&self) -> Option<u64> { 46 | Some(0) 47 | } 48 | } 49 | -------------------------------------------------------------------------------- /mini-lsm/src/tests/week2_day4.rs: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2022-2025 Alex Chi Z 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | use tempfile::tempdir; 16 | 17 | use crate::{ 18 | compact::{CompactionOptions, LeveledCompactionOptions}, 19 | lsm_storage::{LsmStorageOptions, MiniLsm}, 20 | }; 21 | 22 | use super::harness::{check_compaction_ratio, compaction_bench}; 23 | 24 | #[test] 25 | fn test_integration() { 26 | let dir = tempdir().unwrap(); 27 | let storage = MiniLsm::open( 28 | &dir, 29 | LsmStorageOptions::default_for_week2_test(CompactionOptions::Leveled( 30 | LeveledCompactionOptions { 31 | level0_file_num_compaction_trigger: 2, 32 | level_size_multiplier: 2, 33 | base_level_size_mb: 1, 34 | max_levels: 4, 35 | }, 36 | )), 37 | ) 38 | .unwrap(); 39 | 40 | compaction_bench(storage.clone()); 41 | check_compaction_ratio(storage.clone()); 42 | } 43 | -------------------------------------------------------------------------------- /mini-lsm/src/tests/week2_day3.rs: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2022-2025 Alex Chi Z 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | 15 | use tempfile::tempdir; 16 | 17 | use crate::{ 18 | compact::{CompactionOptions, TieredCompactionOptions}, 19 | lsm_storage::{LsmStorageOptions, MiniLsm}, 20 | }; 21 | 22 | use super::harness::{check_compaction_ratio, compaction_bench}; 23 | 24 | #[test] 25 | fn test_integration() { 26 | let dir = tempdir().unwrap(); 27 | let storage = MiniLsm::open( 28 | &dir, 29 | LsmStorageOptions::default_for_week2_test(CompactionOptions::Tiered( 30 | TieredCompactionOptions { 31 | num_tiers: 3, 32 | max_size_amplification_percent: 200, 33 | size_ratio: 1, 34 | min_merge_width: 2, 35 | max_merge_width: None, 36 | }, 37 | )), 38 | ) 39 | .unwrap(); 40 | 41 | compaction_bench(storage.clone()); 42 | check_compaction_ratio(storage.clone()); 43 | } 44 | -------------------------------------------------------------------------------- /mini-lsm-mvcc/src/mvcc/watermark.rs: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2022-2025 Alex Chi Z 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | 15 | use std::collections::BTreeMap; 16 | 17 | pub struct Watermark { 18 | readers: BTreeMap<u64, usize>, 19 | } 20 | 21 | impl Default for Watermark { 22 | fn default() -> Self { 23 | Self::new() 24 | } 25 | } 26 | 27 | impl Watermark { 28 | pub fn new() -> Self { 29 | Self { 30 | readers: BTreeMap::new(), 31 | } 32 | } 33 | 34 | pub fn add_reader(&mut self, ts: u64) { 35 | *self.readers.entry(ts).or_default() += 1; 36 | } 37 | 38 | pub fn remove_reader(&mut self, ts: u64) { 39 | let cnt = self.readers.get_mut(&ts).unwrap(); 40 | *cnt -= 1; 41 | if *cnt == 0 { 42 | self.readers.remove(&ts); 43 | } 44 | } 45 | 46 | pub fn num_retained_snapshots(&self) -> usize { 47 | self.readers.len() 48 | } 49 | 50 | pub fn watermark(&self) -> Option<u64> { 51 | self.readers.first_key_value().map(|(ts, _)| *ts) 52 | } 53 | } 54 | -------------------------------------------------------------------------------- /mini-lsm-starter/src/block.rs: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2022-2025 Alex Chi Z 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | 15 | #![allow(unused_variables)] // TODO(you): remove this lint after implementing this mod 16 | #![allow(dead_code)] // TODO(you): remove this lint after implementing this mod 17 | 18 | mod builder; 19 | mod iterator; 20 | 21 | pub use builder::BlockBuilder; 22 | use bytes::Bytes; 23 | pub use iterator::BlockIterator; 24 | 25 | /// A block is the smallest unit of read and caching in LSM tree. It is a collection of sorted key-value pairs. 26 | pub struct Block { 27 | pub(crate) data: Vec<u8>, 28 | pub(crate) offsets: Vec<u16>, 29 | } 30 | 31 | impl Block { 32 | /// Encode the internal data to the data layout illustrated in the course 33 | /// Note: You may want to recheck if any of the expected field is missing from your output 34 | pub fn encode(&self) -> Bytes { 35 | unimplemented!() 36 | } 37 | 38 | /// Decode from the data layout, transform the input `data` to a single `Block` 39 | pub fn decode(data: &[u8]) -> Self { 40 | unimplemented!() 41 | } 42 | } 43 | -------------------------------------------------------------------------------- /mini-lsm-starter/src/compact/tiered.rs: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2022-2025 Alex Chi Z 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | 15 | use serde::{Deserialize, Serialize}; 16 | 17 | use crate::lsm_storage::LsmStorageState; 18 | 19 | #[derive(Debug, Serialize, Deserialize)] 20 | pub struct TieredCompactionTask { 21 | pub tiers: Vec<(usize, Vec<usize>)>, 22 | pub bottom_tier_included: bool, 23 | } 24 | 25 | #[derive(Debug, Clone)] 26 | pub struct TieredCompactionOptions { 27 | pub num_tiers: usize, 28 | pub max_size_amplification_percent: usize, 29 | pub size_ratio: usize, 30 | pub min_merge_width: usize, 31 | pub max_merge_width: Option<usize>, 32 | } 33 | 34 | pub struct TieredCompactionController { 35 | options: TieredCompactionOptions, 36 | } 37 | 38 | impl TieredCompactionController { 39 | pub fn new(options: TieredCompactionOptions) -> Self { 40 | Self { options } 41 | } 42 | 43 | pub fn generate_compaction_task( 44 | &self, 45 | _snapshot: &LsmStorageState, 46 | ) -> Option<TieredCompactionTask> { 47 | unimplemented!() 48 | } 49 | 50 | pub fn apply_compaction_result( 51 | &self, 52 | _snapshot: &LsmStorageState, 53 | _task: &TieredCompactionTask, 54 | _output: &[usize], 55 | ) -> (LsmStorageState, Vec<usize>) { 56 | unimplemented!() 57 | } 58 | } 59 | -------------------------------------------------------------------------------- /mini-lsm-book/src/00-v1.md: -------------------------------------------------------------------------------- 1 | 4 | 5 | # Mini-LSM v1 6 | 7 | This is a legacy version of the Mini-LSM course and we will not maintain it anymore. We now have a new version of this course. We keep the legacy version in this book so that the search engine can keep the pages in the index and users can follow the links to the new version of the course. 8 | 9 | ## V1 Course Overview 10 | 11 | ![Course Overview](legacy-lsm-tutorial/00-lsm-course-overview.svg) 12 | 13 | In this course, we will build the LSM tree structure in 7 days: 14 | 15 | * Day 1: Block encoding. SSTs are composed of multiple data blocks. We will implement the block encoding. 16 | * Day 2: SST encoding. 17 | * Day 3: MemTable and Merge Iterators. 
18 | * Day 4: Block cache and Engine. To reduce disk I/O and maximize performance, we will use moka-rs to build a block cache 19 | for the LSM tree. On this day we will get a functional (but not persistent) key-value engine with `get`, `put`, `scan`, 20 | `delete` API. 21 | * Day 5: Compaction. Now it's time to maintain a leveled structure for SSTs. 22 | * Day 6: Recovery. We will implement WAL and manifest so that the engine can recover after restart. 23 | * Day 7: Bloom filter and key compression. They are widely-used optimizations in LSM tree structures. 24 | 25 | ## Development Guide 26 | 27 | We provide you with starter code (see `mini-lsm-starter` crate), where we simply replace all function bodies with 28 | `unimplemented!()`. You can start your project based on this starter code. We provide test cases, but they are very 29 | simple. We recommend that you think carefully about your implementation and write test cases yourself. 30 | 31 | * You can use `cargo x scheck` to run all test cases and do style check in your codebase. 32 | * You can use `cargo x copy-test dayX` to copy test cases to the starter code. 33 | 34 | {{#include copyright.md}} 35 | -------------------------------------------------------------------------------- /mini-lsm-starter/src/wal.rs: -------------------------------------------------------------------------------- 1 | // REMOVE THIS LINE after fully implementing this functionality 2 | // Copyright (c) 2022-2025 Alex Chi Z 3 | // 4 | // Licensed under the Apache License, Version 2.0 (the "License"); 5 | // you may not use this file except in compliance with the License. 6 | // You may obtain a copy of the License at 7 | // 8 | // http://www.apache.org/licenses/LICENSE-2.0 9 | // 10 | // Unless required by applicable law or agreed to in writing, software 11 | // distributed under the License is distributed on an "AS IS" BASIS, 12 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
13 | // See the License for the specific language governing permissions and 14 | // limitations under the License. 15 | #![allow(unused_variables)] // TODO(you): remove this lint after implementing this mod 16 | #![allow(dead_code)] // TODO(you): remove this lint after implementing this mod 17 | 18 | use anyhow::Result; 19 | use bytes::Bytes; 20 | use crossbeam_skiplist::SkipMap; 21 | use parking_lot::Mutex; 22 | use std::fs::File; 23 | use std::io::BufWriter; 24 | use std::path::Path; 25 | use std::sync::Arc; 26 | 27 | use crate::key::KeySlice; 28 | 29 | pub struct Wal { 30 | file: Arc<Mutex<BufWriter<File>>>, 31 | } 32 | 33 | impl Wal { 34 | pub fn create(_path: impl AsRef<Path>) -> Result<Self> { 35 | unimplemented!() 36 | } 37 | 38 | pub fn recover(_path: impl AsRef<Path>, _skiplist: &SkipMap<Bytes, Bytes>) -> Result<Self> { 39 | unimplemented!() 40 | } 41 | 42 | pub fn put(&self, _key: &[u8], _value: &[u8]) -> Result<()> { 43 | unimplemented!() 44 | } 45 | 46 | /// Implement this in week 3, day 5; if you want to implement this earlier, use `&[u8]` as the key type. 47 | pub fn put_batch(&self, _data: &[(KeySlice, &[u8])]) -> Result<()> { 48 | unimplemented!() 49 | } 50 | 51 | pub fn sync(&self) -> Result<()> { 52 | unimplemented!() 53 | } 54 | } 55 | -------------------------------------------------------------------------------- /mini-lsm-starter/src/manifest.rs: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2022-2025 Alex Chi Z 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | #![allow(unused_variables)] // TODO(you): remove this lint after implementing this mod 16 | #![allow(dead_code)] // TODO(you): remove this lint after implementing this mod 17 | 18 | use std::fs::File; 19 | use std::path::Path; 20 | use std::sync::Arc; 21 | 22 | use anyhow::Result; 23 | use parking_lot::{Mutex, MutexGuard}; 24 | use serde::{Deserialize, Serialize}; 25 | 26 | use crate::compact::CompactionTask; 27 | 28 | pub struct Manifest { 29 | file: Arc<Mutex<File>>, 30 | } 31 | 32 | #[derive(Serialize, Deserialize)] 33 | pub enum ManifestRecord { 34 | Flush(usize), 35 | NewMemtable(usize), 36 | Compaction(CompactionTask, Vec<usize>), 37 | } 38 | 39 | impl Manifest { 40 | pub fn create(_path: impl AsRef<Path>) -> Result<Self> { 41 | unimplemented!() 42 | } 43 | 44 | pub fn recover(_path: impl AsRef<Path>) -> Result<(Self, Vec<ManifestRecord>)> { 45 | unimplemented!() 46 | } 47 | 48 | pub fn add_record( 49 | &self, 50 | _state_lock_observer: &MutexGuard<()>, 51 | record: ManifestRecord, 52 | ) -> Result<()> { 53 | self.add_record_when_init(record) 54 | } 55 | 56 | pub fn add_record_when_init(&self, _record: ManifestRecord) -> Result<()> { 57 | unimplemented!() 58 | } 59 | } 60 | -------------------------------------------------------------------------------- /mini-lsm-book/src/week1-overview.md: -------------------------------------------------------------------------------- 1 | 4 | 5 | # Week 1 Overview: Mini-LSM 6 | 7 | ![Chapter Overview](./lsm-tutorial/week1-overview.svg) 8 | 9 | In the first week of the course, you will build necessary storage formats for the storage engine, the read path and the write path of the system, and have a working implementation of an LSM-based key-value store. There are 7 chapters (days) for this part. 10 | 11 | * [Day 1: Memtable](./week1-01-memtable.md). You will implement the in-memory read and write path of the system. 
12 | * [Day 2: Merge Iterator](./week1-02-merge-iterator.md). You will extend what you have built in day 1 and implement a `scan` interface for your system. 13 | * [Day 3: Block Encoding](./week1-03-block.md). Now we start the first step of the on-disk structure and build the encoding/decoding of the blocks. 14 | * [Day 4: SST Encoding](./week1-04-sst.md). SSTs are composed of blocks and at the end of the day, you will have the basic building blocks of the LSM on-disk structure. 15 | * [Day 5: Read Path](./week1-05-read-path.md). Now that we have both in-memory and on-disk structures, we can combine them together and have a fully-working read path for the storage engine. 16 | * [Day 6: Write Path](./week1-06-write-path.md). In day 5, the test harness generates the structures, and in day 6, you will control the SST flushes by yourself. You will implement flush to level-0 SST and the storage engine is complete. 17 | * [Day 7: SST Optimizations](./week1-07-sst-optimizations.md). We will implement several SST format optimizations and improve the performance of the system. 18 | 19 | At the end of the week, your storage engine should be able to handle all get/scan/put requests. The only missing parts are persisting the LSM state to disk and a more efficient way of organizing the SSTs on the disk. You will have a working **Mini-LSM** storage engine. 20 | 21 | {{#include copyright.md}} 22 | -------------------------------------------------------------------------------- /mini-lsm-starter/src/block/builder.rs: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2022-2025 Alex Chi Z 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 
5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | #![allow(unused_variables)] // TODO(you): remove this lint after implementing this mod 16 | #![allow(dead_code)] // TODO(you): remove this lint after implementing this mod 17 | 18 | use crate::key::{KeySlice, KeyVec}; 19 | 20 | use super::Block; 21 | 22 | /// Builds a block. 23 | pub struct BlockBuilder { 24 | /// Offsets of each key-value entries. 25 | offsets: Vec<u16>, 26 | /// All serialized key-value pairs in the block. 27 | data: Vec<u8>, 28 | /// The expected block size. 29 | block_size: usize, 30 | /// The first key in the block 31 | first_key: KeyVec, 32 | } 33 | 34 | impl BlockBuilder { 35 | /// Creates a new block builder. 36 | pub fn new(block_size: usize) -> Self { 37 | unimplemented!() 38 | } 39 | 40 | /// Adds a key-value pair to the block. Returns false when the block is full. 41 | /// You may find the `bytes::BufMut` trait useful for manipulating binary data. 42 | #[must_use] 43 | pub fn add(&mut self, key: KeySlice, value: &[u8]) -> bool { 44 | unimplemented!() 45 | } 46 | 47 | /// Check if there is no key-value pair in the block. 48 | pub fn is_empty(&self) -> bool { 49 | unimplemented!() 50 | } 51 | 52 | /// Finalize the block. 
53 | pub fn build(self) -> Block { 54 | unimplemented!() 55 | } 56 | } 57 | -------------------------------------------------------------------------------- /mini-lsm-book/src/week4-overview.md: -------------------------------------------------------------------------------- 1 | 4 | 5 | # The Rest of Your Life (TBD) 6 | 7 | This is an advanced part that dives deep into optimizations and applications of the LSM storage engine and will make your implementation more production-ready. We are still planning the content, and this part will not be publicly available in the near future. 8 | 9 | | Week + Chapter | Topic | Solution | Starter Code | Writeup | 10 | | -------------- | ------------------------------------ | -------- | ------------ | ------- | 11 | | 4.1 | Benchmarking | | | | 12 | | 4.2 | Block Compression | | | | 13 | | 4.3 | Trivial Move and Parallel Compaction | | | | 14 | | 4.4 | Alternative Block Encodings | | | | 15 | | 4.5 | Rate Limiter and I/O Optimizations | | | | 16 | | 4.6 | Build Your Own Block Cache | | | | 17 | | 4.7 | Build Your Own SkipList | | | | 18 | | 4.8 | Async Engine | | | | 19 | | 4.9 | IO-uring-based I/O engine | | | | 20 | | 4.10 | Prefetching | | | | 21 | | 4.11 | Key-Value Separation | | | | 22 | | 4.12 | Column Families | | | | 23 | | 4.13 | Sharding | | | | 24 | | 4.14 | Compaction Optimizations | | | | 25 | | 4.15 | SQL over Mini-LSM | | | | 26 | -------------------------------------------------------------------------------- /mini-lsm-starter/src/iterators/two_merge_iterator.rs: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2022-2025 Alex Chi Z 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | #![allow(unused_variables)] // TODO(you): remove this lint after implementing this mod 16 | #![allow(dead_code)] // TODO(you): remove this lint after implementing this mod 17 | 18 | use anyhow::Result; 19 | 20 | use super::StorageIterator; 21 | 22 | /// Merges two iterators of different types into one. If the two iterators have the same key, only 23 | /// produce the key once and prefer the entry from A. 24 | pub struct TwoMergeIterator { 25 | a: A, 26 | b: B, 27 | // Add fields as needed 28 | } 29 | 30 | impl< 31 | A: 'static + StorageIterator, 32 | B: 'static + for<'a> StorageIterator = A::KeyType<'a>>, 33 | > TwoMergeIterator 34 | { 35 | pub fn create(a: A, b: B) -> Result { 36 | unimplemented!() 37 | } 38 | } 39 | 40 | impl< 41 | A: 'static + StorageIterator, 42 | B: 'static + for<'a> StorageIterator = A::KeyType<'a>>, 43 | > StorageIterator for TwoMergeIterator 44 | { 45 | type KeyType<'a> = A::KeyType<'a>; 46 | 47 | fn key(&self) -> Self::KeyType<'_> { 48 | unimplemented!() 49 | } 50 | 51 | fn value(&self) -> &[u8] { 52 | unimplemented!() 53 | } 54 | 55 | fn is_valid(&self) -> bool { 56 | unimplemented!() 57 | } 58 | 59 | fn next(&mut self) -> Result<()> { 60 | unimplemented!() 61 | } 62 | } 63 | -------------------------------------------------------------------------------- /mini-lsm-book/src/sitemap.txt: -------------------------------------------------------------------------------- 1 | https://skyzh.github.io/mini-lsm 2 | https://skyzh.github.io/mini-lsm/00-get-started 3 |
https://skyzh.github.io/mini-lsm/00-overview 4 | https://skyzh.github.io/mini-lsm/00-preface 5 | https://skyzh.github.io/mini-lsm/00-v1 6 | https://skyzh.github.io/mini-lsm/01-block 7 | https://skyzh.github.io/mini-lsm/02-sst 8 | https://skyzh.github.io/mini-lsm/03-memtable 9 | https://skyzh.github.io/mini-lsm/04-engine 10 | https://skyzh.github.io/mini-lsm/05-compaction 11 | https://skyzh.github.io/mini-lsm/06-recovery 12 | https://skyzh.github.io/mini-lsm/07-bloom-filter 13 | https://skyzh.github.io/mini-lsm/08-key-compression 14 | https://skyzh.github.io/mini-lsm/09-whats-next 15 | https://skyzh.github.io/mini-lsm/week1-01-memtable 16 | https://skyzh.github.io/mini-lsm/week1-02-merge-iterator 17 | https://skyzh.github.io/mini-lsm/week1-03-block 18 | https://skyzh.github.io/mini-lsm/week1-04-sst 19 | https://skyzh.github.io/mini-lsm/week1-05-read-path 20 | https://skyzh.github.io/mini-lsm/week1-06-write-path 21 | https://skyzh.github.io/mini-lsm/week1-07-sst-optimizations 22 | https://skyzh.github.io/mini-lsm/week1-overview 23 | https://skyzh.github.io/mini-lsm/week2-01-compaction 24 | https://skyzh.github.io/mini-lsm/week2-02-simple 25 | https://skyzh.github.io/mini-lsm/week2-03-tiered 26 | https://skyzh.github.io/mini-lsm/week2-04-leveled 27 | https://skyzh.github.io/mini-lsm/week2-05-manifest 28 | https://skyzh.github.io/mini-lsm/week2-06-wal 29 | https://skyzh.github.io/mini-lsm/week2-07-snacks 30 | https://skyzh.github.io/mini-lsm/week2-overview 31 | https://skyzh.github.io/mini-lsm/week3-01-ts-key-refactor 32 | https://skyzh.github.io/mini-lsm/week3-02-snapshot-read-part-1 33 | https://skyzh.github.io/mini-lsm/week3-03-snapshot-read-part-2 34 | https://skyzh.github.io/mini-lsm/week3-04-watermark 35 | https://skyzh.github.io/mini-lsm/week3-05-txn-occ 36 | https://skyzh.github.io/mini-lsm/week3-06-serializable 37 | https://skyzh.github.io/mini-lsm/week3-07-compaction-filter 38 | https://skyzh.github.io/mini-lsm/week3-overview 39 | 
https://skyzh.github.io/mini-lsm/week4-overview 40 | -------------------------------------------------------------------------------- /mini-lsm/src/block.rs: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2022-2025 Alex Chi Z 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | mod builder; 16 | mod iterator; 17 | 18 | pub use builder::BlockBuilder; 19 | use bytes::{Buf, BufMut, Bytes}; 20 | pub use iterator::BlockIterator; 21 | 22 | pub(crate) const SIZEOF_U16: usize = std::mem::size_of::(); 23 | 24 | /// A block is the smallest unit of read and caching in LSM tree. It is a collection of sorted 25 | /// key-value pairs. 
26 | pub struct Block { 27 | pub(crate) data: Vec, 28 | pub(crate) offsets: Vec, 29 | } 30 | 31 | impl Block { 32 | pub fn encode(&self) -> Bytes { 33 | let mut buf = self.data.clone(); 34 | let offsets_len = self.offsets.len(); 35 | for offset in &self.offsets { 36 | buf.put_u16(*offset); 37 | } 38 | // Append the number of entries as a trailing u16 at the end of the block; NOTE(review): `as u16` silently truncates if there are more than 65535 offsets 39 | buf.put_u16(offsets_len as u16); 40 | buf.into() 41 | } 42 | 43 | pub fn decode(data: &[u8]) -> Self { 44 | // Read the number of entries from the trailing u16 of the block; this panics if `data` is shorter than SIZEOF_U16 bytes 45 | let entry_offsets_len = (&data[data.len() - SIZEOF_U16..]).get_u16() as usize; 46 | let data_end = data.len() - SIZEOF_U16 - entry_offsets_len * SIZEOF_U16; 47 | let offsets_raw = &data[data_end..data.len() - SIZEOF_U16]; 48 | // Decode the offset array: one u16 (big-endian, per `Buf::get_u16`) from each SIZEOF_U16-byte chunk 49 | let offsets = offsets_raw 50 | .chunks(SIZEOF_U16) 51 | .map(|mut x| x.get_u16()) 52 | .collect(); 53 | // Everything before the offset array is the serialized entry data 54 | let data = data[0..data_end].to_vec(); 55 | Self { data, offsets } 56 | } 57 | } 58 | -------------------------------------------------------------------------------- /mini-lsm-mvcc/src/block.rs: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2022-2025 Alex Chi Z 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License.
14 | 15 | mod builder; 16 | mod iterator; 17 | 18 | pub use builder::BlockBuilder; 19 | use bytes::{Buf, BufMut, Bytes}; 20 | pub use iterator::BlockIterator; 21 | 22 | pub(crate) const SIZEOF_U16: usize = std::mem::size_of::(); 23 | 24 | /// A block is the smallest unit of read and caching in an LSM tree. It is a collection of sorted 25 | /// key-value pairs. 26 | pub struct Block { 27 | pub(crate) data: Vec, 28 | pub(crate) offsets: Vec, 29 | } 30 | 31 | impl Block { 32 | pub fn encode(&self) -> Bytes { 33 | let mut buf = self.data.clone(); 34 | let offsets_len = self.offsets.len(); 35 | for offset in &self.offsets { 36 | buf.put_u16(*offset); 37 | } 38 | // Append the number of entries as a trailing u16 at the end of the block; NOTE(review): `as u16` silently truncates if there are more than 65535 offsets 39 | buf.put_u16(offsets_len as u16); 40 | buf.into() 41 | } 42 | 43 | pub fn decode(data: &[u8]) -> Self { 44 | // Read the number of entries from the trailing u16 of the block; this panics if `data` is shorter than SIZEOF_U16 bytes 45 | let entry_offsets_len = (&data[data.len() - SIZEOF_U16..]).get_u16() as usize; 46 | let data_end = data.len() - SIZEOF_U16 - entry_offsets_len * SIZEOF_U16; 47 | let offsets_raw = &data[data_end..data.len() - SIZEOF_U16]; 48 | // Decode the offset array: one u16 (big-endian, per `Buf::get_u16`) from each SIZEOF_U16-byte chunk 49 | let offsets = offsets_raw 50 | .chunks(SIZEOF_U16) 51 | .map(|mut x| x.get_u16()) 52 | .collect(); 53 | // Everything before the offset array is the serialized entry data 54 | let data = data[0..data_end].to_vec(); 55 | Self { data, offsets } 56 | } 57 | } 58 | -------------------------------------------------------------------------------- /mini-lsm-starter/src/iterators/concat_iterator.rs: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2022-2025 Alex Chi Z 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | #![allow(unused_variables)] // TODO(you): remove this lint after implementing this mod 16 | #![allow(dead_code)] // TODO(you): remove this lint after implementing this mod 17 | 18 | use std::sync::Arc; 19 | 20 | use anyhow::Result; 21 | 22 | use super::StorageIterator; 23 | use crate::{ 24 | key::KeySlice, 25 | table::{SsTable, SsTableIterator}, 26 | }; 27 | 28 | /// Concat multiple iterators ordered in key order and their key ranges do not overlap. We do not want to create the 29 | /// iterators when initializing this iterator to reduce the overhead of seeking. 
30 | pub struct SstConcatIterator { 31 | current: Option, 32 | next_sst_idx: usize, 33 | sstables: Vec>, 34 | } 35 | 36 | impl SstConcatIterator { 37 | pub fn create_and_seek_to_first(sstables: Vec>) -> Result { 38 | unimplemented!() 39 | } 40 | 41 | pub fn create_and_seek_to_key(sstables: Vec>, key: KeySlice) -> Result { 42 | unimplemented!() 43 | } 44 | } 45 | 46 | impl StorageIterator for SstConcatIterator { 47 | type KeyType<'a> = KeySlice<'a>; 48 | 49 | fn key(&self) -> KeySlice { 50 | unimplemented!() 51 | } 52 | 53 | fn value(&self) -> &[u8] { 54 | unimplemented!() 55 | } 56 | 57 | fn is_valid(&self) -> bool { 58 | unimplemented!() 59 | } 60 | 61 | fn next(&mut self) -> Result<()> { 62 | unimplemented!() 63 | } 64 | 65 | fn num_active_iterators(&self) -> usize { 66 | 1 67 | } 68 | } 69 | -------------------------------------------------------------------------------- /mini-lsm-mvcc/src/tests/week3_day1.rs: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2022-2025 Alex Chi Z 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | 15 | use std::sync::Arc; 16 | 17 | use bytes::Bytes; 18 | use tempfile::tempdir; 19 | 20 | use crate::key::KeySlice; 21 | use crate::table::{FileObject, SsTable, SsTableBuilder, SsTableIterator}; 22 | 23 | use super::harness::{check_iter_result_by_key_and_ts, generate_sst_with_ts}; 24 | 25 | #[test] 26 | fn test_sst_build_multi_version_simple() { 27 | let mut builder = SsTableBuilder::new(16); 28 | builder.add( 29 | KeySlice::for_testing_from_slice_with_ts(b"233", 233), 30 | b"233333", 31 | ); 32 | builder.add( 33 | KeySlice::for_testing_from_slice_with_ts(b"233", 0), 34 | b"2333333", 35 | ); 36 | let dir = tempdir().unwrap(); 37 | builder.build_for_test(dir.path().join("1.sst")).unwrap(); 38 | } 39 | 40 | fn generate_test_data() -> Vec<((Bytes, u64), Bytes)> { 41 | (0..100) 42 | .map(|id| { 43 | ( 44 | (Bytes::from(format!("key{:05}", id / 5)), 5 - (id % 5)), 45 | Bytes::from(format!("value{:05}", id)), 46 | ) 47 | }) 48 | .collect() 49 | } 50 | 51 | #[test] 52 | fn test_sst_build_multi_version_hard() { 53 | let dir = tempdir().unwrap(); 54 | let data = generate_test_data(); 55 | generate_sst_with_ts(1, dir.path().join("1.sst"), data.clone(), None); 56 | let sst = Arc::new( 57 | SsTable::open( 58 | 1, 59 | None, 60 | FileObject::open(&dir.path().join("1.sst")).unwrap(), 61 | ) 62 | .unwrap(), 63 | ); 64 | check_iter_result_by_key_and_ts( 65 | &mut SsTableIterator::create_and_seek_to_first(sst).unwrap(), 66 | data, 67 | ); 68 | } 69 | -------------------------------------------------------------------------------- /mini-lsm-book/src/SUMMARY.md: -------------------------------------------------------------------------------- 1 | 4 | 5 | # LSM in a Week 6 | 7 | [Preface](./00-preface.md) 8 | [Mini-LSM Overview](./00-overview.md) 9 | [Environment Setup](./00-get-started.md) 10 | 11 | - [Week 1 Overview: Mini-LSM](./week1-overview.md) 12 | - [Memtable](./week1-01-memtable.md) 13 | - [Merge Iterator](./week1-02-merge-iterator.md) 14 | - [Block](./week1-03-block.md) 
15 | - [Sorted String Table (SST)](./week1-04-sst.md) 16 | - [Read Path](./week1-05-read-path.md) 17 | - [Write Path](./week1-06-write-path.md) 18 | - [Snack Time: SST Optimizations](./week1-07-sst-optimizations.md) 19 | 20 | - [Week 2 Overview: Compaction + Persistence](./week2-overview.md) 21 | - [Compaction Implementation](./week2-01-compaction.md) 22 | - [Simple Compaction Strategy](./week2-02-simple.md) 23 | - [Tiered Compaction Strategy](./week2-03-tiered.md) 24 | - [Leveled Compaction Strategy](./week2-04-leveled.md) 25 | - [Manifest](./week2-05-manifest.md) 26 | - [Write-Ahead Log (WAL)](./week2-06-wal.md) 27 | - [Snack Time: Batch Write and Checksums](./week2-07-snacks.md) 28 | 29 | - [Week 3 Overview: MVCC](./week3-overview.md) 30 | - [Timestamp Encoding + Refactor](./week3-01-ts-key-refactor.md) 31 | - [Snapshots - Memtables and Timestamps](./week3-02-snapshot-read-part-1.md) 32 | - [Snapshots - Transaction API](./week3-03-snapshot-read-part-2.md) 33 | - [Watermark and GC](./week3-04-watermark.md) 34 | - [Transaction and OCC](./week3-05-txn-occ.md) 35 | - [Serializable Snapshot Isolation](./week3-06-serializable.md) 36 | - [Snack Time: Compaction Filters](./week3-07-compaction-filter.md) 37 | - [The Rest of Your Life (TBD)](./week4-overview.md) 38 | 39 | --- 40 | 41 | # DEPRECATED Mini-LSM v1 42 | 43 | - [Overview](./00-v1.md) 44 | - [Store key-value pairs in little blocks](./01-block.md) 45 | - [And make them into an SST](./02-sst.md) 46 | - [Now it's time to merge everything](./03-memtable.md) 47 | - [The engine is on fire](./04-engine.md) 48 | - [Let's do something in the background](./05-compaction.md) 49 | - [Be careful when the system crashes](./06-recovery.md) 50 | - [A good bloom filter makes life easier](./07-bloom-filter.md) 51 | - [Save some space, hopefully](./08-key-compression.md) 52 | - [What's next](./09-whats-next.md) 53 | -------------------------------------------------------------------------------- 
/mini-lsm-starter/src/compact/leveled.rs: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2022-2025 Alex Chi Z 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | use serde::{Deserialize, Serialize}; 16 | 17 | use crate::lsm_storage::LsmStorageState; 18 | 19 | #[derive(Debug, Serialize, Deserialize)] 20 | pub struct LeveledCompactionTask { 21 | // if upper_level is `None`, then it is L0 compaction 22 | pub upper_level: Option, 23 | pub upper_level_sst_ids: Vec, 24 | pub lower_level: usize, 25 | pub lower_level_sst_ids: Vec, 26 | pub is_lower_level_bottom_level: bool, 27 | } 28 | 29 | #[derive(Debug, Clone)] 30 | pub struct LeveledCompactionOptions { 31 | pub level_size_multiplier: usize, 32 | pub level0_file_num_compaction_trigger: usize, 33 | pub max_levels: usize, 34 | pub base_level_size_mb: usize, 35 | } 36 | 37 | pub struct LeveledCompactionController { 38 | options: LeveledCompactionOptions, 39 | } 40 | 41 | impl LeveledCompactionController { 42 | pub fn new(options: LeveledCompactionOptions) -> Self { 43 | Self { options } 44 | } 45 | 46 | fn find_overlapping_ssts( 47 | &self, 48 | _snapshot: &LsmStorageState, 49 | _sst_ids: &[usize], 50 | _in_level: usize, 51 | ) -> Vec { 52 | unimplemented!() 53 | } 54 | 55 | pub fn generate_compaction_task( 56 | &self, 57 | _snapshot: &LsmStorageState, 58 | ) -> Option { 59 | unimplemented!() 60 | } 61 | 62 | pub fn 
apply_compaction_result( 63 | &self, 64 | _snapshot: &LsmStorageState, 65 | _task: &LeveledCompactionTask, 66 | _output: &[usize], 67 | _in_recovery: bool, 68 | ) -> (LsmStorageState, Vec) { 69 | unimplemented!() 70 | } 71 | } 72 | -------------------------------------------------------------------------------- /mini-lsm-book/src/discord-badge.svg: -------------------------------------------------------------------------------- 1 | MDbot#9808skyzh's server -------------------------------------------------------------------------------- /mini-lsm-starter/src/mvcc.rs: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2022-2025 Alex Chi Z 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | 15 | #![allow(unused_variables)] // TODO(you): remove this lint after implementing this mod 16 | #![allow(dead_code)] // TODO(you): remove this lint after implementing this mod 17 | 18 | pub mod txn; 19 | pub mod watermark; 20 | 21 | use std::{ 22 | collections::{BTreeMap, HashSet}, 23 | sync::Arc, 24 | }; 25 | 26 | use parking_lot::Mutex; 27 | 28 | use self::{txn::Transaction, watermark::Watermark}; 29 | use crate::lsm_storage::LsmStorageInner; 30 | 31 | pub(crate) struct CommittedTxnData { 32 | pub(crate) key_hashes: HashSet, 33 | #[allow(dead_code)] 34 | pub(crate) read_ts: u64, 35 | #[allow(dead_code)] 36 | pub(crate) commit_ts: u64, 37 | } 38 | 39 | pub(crate) struct LsmMvccInner { 40 | pub(crate) write_lock: Mutex<()>, 41 | pub(crate) commit_lock: Mutex<()>, 42 | pub(crate) ts: Arc>, 43 | pub(crate) committed_txns: Arc>>, 44 | } 45 | 46 | impl LsmMvccInner { 47 | pub fn new(initial_ts: u64) -> Self { 48 | Self { 49 | write_lock: Mutex::new(()), 50 | commit_lock: Mutex::new(()), 51 | ts: Arc::new(Mutex::new((initial_ts, Watermark::new()))), 52 | committed_txns: Arc::new(Mutex::new(BTreeMap::new())), 53 | } 54 | } 55 | 56 | pub fn latest_commit_ts(&self) -> u64 { 57 | self.ts.lock().0 58 | } 59 | 60 | pub fn update_commit_ts(&self, ts: u64) { 61 | self.ts.lock().0 = ts; 62 | } 63 | 64 | /// All ts (strictly) below this ts can be garbage collected. 65 | pub fn watermark(&self) -> u64 { 66 | let ts = self.ts.lock(); 67 | ts.1.watermark().unwrap_or(ts.0) 68 | } 69 | 70 | pub fn new_txn(&self, inner: Arc, serializable: bool) -> Arc { 71 | unimplemented!() 72 | } 73 | } 74 | -------------------------------------------------------------------------------- /mini-lsm-starter/src/table/builder.rs: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2022-2025 Alex Chi Z 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 
5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | #![allow(unused_variables)] // TODO(you): remove this lint after implementing this mod 16 | #![allow(dead_code)] // TODO(you): remove this lint after implementing this mod 17 | 18 | use std::path::Path; 19 | use std::sync::Arc; 20 | 21 | use anyhow::Result; 22 | 23 | use super::{BlockMeta, SsTable}; 24 | use crate::{block::BlockBuilder, key::KeySlice, lsm_storage::BlockCache}; 25 | 26 | /// Builds an SSTable from key-value pairs. 27 | pub struct SsTableBuilder { 28 | builder: BlockBuilder, 29 | first_key: Vec, 30 | last_key: Vec, 31 | data: Vec, 32 | pub(crate) meta: Vec, 33 | block_size: usize, 34 | } 35 | 36 | impl SsTableBuilder { 37 | /// Create a builder based on target block size. 38 | pub fn new(block_size: usize) -> Self { 39 | unimplemented!() 40 | } 41 | 42 | /// Adds a key-value pair to SSTable. 43 | /// 44 | /// Note: You should split a new block when the current block is full.(`std::mem::replace` may 45 | /// be helpful here) 46 | pub fn add(&mut self, key: KeySlice, value: &[u8]) { 47 | unimplemented!() 48 | } 49 | 50 | /// Get the estimated size of the SSTable. 51 | /// 52 | /// Since the data blocks contain much more data than meta blocks, just return the size of data 53 | /// blocks here. 54 | pub fn estimated_size(&self) -> usize { 55 | unimplemented!() 56 | } 57 | 58 | /// Builds the SSTable and writes it to the given path. Use the `FileObject` structure to manipulate the disk objects. 
59 | pub fn build( 60 | #[allow(unused_mut)] mut self, 61 | id: usize, 62 | block_cache: Option>, 63 | path: impl AsRef, 64 | ) -> Result { 65 | unimplemented!() 66 | } 67 | 68 | #[cfg(test)] 69 | pub(crate) fn build_for_test(self, path: impl AsRef) -> Result { 70 | self.build(0, None, path) 71 | } 72 | } 73 | -------------------------------------------------------------------------------- /mini-lsm-starter/src/iterators/merge_iterator.rs: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2022-2025 Alex Chi Z 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | 15 | #![allow(unused_variables)] // TODO(you): remove this lint after implementing this mod 16 | #![allow(dead_code)] // TODO(you): remove this lint after implementing this mod 17 | 18 | use std::cmp::{self}; 19 | use std::collections::BinaryHeap; 20 | 21 | use anyhow::Result; 22 | 23 | use crate::key::KeySlice; 24 | 25 | use super::StorageIterator; 26 | 27 | struct HeapWrapper(pub usize, pub Box); 28 | 29 | impl PartialEq for HeapWrapper { 30 | fn eq(&self, other: &Self) -> bool { 31 | self.cmp(other) == cmp::Ordering::Equal 32 | } 33 | } 34 | 35 | impl Eq for HeapWrapper {} 36 | 37 | impl PartialOrd for HeapWrapper { 38 | fn partial_cmp(&self, other: &Self) -> Option { 39 | Some(self.cmp(other)) 40 | } 41 | } 42 | 43 | impl Ord for HeapWrapper { 44 | fn cmp(&self, other: &Self) -> cmp::Ordering { 45 | self.1 46 | .key() 47 | .cmp(&other.1.key()) 48 | .then(self.0.cmp(&other.0)) 49 | .reverse() 50 | } 51 | } 52 | 53 | /// Merge multiple iterators of the same type. If the same key occurs multiple times in some 54 | /// iterators, prefer the one with smaller index. 
55 | pub struct MergeIterator { 56 | iters: BinaryHeap>, 57 | current: Option>, 58 | } 59 | 60 | impl MergeIterator { 61 | pub fn create(iters: Vec>) -> Self { 62 | unimplemented!() 63 | } 64 | } 65 | 66 | impl StorageIterator = KeySlice<'a>>> StorageIterator 67 | for MergeIterator 68 | { 69 | type KeyType<'a> = KeySlice<'a>; 70 | 71 | fn key(&self) -> KeySlice { 72 | unimplemented!() 73 | } 74 | 75 | fn value(&self) -> &[u8] { 76 | unimplemented!() 77 | } 78 | 79 | fn is_valid(&self) -> bool { 80 | unimplemented!() 81 | } 82 | 83 | fn next(&mut self) -> Result<()> { 84 | unimplemented!() 85 | } 86 | } 87 | -------------------------------------------------------------------------------- /mini-lsm-starter/src/compact/simple_leveled.rs: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2022-2025 Alex Chi Z 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | 15 | use serde::{Deserialize, Serialize}; 16 | 17 | use crate::lsm_storage::LsmStorageState; 18 | 19 | #[derive(Debug, Clone)] 20 | pub struct SimpleLeveledCompactionOptions { 21 | pub size_ratio_percent: usize, 22 | pub level0_file_num_compaction_trigger: usize, 23 | pub max_levels: usize, 24 | } 25 | 26 | #[derive(Debug, Serialize, Deserialize)] 27 | pub struct SimpleLeveledCompactionTask { 28 | // if upper_level is `None`, then it is L0 compaction 29 | pub upper_level: Option, 30 | pub upper_level_sst_ids: Vec, 31 | pub lower_level: usize, 32 | pub lower_level_sst_ids: Vec, 33 | pub is_lower_level_bottom_level: bool, 34 | } 35 | 36 | pub struct SimpleLeveledCompactionController { 37 | options: SimpleLeveledCompactionOptions, 38 | } 39 | 40 | impl SimpleLeveledCompactionController { 41 | pub fn new(options: SimpleLeveledCompactionOptions) -> Self { 42 | Self { options } 43 | } 44 | 45 | /// Generates a compaction task. 46 | /// 47 | /// Returns `None` if no compaction needs to be scheduled. The order of SSTs in the compaction task id vector matters. 48 | pub fn generate_compaction_task( 49 | &self, 50 | _snapshot: &LsmStorageState, 51 | ) -> Option { 52 | unimplemented!() 53 | } 54 | 55 | /// Apply the compaction result. 56 | /// 57 | /// The compactor will call this function with the compaction task and the list of SST ids generated. This function applies the 58 | /// result and generates a new LSM state. The functions should only change `l0_sstables` and `levels` without changing memtables 59 | /// and `sstables` hash map. Though there should only be one thread running compaction jobs, you should think about the case 60 | /// where an L0 SST gets flushed while the compactor generates new SSTs, and with that in mind, you should do some sanity checks 61 | /// in your implementation. 
62 | pub fn apply_compaction_result( 63 | &self, 64 | _snapshot: &LsmStorageState, 65 | _task: &SimpleLeveledCompactionTask, 66 | _output: &[usize], 67 | ) -> (LsmStorageState, Vec) { 68 | unimplemented!() 69 | } 70 | } 71 | -------------------------------------------------------------------------------- /mini-lsm-starter/src/table/iterator.rs: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2022-2025 Alex Chi Z 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | #![allow(unused_variables)] // TODO(you): remove this lint after implementing this mod 16 | #![allow(dead_code)] // TODO(you): remove this lint after implementing this mod 17 | 18 | use std::sync::Arc; 19 | 20 | use anyhow::Result; 21 | 22 | use super::SsTable; 23 | use crate::{block::BlockIterator, iterators::StorageIterator, key::KeySlice}; 24 | 25 | /// An iterator over the contents of an SSTable. 26 | pub struct SsTableIterator { 27 | table: Arc, 28 | blk_iter: BlockIterator, 29 | blk_idx: usize, 30 | } 31 | 32 | impl SsTableIterator { 33 | /// Create a new iterator and seek to the first key-value pair in the first data block. 34 | pub fn create_and_seek_to_first(table: Arc) -> Result { 35 | unimplemented!() 36 | } 37 | 38 | /// Seek to the first key-value pair in the first data block. 
39 | pub fn seek_to_first(&mut self) -> Result<()> { 40 | unimplemented!() 41 | } 42 | 43 | /// Create a new iterator and seek to the first key-value pair which >= `key`. 44 | pub fn create_and_seek_to_key(table: Arc, key: KeySlice) -> Result { 45 | unimplemented!() 46 | } 47 | 48 | /// Seek to the first key-value pair which >= `key`. 49 | /// Note: You probably want to review the handout for detailed explanation when implementing 50 | /// this function. 51 | pub fn seek_to_key(&mut self, key: KeySlice) -> Result<()> { 52 | unimplemented!() 53 | } 54 | } 55 | 56 | impl StorageIterator for SsTableIterator { 57 | type KeyType<'a> = KeySlice<'a>; 58 | 59 | /// Return the `key` that's held by the underlying block iterator. 60 | fn key(&self) -> KeySlice { 61 | unimplemented!() 62 | } 63 | 64 | /// Return the `value` that's held by the underlying block iterator. 65 | fn value(&self) -> &[u8] { 66 | unimplemented!() 67 | } 68 | 69 | /// Return whether the current block iterator is valid or not. 70 | fn is_valid(&self) -> bool { 71 | unimplemented!() 72 | } 73 | 74 | /// Move to the next `key` in the block. 75 | /// Note: You may want to check if the current block iterator is valid after the move. 76 | fn next(&mut self) -> Result<()> { 77 | unimplemented!() 78 | } 79 | } 80 | -------------------------------------------------------------------------------- /mini-lsm-starter/src/lsm_iterator.rs: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2022-2025 Alex Chi Z 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 
5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | #![allow(unused_variables)] // TODO(you): remove this lint after implementing this mod 16 | #![allow(dead_code)] // TODO(you): remove this lint after implementing this mod 17 | 18 | use anyhow::Result; 19 | 20 | use crate::{ 21 | iterators::{StorageIterator, merge_iterator::MergeIterator}, 22 | mem_table::MemTableIterator, 23 | }; 24 | 25 | /// The internal iterator type wrapped by `LsmIterator`. This alias will be redefined several times over the course. 26 | type LsmIteratorInner = MergeIterator; 27 | 28 | pub struct LsmIterator { 29 | inner: LsmIteratorInner, 30 | } 31 | 32 | impl LsmIterator { 33 | pub(crate) fn new(iter: LsmIteratorInner) -> Result { 34 | Ok(Self { inner: iter }) 35 | } 36 | } 37 | 38 | impl StorageIterator for LsmIterator { 39 | type KeyType<'a> = &'a [u8]; 40 | 41 | fn is_valid(&self) -> bool { 42 | unimplemented!() 43 | } 44 | 45 | fn key(&self) -> &[u8] { 46 | unimplemented!() 47 | } 48 | 49 | fn value(&self) -> &[u8] { 50 | unimplemented!() 51 | } 52 | 53 | fn next(&mut self) -> Result<()> { 54 | unimplemented!() 55 | } 56 | } 57 | 58 | /// A wrapper around an existing iterator that prevents users from calling `next` when the iterator is 59 | /// invalid. If the iterator is already invalid, `next` does nothing. Once `next` has returned an error, 60 | /// `is_valid` should return false, and every later call to `next` should return an error. 
61 | pub struct FusedIterator { 62 | iter: I, 63 | has_errored: bool, 64 | } 65 | 66 | impl FusedIterator { 67 | pub fn new(iter: I) -> Self { 68 | Self { 69 | iter, 70 | has_errored: false, 71 | } 72 | } 73 | } 74 | 75 | impl StorageIterator for FusedIterator { 76 | type KeyType<'a> 77 | = I::KeyType<'a> 78 | where 79 | Self: 'a; 80 | 81 | fn is_valid(&self) -> bool { 82 | unimplemented!() 83 | } 84 | 85 | fn key(&self) -> Self::KeyType<'_> { 86 | unimplemented!() 87 | } 88 | 89 | fn value(&self) -> &[u8] { 90 | unimplemented!() 91 | } 92 | 93 | fn next(&mut self) -> Result<()> { 94 | unimplemented!() 95 | } 96 | } 97 | -------------------------------------------------------------------------------- /mini-lsm-mvcc/src/mvcc.rs: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2022-2025 Alex Chi Z 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | 15 | #![allow(unused_variables)] // TODO(you): remove this lint after implementing this mod 16 | #![allow(dead_code)] // TODO(you): remove this lint after implementing this mod 17 | 18 | pub mod txn; 19 | pub mod watermark; 20 | 21 | use std::{ 22 | collections::{BTreeMap, HashSet}, 23 | sync::{Arc, atomic::AtomicBool}, 24 | }; 25 | 26 | use crossbeam_skiplist::SkipMap; 27 | use parking_lot::Mutex; 28 | 29 | use crate::lsm_storage::LsmStorageInner; 30 | 31 | use self::{txn::Transaction, watermark::Watermark}; 32 | 33 | pub(crate) struct CommittedTxnData { 34 | pub(crate) key_hashes: HashSet, 35 | #[allow(dead_code)] 36 | pub(crate) read_ts: u64, 37 | #[allow(dead_code)] 38 | pub(crate) commit_ts: u64, 39 | } 40 | 41 | pub(crate) struct LsmMvccInner { 42 | pub(crate) write_lock: Mutex<()>, 43 | pub(crate) commit_lock: Mutex<()>, 44 | pub(crate) ts: Arc>, 45 | pub(crate) committed_txns: Arc>>, 46 | } 47 | 48 | impl LsmMvccInner { 49 | pub fn new(initial_ts: u64) -> Self { 50 | Self { 51 | write_lock: Mutex::new(()), 52 | commit_lock: Mutex::new(()), 53 | ts: Arc::new(Mutex::new((initial_ts, Watermark::new()))), 54 | committed_txns: Arc::new(Mutex::new(BTreeMap::new())), 55 | } 56 | } 57 | 58 | pub fn latest_commit_ts(&self) -> u64 { 59 | self.ts.lock().0 60 | } 61 | 62 | pub fn update_commit_ts(&self, ts: u64) { 63 | self.ts.lock().0 = ts; 64 | } 65 | 66 | /// All ts (strictly) below this ts can be garbage collected. 
67 | pub fn watermark(&self) -> u64 { 68 | let ts = self.ts.lock(); 69 | ts.1.watermark().unwrap_or(ts.0) 70 | } 71 | 72 | pub fn new_txn(&self, inner: Arc, serializable: bool) -> Arc { 73 | let mut ts = self.ts.lock(); 74 | let read_ts = ts.0; 75 | ts.1.add_reader(read_ts); 76 | Arc::new(Transaction { 77 | inner, 78 | read_ts, 79 | local_storage: Arc::new(SkipMap::new()), 80 | committed: Arc::new(AtomicBool::new(false)), 81 | key_hashes: if serializable { 82 | Some(Mutex::new((HashSet::new(), HashSet::new()))) 83 | } else { 84 | None 85 | }, 86 | }) 87 | } 88 | } 89 | -------------------------------------------------------------------------------- /mini-lsm-mvcc/src/tests/week3_day2.rs: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2022-2025 Alex Chi Z 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | 15 | use std::time::Duration; 16 | 17 | use tempfile::tempdir; 18 | 19 | use crate::{ 20 | compact::CompactionOptions, 21 | lsm_storage::{LsmStorageOptions, MiniLsm}, 22 | tests::harness::dump_files_in_dir, 23 | }; 24 | 25 | #[test] 26 | fn test_task3_compaction_integration() { 27 | let dir = tempdir().unwrap(); 28 | let mut options = LsmStorageOptions::default_for_week2_test(CompactionOptions::NoCompaction); 29 | options.enable_wal = true; 30 | let storage = MiniLsm::open(&dir, options.clone()).unwrap(); 31 | let _txn = storage.new_txn().unwrap(); 32 | for i in 0..=20000 { 33 | storage 34 | .put(b"0", format!("{:02000}", i).as_bytes()) 35 | .unwrap(); 36 | } 37 | std::thread::sleep(Duration::from_secs(1)); // wait until all memtables flush 38 | while { 39 | let snapshot = storage.inner.state.read(); 40 | !snapshot.imm_memtables.is_empty() 41 | } { 42 | storage.inner.force_flush_next_imm_memtable().unwrap(); 43 | } 44 | assert!(storage.inner.state.read().l0_sstables.len() > 1); 45 | storage.force_full_compaction().unwrap(); 46 | storage.dump_structure(); 47 | dump_files_in_dir(&dir); 48 | assert!(storage.inner.state.read().l0_sstables.is_empty()); 49 | assert_eq!(storage.inner.state.read().levels.len(), 1); 50 | // same key in the same SST 51 | assert_eq!(storage.inner.state.read().levels[0].1.len(), 1); 52 | for i in 0..=100 { 53 | storage 54 | .put(b"1", format!("{:02000}", i).as_bytes()) 55 | .unwrap(); 56 | } 57 | storage 58 | .inner 59 | .force_freeze_memtable(&storage.inner.state_lock.lock()) 60 | .unwrap(); 61 | std::thread::sleep(Duration::from_secs(1)); // wait until all memtables flush 62 | while { 63 | let snapshot = storage.inner.state.read(); 64 | !snapshot.imm_memtables.is_empty() 65 | } { 66 | storage.inner.force_flush_next_imm_memtable().unwrap(); 67 | } 68 | storage.force_full_compaction().unwrap(); 69 | storage.dump_structure(); 70 | dump_files_in_dir(&dir); 71 | assert!(storage.inner.state.read().l0_sstables.is_empty()); 72 | 
assert_eq!(storage.inner.state.read().levels.len(), 1); 73 | // same key in the same SST, now we should split two 74 | assert_eq!(storage.inner.state.read().levels[0].1.len(), 2); 75 | } 76 | -------------------------------------------------------------------------------- /mini-lsm-book/src/lsm-tutorial/week1-01-single.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | week1-01-single 13 | 14 | 15 | Layer 1 16 | 17 | 18 | 19 | 20 | 21 | On Disk 22 | 23 | 24 | 25 | 26 | In Memory 27 | 28 | 29 | 30 | 31 | 32 | 33 | Mem 34 | Table 35 | 36 | 37 | 38 | 39 | 40 | 41 | key + value 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | -------------------------------------------------------------------------------- /mini-lsm-book/src/00-overview.md: -------------------------------------------------------------------------------- 1 | 4 | 5 | # Mini-LSM Course Overview 6 | 7 | ## Course Structure 8 | 9 | ![Course Overview](lsm-tutorial/00-full-overview.svg) 10 | 11 | We have three parts (weeks) for this course. In the first week, we will focus on the storage structure and the storage format of an LSM storage engine. In the second week, we will deeply dive into compactions and implement persistence support for the storage engine. In the third week, we will implement multi-version concurrency control. 12 | 13 | * [The First Week: Mini-LSM](./week1-overview.md) 14 | * [The Second Week: Compaction and Persistence](./week2-overview.md) 15 | * [The Third Week: Multi-Version Concurrency Control](./week3-overview.md) 16 | 17 | Please look at [Environment Setup](./00-get-started.md) to set up the environment. 18 | 19 | ## Overview of LSM 20 | 21 | An LSM storage engine generally contains three parts: 22 | 23 | 1. Write-ahead log to persist temporary data for recovery. 24 | 2. SSTs on the disk to maintain an LSM-tree structure. 25 | 3. Mem-tables in memory for batching small writes. 
26 | 27 | The storage engine generally provides the following interfaces: 28 | 29 | * `Put(key, value)`: store a key-value pair in the LSM tree. 30 | * `Delete(key)`: remove a key and its corresponding value. 31 | * `Get(key)`: get the value corresponding to a key. 32 | * `Scan(range)`: get a range of key-value pairs. 33 | 34 | To ensure persistence, the engine also provides: 35 | 36 | * `Sync()`: ensure all the operations before the `Sync()` call are persisted to the disk. 37 | 38 | Some engines choose to combine `Put` and `Delete` into a single operation called `WriteBatch`, which accepts a batch of key-value pairs. 39 | 40 | In this course, we assume the LSM tree is using a leveled compaction algorithm, which is commonly used in real-world systems. 41 | 42 | ### Write Path 43 | 44 | ![Write Path](lsm-tutorial/00-lsm-write-flow.svg) 45 | 46 | The write path of LSM contains four steps: 47 | 48 | 1. Write the key-value pair to the write-ahead log so that it can be recovered after the storage engine crashes. 49 | 2. Write the key-value pair to the mem-table. After (1) and (2) are completed, we can notify the user that the write operation is completed. 50 | 3. (In the background) When a mem-table is full, we will freeze it into an immutable mem-table and flush it to the disk as an SST file. 51 | 4. (In the background) The engine will compact some files in some levels into lower levels to maintain a good shape for the LSM tree so that the read amplification is low. 52 | 53 | ### Read Path 54 | 55 | ![Read Path](lsm-tutorial/00-lsm-read-flow.svg) 56 | 57 | When we want to read a key, 58 | 59 | 1. We will first probe all the mem-tables from the latest to the oldest. 60 | 2. If the key is not found, we will then search the entire LSM tree containing SSTs to find the data. 61 | 62 | There are two types of reads: lookup and scan. Lookup finds one key in the LSM tree, while scan iterates over all keys within a range in the storage engine. We will cover both of them throughout the course. 
63 | 64 | {{#include copyright.md}} 65 | -------------------------------------------------------------------------------- /mini-lsm-starter/src/block/iterator.rs: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2022-2025 Alex Chi Z 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | #![allow(unused_variables)] // TODO(you): remove this lint after implementing this mod 16 | #![allow(dead_code)] // TODO(you): remove this lint after implementing this mod 17 | 18 | use std::sync::Arc; 19 | 20 | use crate::key::{KeySlice, KeyVec}; 21 | 22 | use super::Block; 23 | 24 | /// Iterates over the key-value pairs of a block. 25 | pub struct BlockIterator { 26 | /// The internal `Block`, wrapped by an `Arc` 27 | block: Arc, 28 | /// The current key; an empty key means the iterator is invalid 29 | key: KeyVec, 30 | /// The current value range in `block.data`, corresponding to the current key 31 | value_range: (usize, usize), 32 | /// Current index of the key-value pair; should be in the range [0, num_of_elements) 33 | idx: usize, 34 | /// The first key in the block 35 | first_key: KeyVec, 36 | } 37 | 38 | impl BlockIterator { 39 | fn new(block: Arc) -> Self { 40 | Self { 41 | block, 42 | key: KeyVec::new(), 43 | value_range: (0, 0), 44 | idx: 0, 45 | first_key: KeyVec::new(), 46 | } 47 | } 48 | 49 | /// Creates a block iterator and seeks to the first entry. 
50 | pub fn create_and_seek_to_first(block: Arc) -> Self { 51 | unimplemented!() 52 | } 53 | 54 | /// Creates a block iterator and seeks to the first key that is >= `key`. 55 | pub fn create_and_seek_to_key(block: Arc, key: KeySlice) -> Self { 56 | unimplemented!() 57 | } 58 | 59 | /// Returns the key of the current entry. 60 | pub fn key(&self) -> KeySlice { 61 | unimplemented!() 62 | } 63 | 64 | /// Returns the value of the current entry. 65 | pub fn value(&self) -> &[u8] { 66 | unimplemented!() 67 | } 68 | 69 | /// Returns true if the iterator is valid. 70 | /// Note: You may want to make use of `key`. 71 | pub fn is_valid(&self) -> bool { 72 | unimplemented!() 73 | } 74 | 75 | /// Seeks to the first key in the block. 76 | pub fn seek_to_first(&mut self) { 77 | unimplemented!() 78 | } 79 | 80 | /// Moves to the next key in the block. 81 | pub fn next(&mut self) { 82 | unimplemented!() 83 | } 84 | 85 | /// Seeks to the first key that is >= `key`. 86 | /// Note: You should assume the key-value pairs in the block were added in sorted order by 87 | /// callers. 88 | pub fn seek_to_key(&mut self, key: KeySlice) { 89 | unimplemented!() 90 | } 91 | } 92 | -------------------------------------------------------------------------------- /mini-lsm-book/src/week3-07-compaction-filter.md: -------------------------------------------------------------------------------- 1 | 4 | 5 | # Snack Time: Compaction Filters 6 | 7 | Congratulations! You made it there! In the previous chapter, you made your LSM engine multi-version capable, and the users can use transaction APIs to interact with your storage engine. At the end of this week, we will implement some easy but important features of the storage engine. Welcome to Mini-LSM's week 3 snack time! 8 | 9 | In this chapter, we will generalize our compaction garbage collection logic to become compaction filters. 
10 | 11 | For now, our compaction will simply retain the keys above the watermark and the latest version of the keys below the watermark. We can add some magic to the compaction process to help the user collect some unused data automatically as a background job. 12 | 13 | Consider a case where the user uses Mini-LSM to store database tables. Each row in the table is prefixed with the table name. For example, 14 | 15 | ``` 16 | table1_key1 -> row 17 | table1_key2 -> row 18 | table1_key3 -> row 19 | table2_key1 -> row 20 | table2_key2 -> row 21 | ``` 22 | 23 | Now the user executes `DROP TABLE table1`. The engine will need to clean up all the data beginning with `table1`. 24 | 25 | There are a lot of ways to achieve the goal. The user of Mini-LSM can scan all the keys beginning with `table1` and request the engine to delete them. However, scanning a very large database might be slow, and it will generate the same number of delete tombstones as the existing keys. Therefore, scan-and-delete will not free up the space occupied by the dropped table -- instead, it will add more data to the engine and the space can only be reclaimed when the tombstones reach the bottom level of the engine. 26 | 27 | Or, they can create column families (we will talk about this in the *rest of your life* chapter). They store each table in a column family, which is a standalone LSM state, and directly remove the SST files corresponding to the column family when the user drops the table. 28 | 29 | In this course, we will implement the third approach: compaction filters. Compaction filters can be dynamically added to the engine at runtime. During the compaction, if a key matching the compaction filter is found, we can silently remove it in the background. Therefore, the user can attach a compaction filter of `prefix=table1` to the engine, and all these keys will be removed during compaction. 
30 | 31 | ## Task 1: Compaction Filter 32 | 33 | In this task, you will need to modify: 34 | 35 | ``` 36 | src/compact.rs 37 | ``` 38 | 39 | You can iterate all compaction filters in `LsmStorageInner::compaction_filters`. If the first version of the key below watermark matches the compaction filter, simply remove it instead of keeping it in the SST file. 40 | 41 | To run test cases, 42 | 43 | ``` 44 | cargo x copy-test --week 3 --day 7 45 | cargo x scheck 46 | ``` 47 | 48 | You can assume that the user will not get the keys within the prefix filter range. And, they will not scan the keys in the prefix range. Therefore, it is okay to return a wrong value when a user requests the keys in the prefix filter range (i.e., undefined behavior). 49 | 50 | {{#include copyright.md}} 51 | -------------------------------------------------------------------------------- /mini-lsm/src/iterators/two_merge_iterator.rs: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2022-2025 Alex Chi Z 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | use anyhow::Result; 16 | 17 | use super::StorageIterator; 18 | 19 | /// Merges two iterators of different types into one. If the two iterators have the same key, only 20 | /// produce the key once and prefer the entry from A. 
21 | pub struct TwoMergeIterator { 22 | a: A, 23 | b: B, 24 | choose_a: bool, 25 | } 26 | 27 | impl< 28 | A: 'static + StorageIterator, 29 | B: 'static + for<'a> StorageIterator = A::KeyType<'a>>, 30 | > TwoMergeIterator 31 | { 32 | fn choose_a(a: &A, b: &B) -> bool { 33 | if !a.is_valid() { 34 | return false; 35 | } 36 | if !b.is_valid() { 37 | return true; 38 | } 39 | a.key() < b.key() 40 | } 41 | 42 | fn skip_b(&mut self) -> Result<()> { 43 | if self.a.is_valid() && self.b.is_valid() && self.b.key() == self.a.key() { 44 | self.b.next()?; 45 | } 46 | Ok(()) 47 | } 48 | 49 | pub fn create(a: A, b: B) -> Result { 50 | let mut iter = Self { 51 | choose_a: false, 52 | a, 53 | b, 54 | }; 55 | iter.skip_b()?; 56 | iter.choose_a = Self::choose_a(&iter.a, &iter.b); 57 | Ok(iter) 58 | } 59 | } 60 | 61 | impl< 62 | A: 'static + StorageIterator, 63 | B: 'static + for<'a> StorageIterator = A::KeyType<'a>>, 64 | > StorageIterator for TwoMergeIterator 65 | { 66 | type KeyType<'a> = A::KeyType<'a>; 67 | 68 | fn key(&self) -> Self::KeyType<'_> { 69 | if self.choose_a { 70 | self.a.key() 71 | } else { 72 | self.b.key() 73 | } 74 | } 75 | 76 | fn value(&self) -> &[u8] { 77 | if self.choose_a { 78 | self.a.value() 79 | } else { 80 | self.b.value() 81 | } 82 | } 83 | 84 | fn is_valid(&self) -> bool { 85 | if self.choose_a { 86 | self.a.is_valid() 87 | } else { 88 | self.b.is_valid() 89 | } 90 | } 91 | 92 | fn next(&mut self) -> Result<()> { 93 | if self.choose_a { 94 | self.a.next()?; 95 | } else { 96 | self.b.next()?; 97 | } 98 | self.skip_b()?; 99 | self.choose_a = Self::choose_a(&self.a, &self.b); 100 | Ok(()) 101 | } 102 | 103 | fn num_active_iterators(&self) -> usize { 104 | self.a.num_active_iterators() + self.b.num_active_iterators() 105 | } 106 | } 107 | -------------------------------------------------------------------------------- /mini-lsm-mvcc/src/iterators/two_merge_iterator.rs: -------------------------------------------------------------------------------- 1 | // 
Copyright (c) 2022-2025 Alex Chi Z 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | use anyhow::Result; 16 | 17 | use super::StorageIterator; 18 | 19 | /// Merges two iterators of different types into one. If the two iterators have the same key, only 20 | /// produce the key once and prefer the entry from A. 21 | pub struct TwoMergeIterator { 22 | a: A, 23 | b: B, 24 | choose_a: bool, 25 | } 26 | 27 | impl< 28 | A: 'static + StorageIterator, 29 | B: 'static + for<'a> StorageIterator = A::KeyType<'a>>, 30 | > TwoMergeIterator 31 | { 32 | fn choose_a(a: &A, b: &B) -> bool { 33 | if !a.is_valid() { 34 | return false; 35 | } 36 | if !b.is_valid() { 37 | return true; 38 | } 39 | a.key() < b.key() 40 | } 41 | 42 | fn skip_b(&mut self) -> Result<()> { 43 | if self.a.is_valid() && self.b.is_valid() && self.b.key() == self.a.key() { 44 | self.b.next()?; 45 | } 46 | Ok(()) 47 | } 48 | 49 | pub fn create(a: A, b: B) -> Result { 50 | let mut iter = Self { 51 | choose_a: false, 52 | a, 53 | b, 54 | }; 55 | iter.skip_b()?; 56 | iter.choose_a = Self::choose_a(&iter.a, &iter.b); 57 | Ok(iter) 58 | } 59 | } 60 | 61 | impl< 62 | A: 'static + StorageIterator, 63 | B: 'static + for<'a> StorageIterator = A::KeyType<'a>>, 64 | > StorageIterator for TwoMergeIterator 65 | { 66 | type KeyType<'a> = A::KeyType<'a>; 67 | 68 | fn key(&self) -> A::KeyType<'_> { 69 | if self.choose_a { 70 | debug_assert!(self.a.is_valid()); 71 | 
self.a.key() 72 | } else { 73 | debug_assert!(self.b.is_valid()); 74 | self.b.key() 75 | } 76 | } 77 | 78 | fn value(&self) -> &[u8] { 79 | if self.choose_a { 80 | self.a.value() 81 | } else { 82 | self.b.value() 83 | } 84 | } 85 | 86 | fn is_valid(&self) -> bool { 87 | if self.choose_a { 88 | self.a.is_valid() 89 | } else { 90 | self.b.is_valid() 91 | } 92 | } 93 | 94 | fn next(&mut self) -> Result<()> { 95 | if self.choose_a { 96 | self.a.next()?; 97 | } else { 98 | self.b.next()?; 99 | } 100 | self.skip_b()?; 101 | self.choose_a = Self::choose_a(&self.a, &self.b); 102 | Ok(()) 103 | } 104 | 105 | fn num_active_iterators(&self) -> usize { 106 | self.a.num_active_iterators() + self.b.num_active_iterators() 107 | } 108 | } 109 | -------------------------------------------------------------------------------- /mini-lsm/src/tests/week2_day6.rs: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2022-2025 Alex Chi Z 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | 15 | use tempfile::tempdir; 16 | 17 | use crate::{ 18 | compact::{ 19 | CompactionOptions, LeveledCompactionOptions, SimpleLeveledCompactionOptions, 20 | TieredCompactionOptions, 21 | }, 22 | lsm_storage::{LsmStorageOptions, MiniLsm}, 23 | tests::harness::dump_files_in_dir, 24 | }; 25 | 26 | #[test] 27 | fn test_integration_leveled() { 28 | test_integration(CompactionOptions::Leveled(LeveledCompactionOptions { 29 | level_size_multiplier: 2, 30 | level0_file_num_compaction_trigger: 2, 31 | max_levels: 3, 32 | base_level_size_mb: 1, 33 | })) 34 | } 35 | 36 | #[test] 37 | fn test_integration_tiered() { 38 | test_integration(CompactionOptions::Tiered(TieredCompactionOptions { 39 | num_tiers: 3, 40 | max_size_amplification_percent: 200, 41 | size_ratio: 1, 42 | min_merge_width: 3, 43 | max_merge_width: None, 44 | })) 45 | } 46 | 47 | #[test] 48 | fn test_integration_simple() { 49 | test_integration(CompactionOptions::Simple(SimpleLeveledCompactionOptions { 50 | size_ratio_percent: 200, 51 | level0_file_num_compaction_trigger: 2, 52 | max_levels: 3, 53 | })); 54 | } 55 | 56 | fn test_integration(compaction_options: CompactionOptions) { 57 | let dir = tempdir().unwrap(); 58 | let mut options = LsmStorageOptions::default_for_week2_test(compaction_options); 59 | options.enable_wal = true; 60 | let storage = MiniLsm::open(&dir, options.clone()).unwrap(); 61 | for i in 0..=20 { 62 | storage.put(b"0", format!("v{}", i).as_bytes()).unwrap(); 63 | if i % 2 == 0 { 64 | storage.put(b"1", format!("v{}", i).as_bytes()).unwrap(); 65 | } else { 66 | storage.delete(b"1").unwrap(); 67 | } 68 | if i % 2 == 1 { 69 | storage.put(b"2", format!("v{}", i).as_bytes()).unwrap(); 70 | } else { 71 | storage.delete(b"2").unwrap(); 72 | } 73 | storage 74 | .inner 75 | .force_freeze_memtable(&storage.inner.state_lock.lock()) 76 | .unwrap(); 77 | } 78 | storage.close().unwrap(); 79 | // ensure some SSTs are not flushed 80 | assert!( 81 | !storage.inner.state.read().memtable.is_empty() 82 | || 
!storage.inner.state.read().imm_memtables.is_empty() 83 | ); 84 | storage.dump_structure(); 85 | drop(storage); 86 | dump_files_in_dir(&dir); 87 | 88 | let storage = MiniLsm::open(&dir, options).unwrap(); 89 | assert_eq!(&storage.get(b"0").unwrap().unwrap()[..], b"v20".as_slice()); 90 | assert_eq!(&storage.get(b"1").unwrap().unwrap()[..], b"v20".as_slice()); 91 | assert_eq!(storage.get(b"2").unwrap(), None); 92 | } 93 | -------------------------------------------------------------------------------- /mini-lsm-mvcc/src/tests/week3_day7.rs: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2022-2025 Alex Chi Z 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | 15 | use bytes::Bytes; 16 | use tempfile::tempdir; 17 | 18 | use crate::{ 19 | compact::CompactionOptions, 20 | lsm_storage::{CompactionFilter, LsmStorageOptions, MiniLsm, WriteBatchRecord}, 21 | }; 22 | 23 | use super::harness::{check_iter_result_by_key, construct_merge_iterator_over_storage}; 24 | 25 | #[test] 26 | fn test_task3_mvcc_compaction() { 27 | let dir = tempdir().unwrap(); 28 | let options = LsmStorageOptions::default_for_week2_test(CompactionOptions::NoCompaction); 29 | let storage = MiniLsm::open(&dir, options.clone()).unwrap(); 30 | storage 31 | .write_batch(&[ 32 | WriteBatchRecord::Put("table1_a", "1"), 33 | WriteBatchRecord::Put("table1_b", "1"), 34 | WriteBatchRecord::Put("table1_c", "1"), 35 | WriteBatchRecord::Put("table2_a", "1"), 36 | WriteBatchRecord::Put("table2_b", "1"), 37 | WriteBatchRecord::Put("table2_c", "1"), 38 | ]) 39 | .unwrap(); 40 | storage.force_flush().unwrap(); 41 | let snapshot0 = storage.new_txn().unwrap(); 42 | storage 43 | .write_batch(&[ 44 | WriteBatchRecord::Put("table1_a", "2"), 45 | WriteBatchRecord::Del("table1_b"), 46 | WriteBatchRecord::Put("table1_c", "2"), 47 | WriteBatchRecord::Put("table2_a", "2"), 48 | WriteBatchRecord::Del("table2_b"), 49 | WriteBatchRecord::Put("table2_c", "2"), 50 | ]) 51 | .unwrap(); 52 | storage.force_flush().unwrap(); 53 | storage.add_compaction_filter(CompactionFilter::Prefix(Bytes::from("table2_"))); 54 | storage.force_full_compaction().unwrap(); 55 | 56 | let mut iter = construct_merge_iterator_over_storage(&storage.inner.state.read()); 57 | check_iter_result_by_key( 58 | &mut iter, 59 | vec![ 60 | (Bytes::from("table1_a"), Bytes::from("2")), 61 | (Bytes::from("table1_a"), Bytes::from("1")), 62 | (Bytes::from("table1_b"), Bytes::new()), 63 | (Bytes::from("table1_b"), Bytes::from("1")), 64 | (Bytes::from("table1_c"), Bytes::from("2")), 65 | (Bytes::from("table1_c"), Bytes::from("1")), 66 | (Bytes::from("table2_a"), Bytes::from("2")), 67 | (Bytes::from("table2_b"), Bytes::new()), 
68 | (Bytes::from("table2_c"), Bytes::from("2")), 69 | ], 70 | ); 71 | 72 | drop(snapshot0); 73 | 74 | storage.force_full_compaction().unwrap(); 75 | 76 | let mut iter = construct_merge_iterator_over_storage(&storage.inner.state.read()); 77 | check_iter_result_by_key( 78 | &mut iter, 79 | vec![ 80 | (Bytes::from("table1_a"), Bytes::from("2")), 81 | (Bytes::from("table1_c"), Bytes::from("2")), 82 | ], 83 | ); 84 | } 85 | -------------------------------------------------------------------------------- /mini-lsm/src/manifest.rs: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2022-2025 Alex Chi Z 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | 15 | use std::fs::{File, OpenOptions}; 16 | use std::io::{Read, Write}; 17 | use std::path::Path; 18 | use std::sync::Arc; 19 | 20 | use anyhow::{Context, Result, bail}; 21 | use bytes::{Buf, BufMut}; 22 | use parking_lot::{Mutex, MutexGuard}; 23 | use serde::{Deserialize, Serialize}; 24 | 25 | use crate::compact::CompactionTask; 26 | 27 | pub struct Manifest { 28 | file: Arc<Mutex<File>>, 29 | } 30 | 31 | #[derive(Serialize, Deserialize)] 32 | pub enum ManifestRecord { 33 | Flush(usize), 34 | NewMemtable(usize), 35 | Compaction(CompactionTask, Vec<usize>), 36 | } 37 | 38 | impl Manifest { 39 | pub fn create(path: impl AsRef<Path>) -> Result<Self> { 40 | Ok(Self { 41 | file: Arc::new(Mutex::new( 42 | OpenOptions::new() 43 | .read(true) 44 | .create_new(true) 45 | .write(true) 46 | .open(path) 47 | .context("failed to create manifest")?, 48 | )), 49 | }) 50 | } 51 | /// Reopens an existing manifest in append mode and returns it together with every record decoded from the file. 52 | pub fn recover(path: impl AsRef<Path>) -> Result<(Self, Vec<ManifestRecord>)> { 53 | let mut file = OpenOptions::new() 54 | .read(true) 55 | .append(true) 56 | .open(path) 57 | .context("failed to recover manifest")?; 58 | let mut buf = Vec::new(); 59 | file.read_to_end(&mut buf)?; 60 | let mut buf_ptr = buf.as_slice(); 61 | let mut records = Vec::new(); 62 | while buf_ptr.has_remaining() { // each record is: u64 length, JSON payload, u32 crc32 of the payload 63 | let len = buf_ptr.get_u64() as usize; 64 | let slice = buf_ptr.get(..len).context("manifest record truncated")?; // Err instead of an out-of-bounds panic 65 | buf_ptr.advance(len); 66 | let checksum = buf_ptr.get_u32(); 67 | if checksum != crc32fast::hash(slice) { // verify integrity before parsing the payload 68 | bail!("checksum mismatched!"); 69 | } 70 | let json = serde_json::from_slice::<ManifestRecord>(slice)?; 71 | records.push(json); 72 | } 73 | Ok(( 74 | Self { 75 | file: Arc::new(Mutex::new(file)), 76 | }, 77 | records, 78 | )) 79 | } 80 | 81 | pub fn add_record( 82 | &self, 83 | _state_lock_observer: &MutexGuard<()>, 84 | record: ManifestRecord, 85 | ) -> Result<()> { 86 | self.add_record_when_init(record) 87 | } 88 | 89 | pub fn add_record_when_init(&self, record: ManifestRecord) -> Result<()> { 90 | let mut file = self.file.lock(); 91 | let mut buf = 
serde_json::to_vec(&record)?; 92 | let hash = crc32fast::hash(&buf); 93 | file.write_all(&(buf.len() as u64).to_be_bytes())?; 94 | buf.put_u32(hash); 95 | file.write_all(&buf)?; 96 | file.sync_all()?; 97 | Ok(()) 98 | } 99 | } 100 | -------------------------------------------------------------------------------- /mini-lsm-mvcc/src/manifest.rs: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2022-2025 Alex Chi Z 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | 15 | use std::fs::{File, OpenOptions}; 16 | use std::io::{Read, Write}; 17 | use std::path::Path; 18 | use std::sync::Arc; 19 | 20 | use anyhow::{Context, Result, bail}; 21 | use bytes::{Buf, BufMut}; 22 | use parking_lot::{Mutex, MutexGuard}; 23 | use serde::{Deserialize, Serialize}; 24 | 25 | use crate::compact::CompactionTask; 26 | 27 | pub struct Manifest { 28 | file: Arc>, 29 | } 30 | 31 | #[derive(Serialize, Deserialize)] 32 | pub enum ManifestRecord { 33 | Flush(usize), 34 | NewMemtable(usize), 35 | Compaction(CompactionTask, Vec), 36 | } 37 | 38 | impl Manifest { 39 | pub fn create(path: impl AsRef) -> Result { 40 | Ok(Self { 41 | file: Arc::new(Mutex::new( 42 | OpenOptions::new() 43 | .read(true) 44 | .create_new(true) 45 | .write(true) 46 | .open(path) 47 | .context("failed to create manifest")?, 48 | )), 49 | }) 50 | } 51 | 52 | pub fn recover(path: impl AsRef) -> Result<(Self, Vec)> { 53 | let mut file = OpenOptions::new() 54 | .read(true) 55 | .append(true) 56 | .open(path) 57 | .context("failed to recover manifest")?; 58 | let mut buf = Vec::new(); 59 | file.read_to_end(&mut buf)?; 60 | let mut buf_ptr = buf.as_slice(); 61 | let mut records = Vec::new(); 62 | while buf_ptr.has_remaining() { 63 | let len = buf_ptr.get_u64(); 64 | let slice = &buf_ptr[..len as usize]; 65 | let json = serde_json::from_slice::(slice)?; 66 | buf_ptr.advance(len as usize); 67 | let checksum = buf_ptr.get_u32(); 68 | if checksum != crc32fast::hash(slice) { 69 | bail!("checksum mismatched!"); 70 | } 71 | records.push(json); 72 | } 73 | Ok(( 74 | Self { 75 | file: Arc::new(Mutex::new(file)), 76 | }, 77 | records, 78 | )) 79 | } 80 | 81 | pub fn add_record( 82 | &self, 83 | _state_lock_observer: &MutexGuard<()>, 84 | record: ManifestRecord, 85 | ) -> Result<()> { 86 | self.add_record_when_init(record) 87 | } 88 | 89 | pub fn add_record_when_init(&self, record: ManifestRecord) -> Result<()> { 90 | let mut file = self.file.lock(); 91 | let mut buf = 
serde_json::to_vec(&record)?; 92 | let hash = crc32fast::hash(&buf); 93 | file.write_all(&(buf.len() as u64).to_be_bytes())?; 94 | buf.put_u32(hash); 95 | file.write_all(&buf)?; 96 | file.sync_all()?; 97 | Ok(()) 98 | } 99 | } 100 | -------------------------------------------------------------------------------- /mini-lsm-mvcc/src/tests/week3_day5.rs: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2022-2025 Alex Chi Z 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
/// End-to-end check of transaction visibility: uncommitted writes are private
/// to their transaction, a transaction keeps reading its own snapshot after
/// others commit, and a transaction observes its own puts and deletes.
#[test]
fn test_txn_integration() {
    // Builds one owned key/value pair in the shape the harness compares against.
    let kv = |k: &'static str, v: &'static str| (Bytes::from(k), Bytes::from(v));

    let dir = tempdir().unwrap();
    let options = LsmStorageOptions::default_for_week2_test(CompactionOptions::NoCompaction);
    let storage = MiniLsm::open(&dir, options.clone()).unwrap();

    // Two concurrent transactions, each only sees its own pending write.
    let txn1 = storage.new_txn().unwrap();
    let txn2 = storage.new_txn().unwrap();
    txn1.put(b"test1", b"233");
    txn2.put(b"test2", b"233");
    let only_test1 = vec![kv("test1", "233")];
    check_lsm_iter_result_by_key(
        &mut txn1.scan(Bound::Unbounded, Bound::Unbounded).unwrap(),
        only_test1,
    );
    let only_test2 = vec![kv("test2", "233")];
    check_lsm_iter_result_by_key(
        &mut txn2.scan(Bound::Unbounded, Bound::Unbounded).unwrap(),
        only_test2,
    );

    // A transaction started before the commits sees nothing, before and after.
    let txn3 = storage.new_txn().unwrap();
    check_lsm_iter_result_by_key(
        &mut txn3.scan(Bound::Unbounded, Bound::Unbounded).unwrap(),
        vec![],
    );
    txn1.commit().unwrap();
    txn2.commit().unwrap();
    check_lsm_iter_result_by_key(
        &mut txn3.scan(Bound::Unbounded, Bound::Unbounded).unwrap(),
        vec![],
    );
    drop(txn3);

    // Both committed writes are visible through the storage itself.
    let both_committed = vec![kv("test1", "233"), kv("test2", "233")];
    check_lsm_iter_result_by_key(
        &mut storage.scan(Bound::Unbounded, Bound::Unbounded).unwrap(),
        both_committed,
    );

    // A fresh transaction sees the committed data...
    let txn4 = storage.new_txn().unwrap();
    assert_eq!(txn4.get(b"test1").unwrap(), Some(Bytes::from("233")));
    assert_eq!(txn4.get(b"test2").unwrap(), Some(Bytes::from("233")));
    let snapshot_view = vec![kv("test1", "233"), kv("test2", "233")];
    check_lsm_iter_result_by_key(
        &mut txn4.scan(Bound::Unbounded, Bound::Unbounded).unwrap(),
        snapshot_view,
    );

    // ...then its own overwrite...
    txn4.put(b"test2", b"2333");
    assert_eq!(txn4.get(b"test1").unwrap(), Some(Bytes::from("233")));
    assert_eq!(txn4.get(b"test2").unwrap(), Some(Bytes::from("2333")));
    let after_overwrite = vec![kv("test1", "233"), kv("test2", "2333")];
    check_lsm_iter_result_by_key(
        &mut txn4.scan(Bound::Unbounded, Bound::Unbounded).unwrap(),
        after_overwrite,
    );

    // ...and finally its own delete.
    txn4.delete(b"test2");
    assert_eq!(txn4.get(b"test1").unwrap(), Some(Bytes::from("233")));
    assert_eq!(txn4.get(b"test2").unwrap(), None);
    let after_delete = vec![kv("test1", "233")];
    check_lsm_iter_result_by_key(
        &mut txn4.scan(Bound::Unbounded, Bound::Unbounded).unwrap(),
        after_delete,
    );
}
28 | block_size: usize, 29 | /// The first key in the block 30 | first_key: KeyVec, 31 | } 32 | 33 | fn compute_overlap(first_key: KeySlice, key: KeySlice) -> usize { 34 | let mut i = 0; 35 | loop { 36 | if i >= first_key.len() || i >= key.len() { 37 | break; 38 | } 39 | if first_key.raw_ref()[i] != key.raw_ref()[i] { 40 | break; 41 | } 42 | i += 1; 43 | } 44 | i 45 | } 46 | 47 | impl BlockBuilder { 48 | /// Creates a new block builder. 49 | pub fn new(block_size: usize) -> Self { 50 | Self { 51 | offsets: Vec::new(), 52 | data: Vec::new(), 53 | block_size, 54 | first_key: KeyVec::new(), 55 | } 56 | } 57 | 58 | fn estimated_size(&self) -> usize { 59 | SIZEOF_U16 /* number of key-value pairs in the block */ + self.offsets.len() * SIZEOF_U16 /* offsets */ + self.data.len() 60 | // key-value pairs 61 | } 62 | 63 | /// Adds a key-value pair to the block. Returns false when the block is full. 64 | #[must_use] 65 | pub fn add(&mut self, key: KeySlice, value: &[u8]) -> bool { 66 | assert!(!key.is_empty(), "key must not be empty"); 67 | if self.estimated_size() + key.len() + value.len() + SIZEOF_U16 * 3 /* key_len, value_len and offset */ > self.block_size 68 | && !self.is_empty() 69 | { 70 | return false; 71 | } 72 | // Add the offset of the data into the offset array. 73 | self.offsets.push(self.data.len() as u16); 74 | let overlap = compute_overlap(self.first_key.as_key_slice(), key); 75 | // Encode key overlap. 76 | self.data.put_u16(overlap as u16); 77 | // Encode key length. 78 | self.data.put_u16((key.len() - overlap) as u16); 79 | // Encode key content. 80 | self.data.put(&key.raw_ref()[overlap..]); 81 | // Encode value length. 82 | self.data.put_u16(value.len() as u16); 83 | // Encode value content. 84 | self.data.put(value); 85 | 86 | if self.first_key.is_empty() { 87 | self.first_key = key.to_key_vec(); 88 | } 89 | 90 | true 91 | } 92 | 93 | /// Check if there are no key-value pairs in the block. 
94 | pub fn is_empty(&self) -> bool { 95 | self.offsets.is_empty() 96 | } 97 | 98 | /// Finalize the block. 99 | pub fn build(self) -> Block { 100 | if self.is_empty() { 101 | panic!("block should not be empty"); 102 | } 103 | Block { 104 | data: self.data, 105 | offsets: self.offsets, 106 | } 107 | } 108 | } 109 | -------------------------------------------------------------------------------- /mini-lsm/src/tests/week1_day7.rs: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2022-2025 Alex Chi Z 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
/// Key for entry `idx`: `key_` followed by `idx * 5`, zero-padded to 10 digits.
fn key_of(idx: usize) -> Vec<u8> {
    let spaced = idx * 5;
    Vec::from(format!("key_{:010}", spaced))
}

/// Value for entry `idx`: `value_` followed by `idx`, zero-padded to 10 digits.
fn value_of(idx: usize) -> Vec<u8> {
    Vec::from(format!("value_{:010}", idx))
}

/// Number of entries every test in this file inserts.
fn num_of_keys() -> usize {
    100
}

/// A bloom filter built from the inserted keys reports all of them as present
/// and rejects at least some, but not all, of the keys that were never added.
#[test]
fn test_task1_bloom_filter() {
    let hash_of = |idx: usize| farmhash::fingerprint32(&key_of(idx));

    let key_hashes: Vec<u32> = (0..num_of_keys()).map(hash_of).collect();
    let bits_per_key = Bloom::bloom_bits_per_key(key_hashes.len(), 0.01);
    println!("bits per key: {}", bits_per_key);
    let bloom = Bloom::build_from_key_hashes(&key_hashes, bits_per_key);
    println!("bloom size: {}, k={}", bloom.filter.len(), bloom.k);
    assert!(bloom.k < 30);

    // No false negatives for keys that were inserted.
    for idx in 0..num_of_keys() {
        assert!(bloom.may_contain(hash_of(idx)));
    }

    // Probe keys that were never inserted and count how many slip through.
    let absent = num_of_keys()..(num_of_keys() * 10);
    let cnt = absent.len();
    let x = absent.filter(|&idx| bloom.may_contain(hash_of(idx))).count();
    assert_ne!(x, cnt, "bloom filter not taking effect?");
    assert_ne!(x, 0, "bloom filter not taking effect?");
}

/// The bloom filter read back from an SST file equals the one built in memory.
#[test]
fn test_task2_sst_decode() {
    let mut builder = SsTableBuilder::new(128);
    for idx in 0..num_of_keys() {
        let (key, value) = (key_of(idx), value_of(idx));
        builder.add(KeySlice::for_testing_from_slice_no_ts(&key[..]), &value[..]);
    }
    let dir = tempdir().unwrap();
    let path = dir.path().join("1.sst");
    let built = builder.build_for_test(&path).unwrap();
    let reopened = SsTable::open(0, None, FileObject::open(&path).unwrap()).unwrap();
    let bloom_1 = built.bloom.as_ref().unwrap();
    let bloom_2 = reopened.bloom.as_ref().unwrap();
    assert_eq!(bloom_1.k, bloom_2.k);
    assert_eq!(bloom_1.filter, bloom_2.filter);
}

/// With key prefix compression the table needs no more than the expected
/// number of 128-byte blocks.
#[test]
fn test_task3_block_key_compression() {
    let mut builder = SsTableBuilder::new(128);
    for idx in 0..num_of_keys() {
        let (key, value) = (key_of(idx), value_of(idx));
        builder.add(KeySlice::for_testing_from_slice_no_ts(&key[..]), &value[..]);
    }
    let dir = tempdir().unwrap();
    let path = dir.path().join("1.sst");
    let sst = builder.build_for_test(path).unwrap();
    let blocks = sst.block_meta.len();
    // The bound is larger when timestamps are enabled.
    if TS_ENABLED {
        assert!(blocks <= 34, "you have {} blocks, expect 34", blocks);
    } else {
        assert!(blocks <= 25, "you have {} blocks, expect 25", blocks);
    }
}
26 | data: Vec, 27 | /// The expected block size. 28 | block_size: usize, 29 | /// The first key in the block 30 | first_key: KeyVec, 31 | } 32 | 33 | fn compute_overlap(first_key: KeySlice, key: KeySlice) -> usize { 34 | let mut i = 0; 35 | loop { 36 | if i >= first_key.key_len() || i >= key.key_len() { 37 | break; 38 | } 39 | if first_key.key_ref()[i] != key.key_ref()[i] { 40 | break; 41 | } 42 | i += 1; 43 | } 44 | i 45 | } 46 | 47 | impl BlockBuilder { 48 | /// Creates a new block builder. 49 | pub fn new(block_size: usize) -> Self { 50 | Self { 51 | offsets: Vec::new(), 52 | data: Vec::new(), 53 | block_size, 54 | first_key: KeyVec::new(), 55 | } 56 | } 57 | 58 | fn estimated_size(&self) -> usize { 59 | SIZEOF_U16 /* number of key-value pairs in the block */ + self.offsets.len() * SIZEOF_U16 /* offsets */ + self.data.len() 60 | // key-value pairs 61 | } 62 | 63 | /// Adds a key-value pair to the block. Returns false when the block is full. 64 | #[must_use] 65 | pub fn add(&mut self, key: KeySlice, value: &[u8]) -> bool { 66 | assert!(!key.is_empty(), "key must not be empty"); 67 | if self.estimated_size() + key.raw_len() + value.len() + SIZEOF_U16 * 3 /* key_len, value_len and offset */ > self.block_size 68 | && !self.is_empty() 69 | { 70 | return false; 71 | } 72 | // Add the offset of the data into the offset array. 73 | self.offsets.push(self.data.len() as u16); 74 | let overlap = compute_overlap(self.first_key.as_key_slice(), key); 75 | // Encode key overlap. 76 | self.data.put_u16(overlap as u16); 77 | // Encode key length. 78 | self.data.put_u16((key.key_len() - overlap) as u16); 79 | // Encode key content. 80 | self.data.put(&key.key_ref()[overlap..]); 81 | // Encode key ts 82 | self.data.put_u64(key.ts()); 83 | // Encode value length. 84 | self.data.put_u16(value.len() as u16); 85 | // Encode value content. 
86 | self.data.put(value); 87 | 88 | if self.first_key.is_empty() { 89 | self.first_key = key.to_key_vec(); 90 | } 91 | 92 | true 93 | } 94 | 95 | /// Check if there are no key-value pairs in the block. 96 | pub fn is_empty(&self) -> bool { 97 | self.offsets.is_empty() 98 | } 99 | 100 | /// Finalize the block. 101 | pub fn build(self) -> Block { 102 | if self.is_empty() { 103 | panic!("block should not be empty"); 104 | } 105 | Block { 106 | data: self.data, 107 | offsets: self.offsets, 108 | } 109 | } 110 | } 111 | -------------------------------------------------------------------------------- /mini-lsm-starter/src/table/bloom.rs: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2022-2025 Alex Chi Z 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | // Copyright 2021 TiKV Project Authors. Licensed under Apache-2.0. 
16 | 17 | use anyhow::Result; 18 | use bytes::{BufMut, Bytes, BytesMut}; 19 | 20 | /// Implements a bloom filter 21 | pub struct Bloom { 22 | /// data of filter in bits 23 | pub(crate) filter: Bytes, 24 | /// number of hash functions 25 | pub(crate) k: u8, 26 | } 27 | 28 | pub trait BitSlice { 29 | fn get_bit(&self, idx: usize) -> bool; 30 | fn bit_len(&self) -> usize; 31 | } 32 | 33 | pub trait BitSliceMut { 34 | fn set_bit(&mut self, idx: usize, val: bool); 35 | } 36 | 37 | impl> BitSlice for T { 38 | fn get_bit(&self, idx: usize) -> bool { 39 | let pos = idx / 8; 40 | let offset = idx % 8; 41 | (self.as_ref()[pos] & (1 << offset)) != 0 42 | } 43 | 44 | fn bit_len(&self) -> usize { 45 | self.as_ref().len() * 8 46 | } 47 | } 48 | 49 | impl> BitSliceMut for T { 50 | fn set_bit(&mut self, idx: usize, val: bool) { 51 | let pos = idx / 8; 52 | let offset = idx % 8; 53 | if val { 54 | self.as_mut()[pos] |= 1 << offset; 55 | } else { 56 | self.as_mut()[pos] &= !(1 << offset); 57 | } 58 | } 59 | } 60 | 61 | impl Bloom { 62 | /// Decode a bloom filter 63 | pub fn decode(buf: &[u8]) -> Result { 64 | let filter = &buf[..buf.len() - 1]; 65 | let k = buf[buf.len() - 1]; 66 | Ok(Self { 67 | filter: filter.to_vec().into(), 68 | k, 69 | }) 70 | } 71 | 72 | /// Encode a bloom filter 73 | pub fn encode(&self, buf: &mut Vec) { 74 | buf.extend(&self.filter); 75 | buf.put_u8(self.k); 76 | } 77 | 78 | /// Get bloom filter bits per key from entries count and FPR 79 | pub fn bloom_bits_per_key(entries: usize, false_positive_rate: f64) -> usize { 80 | let size = 81 | -1.0 * (entries as f64) * false_positive_rate.ln() / std::f64::consts::LN_2.powi(2); 82 | let locs = (size / (entries as f64)).ceil(); 83 | locs as usize 84 | } 85 | 86 | /// Build bloom filter from key hashes 87 | pub fn build_from_key_hashes(keys: &[u32], bits_per_key: usize) -> Self { 88 | let k = (bits_per_key as f64 * 0.69) as u32; 89 | let k = k.clamp(1, 30); 90 | let nbits = (keys.len() * bits_per_key).max(64); 91 | 
let nbytes = (nbits + 7) / 8; 92 | let nbits = nbytes * 8; 93 | let mut filter = BytesMut::with_capacity(nbytes); 94 | filter.resize(nbytes, 0); 95 | 96 | // TODO: build the bloom filter 97 | 98 | Self { 99 | filter: filter.freeze(), 100 | k: k as u8, 101 | } 102 | } 103 | 104 | /// Check if a bloom filter may contain some data 105 | pub fn may_contain(&self, h: u32) -> bool { 106 | if self.k > 30 { 107 | // potential new encoding for short bloom filters 108 | true 109 | } else { 110 | let nbits = self.filter.bit_len(); 111 | let delta = h.rotate_left(15); 112 | 113 | // TODO: probe the bloom filter 114 | 115 | true 116 | } 117 | } 118 | } 119 | -------------------------------------------------------------------------------- /mini-lsm-book/src/week3-04-watermark.md: -------------------------------------------------------------------------------- 1 | 4 | 5 | # Watermark and Garbage Collection 6 | 7 | In this chapter, you will implement necessary structures to track the lowest read timestamp being used by the user, and collect unused versions from SSTs when doing the compaction. 8 | 9 | To run test cases, 10 | 11 | ``` 12 | cargo x copy-test --week 3 --day 4 13 | cargo x scheck 14 | ``` 15 | 16 | ## Task 1: Implement Watermark 17 | 18 | In this task, you will need to modify: 19 | 20 | ``` 21 | src/mvcc/watermark.rs 22 | ``` 23 | 24 | Watermark is the structure to track the lowest `read_ts` in the system. When a new transaction is created, it should call `add_reader` to add its read timestamp for tracking. When a transaction aborts or commits, it should remove itself from the watermark. The watermark structures returns the lowest `read_ts` in the system when `watermark()` is called. If there are no ongoing transactions, it simply returns `None`. 25 | 26 | You may implement watermark using a `BTreeMap`. It maintains a counter that how many snapshots are using this read timestamp for each `read_ts`. You should not have entries with 0 readers in the b-tree map. 
27 | 28 | ## Task 2: Maintain Watermark in Transactions 29 | 30 | In this task, you will need to modify: 31 | 32 | ``` 33 | src/mvcc/txn.rs 34 | src/mvcc.rs 35 | ``` 36 | 37 | You will need to add the `read_ts` to the watermark when a transaction starts, and remove it when `drop` is called for the transaction. 38 | 39 | ## Task 3: Garbage Collection in Compaction 40 | 41 | In this task, you will need to modify: 42 | 43 | ``` 44 | src/compact.rs 45 | ``` 46 | 47 | Now that we have a watermark for the system, we can clean up unused versions during the compaction process. 48 | 49 | * If a version of a key is above watermark, keep it. 50 | * For all versions of a key below or equal to the watermark, keep the latest version. 51 | 52 | For example, if we have watermark=3 and the following data: 53 | 54 | ``` 55 | a@4=del <- above watermark 56 | a@3=3 <- latest version below or equal to watermark 57 | a@2=2 <- can be removed, no one will read it 58 | a@1=1 <- can be removed, no one will read it 59 | b@1=1 <- latest version below or equal to watermark 60 | c@4=4 <- above watermark 61 | d@3=del <- can be removed if compacting to bottom-most level 62 | d@2=2 <- can be removed 63 | ``` 64 | 65 | If we do a compaction over these keys, we will get: 66 | 67 | ``` 68 | a@4=del 69 | a@3=3 70 | b@1=1 71 | c@4=4 72 | d@3=del (can be removed if compacting to bottom-most level) 73 | ``` 74 | 75 | Assume these are all keys in the engine. If we do a scan at ts=3, we will get `a=3,b=1` before/after compaction (`c@4=4` is not yet visible at ts=3, and `d` is deleted as of ts=3). If we do a scan at ts=4, we will get `b=1,c=4` before/after compaction. Compaction *will not* and *should not* affect transactions with read timestamp >= watermark. 76 | 77 | ## Test Your Understanding 78 | 79 | * In our implementation, we manage watermarks by ourselves with the lifecycle of `Transaction` (so-called un-managed mode).
If the user intends to manage key timestamps and the watermarks by themselves (i.e., when they have their own timestamp generator), what do you need to do in the write_batch/get/scan API to validate their requests? Is there any architectural assumption we made that might be hard to maintain in this case? 80 | * Why do we need to store an `Arc` of `Transaction` inside a transaction iterator? 81 | * What is the condition to fully remove a key from the SST file? 82 | * For now, we only remove a key when compacting to the bottom-most level. Is there any earlier point at which we can remove the key? (Hint: you know the start/end key of each SST in all levels.) 83 | * Consider the case where the user creates a long-running transaction and we cannot garbage collect anything. The user keeps updating a single key. Eventually, there could be a key with thousands of versions in a single SST file. How would it affect performance, and how would you deal with it? 84 | 85 | ## Bonus Tasks 86 | 87 | * **O(1) Watermark.** You may implement an amortized O(1) watermark structure by using a hash map or a cyclic queue. 88 | 89 | {{#include copyright.md}} 90 | -------------------------------------------------------------------------------- /mini-lsm/src/wal.rs: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2022-2025 Alex Chi Z 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | 15 | use std::fs::{File, OpenOptions}; 16 | use std::hash::Hasher; 17 | use std::io::{BufWriter, Read, Write}; 18 | use std::path::Path; 19 | use std::sync::Arc; 20 | 21 | use anyhow::{Context, Result, bail}; 22 | use bytes::{Buf, BufMut, Bytes}; 23 | use crossbeam_skiplist::SkipMap; 24 | use parking_lot::Mutex; 25 | 26 | use crate::key::KeySlice; 27 | 28 | pub struct Wal { 29 | file: Arc>>, 30 | } 31 | 32 | impl Wal { 33 | pub fn create(path: impl AsRef) -> Result { 34 | Ok(Self { 35 | file: Arc::new(Mutex::new(BufWriter::new( 36 | OpenOptions::new() 37 | .read(true) 38 | .create_new(true) 39 | .write(true) 40 | .open(path) 41 | .context("failed to create WAL")?, 42 | ))), 43 | }) 44 | } 45 | 46 | pub fn recover(path: impl AsRef, skiplist: &SkipMap) -> Result { 47 | let path = path.as_ref(); 48 | let mut file = OpenOptions::new() 49 | .read(true) 50 | .append(true) 51 | .open(path) 52 | .context("failed to recover from WAL")?; 53 | let mut buf = Vec::new(); 54 | file.read_to_end(&mut buf)?; 55 | let mut rbuf: &[u8] = buf.as_slice(); 56 | while rbuf.has_remaining() { 57 | let mut hasher = crc32fast::Hasher::new(); 58 | let key_len = rbuf.get_u16() as usize; 59 | hasher.write_u16(key_len as u16); 60 | let key = Bytes::copy_from_slice(&rbuf[..key_len]); 61 | hasher.write(&key); 62 | rbuf.advance(key_len); 63 | let value_len = rbuf.get_u16() as usize; 64 | hasher.write_u16(value_len as u16); 65 | let value = Bytes::copy_from_slice(&rbuf[..value_len]); 66 | hasher.write(&value); 67 | rbuf.advance(value_len); 68 | let checksum = rbuf.get_u32(); 69 | if hasher.finalize() != checksum { 70 | bail!("checksum mismatch"); 71 | } 72 | skiplist.insert(key, value); 73 | } 74 | Ok(Self { 75 | file: Arc::new(Mutex::new(BufWriter::new(file))), 76 | }) 77 | } 78 | 79 | pub fn put(&self, key: &[u8], value: &[u8]) -> Result<()> { 80 | let mut file = self.file.lock(); 81 | let mut buf: Vec = 82 | Vec::with_capacity(key.len() + value.len() + std::mem::size_of::()); 83 | let mut 
hasher = crc32fast::Hasher::new(); 84 | hasher.write_u16(key.len() as u16); 85 | buf.put_u16(key.len() as u16); 86 | hasher.write(key); 87 | buf.put_slice(key); 88 | hasher.write_u16(value.len() as u16); 89 | buf.put_u16(value.len() as u16); 90 | buf.put_slice(value); 91 | hasher.write(value); 92 | // add checksum: week 2 day 7 93 | buf.put_u32(hasher.finalize()); 94 | file.write_all(&buf)?; 95 | Ok(()) 96 | } 97 | 98 | /// Implement this in week 3, day 5; if you want to implement this earlier, use `&[u8]` as the key type. 99 | pub fn put_batch(&self, _data: &[(KeySlice, &[u8])]) -> Result<()> { 100 | unimplemented!() 101 | } 102 | 103 | pub fn sync(&self) -> Result<()> { 104 | let mut file = self.file.lock(); 105 | file.flush()?; 106 | file.get_mut().sync_all()?; 107 | Ok(()) 108 | } 109 | } 110 | -------------------------------------------------------------------------------- /mini-lsm/src/table/iterator.rs: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2022-2025 Alex Chi Z 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | use std::sync::Arc; 16 | 17 | use anyhow::Result; 18 | 19 | use super::SsTable; 20 | use crate::block::BlockIterator; 21 | use crate::iterators::StorageIterator; 22 | use crate::key::KeySlice; 23 | 24 | /// An iterator over the contents of an SSTable. 
25 | pub struct SsTableIterator { 26 | table: Arc, 27 | blk_iter: BlockIterator, 28 | blk_idx: usize, 29 | } 30 | 31 | impl SsTableIterator { 32 | fn seek_to_first_inner(table: &Arc) -> Result<(usize, BlockIterator)> { 33 | Ok(( 34 | 0, 35 | BlockIterator::create_and_seek_to_first(table.read_block_cached(0)?), 36 | )) 37 | } 38 | 39 | /// Create a new iterator and seek to the first key-value pair. 40 | pub fn create_and_seek_to_first(table: Arc) -> Result { 41 | let (blk_idx, blk_iter) = Self::seek_to_first_inner(&table)?; 42 | let iter = Self { 43 | blk_iter, 44 | table, 45 | blk_idx, 46 | }; 47 | Ok(iter) 48 | } 49 | 50 | /// Seek to the first key-value pair. 51 | pub fn seek_to_first(&mut self) -> Result<()> { 52 | let (blk_idx, blk_iter) = Self::seek_to_first_inner(&self.table)?; 53 | self.blk_idx = blk_idx; 54 | self.blk_iter = blk_iter; 55 | Ok(()) 56 | } 57 | 58 | fn seek_to_key_inner(table: &Arc, key: KeySlice) -> Result<(usize, BlockIterator)> { 59 | let mut blk_idx = table.find_block_idx(key); 60 | let mut blk_iter = 61 | BlockIterator::create_and_seek_to_key(table.read_block_cached(blk_idx)?, key); 62 | if !blk_iter.is_valid() { 63 | blk_idx += 1; 64 | if blk_idx < table.num_of_blocks() { 65 | blk_iter = 66 | BlockIterator::create_and_seek_to_first(table.read_block_cached(blk_idx)?); 67 | } 68 | } 69 | Ok((blk_idx, blk_iter)) 70 | } 71 | 72 | /// Create a new iterator and seek to the first key-value pair which >= `key`. 73 | pub fn create_and_seek_to_key(table: Arc, key: KeySlice) -> Result { 74 | let (blk_idx, blk_iter) = Self::seek_to_key_inner(&table, key)?; 75 | let iter = Self { 76 | blk_iter, 77 | table, 78 | blk_idx, 79 | }; 80 | Ok(iter) 81 | } 82 | 83 | /// Seek to the first key-value pair which >= `key`. 
84 | pub fn seek_to_key(&mut self, key: KeySlice) -> Result<()> { 85 | let (blk_idx, blk_iter) = Self::seek_to_key_inner(&self.table, key)?; 86 | self.blk_iter = blk_iter; 87 | self.blk_idx = blk_idx; 88 | Ok(()) 89 | } 90 | } 91 | 92 | impl StorageIterator for SsTableIterator { 93 | type KeyType<'a> = KeySlice<'a>; 94 | 95 | fn value(&self) -> &[u8] { 96 | self.blk_iter.value() 97 | } 98 | 99 | fn key(&self) -> KeySlice { 100 | self.blk_iter.key() 101 | } 102 | 103 | fn is_valid(&self) -> bool { 104 | self.blk_iter.is_valid() 105 | } 106 | 107 | fn next(&mut self) -> Result<()> { 108 | self.blk_iter.next(); 109 | if !self.blk_iter.is_valid() { 110 | self.blk_idx += 1; 111 | if self.blk_idx < self.table.num_of_blocks() { 112 | self.blk_iter = BlockIterator::create_and_seek_to_first( 113 | self.table.read_block_cached(self.blk_idx)?, 114 | ); 115 | } 116 | } 117 | Ok(()) 118 | } 119 | } 120 | -------------------------------------------------------------------------------- /mini-lsm-mvcc/src/table/iterator.rs: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2022-2025 Alex Chi Z 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | 15 | use std::sync::Arc; 16 | 17 | use anyhow::Result; 18 | 19 | use super::SsTable; 20 | use crate::block::BlockIterator; 21 | use crate::iterators::StorageIterator; 22 | use crate::key::KeySlice; 23 | 24 | /// An iterator over the contents of an SSTable. 25 | pub struct SsTableIterator { 26 | table: Arc, 27 | blk_iter: BlockIterator, 28 | blk_idx: usize, 29 | } 30 | 31 | impl SsTableIterator { 32 | fn seek_to_first_inner(table: &Arc) -> Result<(usize, BlockIterator)> { 33 | Ok(( 34 | 0, 35 | BlockIterator::create_and_seek_to_first(table.read_block_cached(0)?), 36 | )) 37 | } 38 | 39 | /// Create a new iterator and seek to the first key-value pair. 40 | pub fn create_and_seek_to_first(table: Arc) -> Result { 41 | let (blk_idx, blk_iter) = Self::seek_to_first_inner(&table)?; 42 | let iter = Self { 43 | blk_iter, 44 | table, 45 | blk_idx, 46 | }; 47 | Ok(iter) 48 | } 49 | 50 | /// Seek to the first key-value pair. 51 | pub fn seek_to_first(&mut self) -> Result<()> { 52 | let (blk_idx, blk_iter) = Self::seek_to_first_inner(&self.table)?; 53 | self.blk_idx = blk_idx; 54 | self.blk_iter = blk_iter; 55 | Ok(()) 56 | } 57 | 58 | fn seek_to_key_inner(table: &Arc, key: KeySlice) -> Result<(usize, BlockIterator)> { 59 | let mut blk_idx = table.find_block_idx(key); 60 | let mut blk_iter = 61 | BlockIterator::create_and_seek_to_key(table.read_block_cached(blk_idx)?, key); 62 | if !blk_iter.is_valid() { 63 | blk_idx += 1; 64 | if blk_idx < table.num_of_blocks() { 65 | blk_iter = 66 | BlockIterator::create_and_seek_to_first(table.read_block_cached(blk_idx)?); 67 | } 68 | } 69 | Ok((blk_idx, blk_iter)) 70 | } 71 | 72 | /// Create a new iterator and seek to the first key-value pair which >= `key`. 
73 | pub fn create_and_seek_to_key(table: Arc, key: KeySlice) -> Result { 74 | let (blk_idx, blk_iter) = Self::seek_to_key_inner(&table, key)?; 75 | let iter = Self { 76 | blk_iter, 77 | table, 78 | blk_idx, 79 | }; 80 | Ok(iter) 81 | } 82 | 83 | /// Seek to the first key-value pair which >= `key`. 84 | pub fn seek_to_key(&mut self, key: KeySlice) -> Result<()> { 85 | let (blk_idx, blk_iter) = Self::seek_to_key_inner(&self.table, key)?; 86 | self.blk_iter = blk_iter; 87 | self.blk_idx = blk_idx; 88 | Ok(()) 89 | } 90 | } 91 | 92 | impl StorageIterator for SsTableIterator { 93 | type KeyType<'a> = KeySlice<'a>; 94 | 95 | fn value(&self) -> &[u8] { 96 | self.blk_iter.value() 97 | } 98 | 99 | fn key(&self) -> KeySlice { 100 | self.blk_iter.key() 101 | } 102 | 103 | fn is_valid(&self) -> bool { 104 | self.blk_iter.is_valid() 105 | } 106 | 107 | fn next(&mut self) -> Result<()> { 108 | self.blk_iter.next(); 109 | if !self.blk_iter.is_valid() { 110 | self.blk_idx += 1; 111 | if self.blk_idx < self.table.num_of_blocks() { 112 | self.blk_iter = BlockIterator::create_and_seek_to_first( 113 | self.table.read_block_cached(self.blk_idx)?, 114 | ); 115 | } 116 | } 117 | Ok(()) 118 | } 119 | } 120 | -------------------------------------------------------------------------------- /mini-lsm-book/src/week2-06-wal.md: -------------------------------------------------------------------------------- 1 | 4 | 5 | # Write-Ahead Log (WAL) 6 | 7 | ![Chapter Overview](./lsm-tutorial/week2-06-overview.svg) 8 | 9 | In this chapter, you will: 10 | 11 | * Implement encoding and decoding of the write-ahead log file. 12 | * Recover memtables from the WALs when the system restarts. 
13 | 14 | To copy the test cases into the starter code and run them, 15 | 16 | ``` 17 | cargo x copy-test --week 2 --day 6 18 | cargo x scheck 19 | ``` 20 | 21 | ## Task 1: WAL Encoding 22 | 23 | In this task, you will need to modify: 24 | 25 | ``` 26 | src/wal.rs 27 | ``` 28 | 29 | In the previous chapter, we implemented the manifest file, so that the LSM state can be persisted. And we implemented the `close` function to flush all memtables to SSTs before stopping the engine. Now, what if the system crashes (e.g., the machine is powered off)? We can log memtable modifications to WAL (write-ahead log), and recover WALs when restarting the database. WAL is only enabled when `self.options.enable_wal = true`. 30 | 31 | The WAL encoding is simply a list of key-value pairs. 32 | 33 | ``` 34 | | key_len | key | value_len | value | 35 | ``` 36 | 37 | You will also need to implement the `recover` function to read the WAL and recover the state of a memtable. 38 | 39 | Note that we are using a `BufWriter` for writing the WAL. Using a `BufWriter` can reduce the number of syscalls into the OS, so as to reduce the latency of the write path. The data is not guaranteed to be written to the disk when the user modifies a key. Instead, the engine only guarantees that the data is persisted when `sync` is called. To correctly persist the data to the disk, you will need to first flush the data from the buffer writer to the file object by calling `flush()`, and then do a fsync on the file by using `get_mut().sync_all()`. Note that you *only* need to fsync when the engine's `sync` gets called. You *do not* need to fsync every time data is written. 40 | 41 | ## Task 2: Integrate WALs 42 | 43 | In this task, you will need to modify: 44 | 45 | ``` 46 | src/mem_table.rs 47 | src/wal.rs 48 | src/lsm_storage.rs 49 | ``` 50 | 51 | `MemTable` has a WAL field. If the `wal` field is set to `Some(wal)`, you will need to append to the WAL when updating the memtable. 
In your LSM engine, you will need to create WALs if `enable_wal = true`. You will also need to update the manifest using the `ManifestRecord::NewMemtable` record when a new memtable is created. 52 | 53 | You can create a memtable with WAL by using the `create_with_wal` function. WAL should be written to `<memtable_id>.wal` in the storage directory. The memtable id should be the same as the SST id if this memtable gets flushed as an L0 SST. 54 | 55 | ## Task 3: Recover from the WALs 56 | 57 | In this task, you will need to modify: 58 | 59 | ``` 60 | src/lsm_storage.rs 61 | ``` 62 | 63 | If WAL is enabled, you will need to recover the memtables based on WALs when loading the database. You will also need to implement the `sync` function of the database. The basic guarantee of `sync` is that the engine is sure that the data is persisted to the disk (and will be recovered when it restarts). To achieve this, you can simply sync the WAL corresponding to the current memtable. 64 | 65 | ``` 66 | cargo run --bin mini-lsm-cli -- --enable-wal 67 | ``` 68 | 69 | Remember to recover the correct `next_sst_id` from the state, which should be `max{memtable id, sst id}` + 1. In your `close` function, you should not flush memtables to SSTs if `enable_wal` is set to true, as WAL itself provides persistence. You should wait for all compaction and flush threads to exit before closing the database. 70 | 71 | ## Test Your Understanding 72 | 73 | * When should you call `fsync` in your engine? What happens if you call `fsync` too often (i.e., on every put key request)? 74 | * How costly is the `fsync` operation in general on an SSD (solid state drive)? 75 | * When can you tell the user that their modifications (put/delete) have been persisted? 76 | * How can you handle corrupted data in WAL? 77 | * Is it possible to design an LSM engine without WAL (i.e., use L0 as WAL)? What will be the implications of this design? 
78 | 79 | We do not provide reference answers to the questions; feel free to discuss them in the Discord community. 80 | 81 | {{#include copyright.md}} 82 | -------------------------------------------------------------------------------- /mini-lsm-book/src/lsm-tutorial/week2-00-two-extremes-1.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | week2-00-two-extremes-1 13 | 14 | 15 | Layer 1 16 | 17 | 18 | 19 | 20 | SST 21 | 22 | 23 | 24 | 25 | L0 26 | 27 | 28 | 29 | 30 | 31 | 32 | SST 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | SST 43 | 44 | 45 | 46 | 47 | 48 | 49 | SST 50 | 51 | 52 | 53 | 54 | 55 | 56 | SST 57 | 58 | 59 | 60 | 61 | 62 | 63 | SST 64 | 65 | 66 | 67 | 68 | 69 | -------------------------------------------------------------------------------- /mini-lsm-starter/src/mvcc/txn.rs: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2022-2025 Alex Chi Z 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | 15 | #![allow(unused_variables)] // TODO(you): remove this lint after implementing this mod 16 | #![allow(dead_code)] // TODO(you): remove this lint after implementing this mod 17 | 18 | use std::{ 19 | collections::HashSet, 20 | ops::Bound, 21 | sync::{Arc, atomic::AtomicBool}, 22 | }; 23 | 24 | use anyhow::Result; 25 | use bytes::Bytes; 26 | use crossbeam_skiplist::SkipMap; 27 | use ouroboros::self_referencing; 28 | use parking_lot::Mutex; 29 | 30 | use crate::{ 31 | iterators::{StorageIterator, two_merge_iterator::TwoMergeIterator}, 32 | lsm_iterator::{FusedIterator, LsmIterator}, 33 | lsm_storage::LsmStorageInner, 34 | }; 35 | 36 | pub struct Transaction { 37 | pub(crate) read_ts: u64, 38 | pub(crate) inner: Arc, 39 | pub(crate) local_storage: Arc>, 40 | pub(crate) committed: Arc, 41 | /// Write set and read set 42 | pub(crate) key_hashes: Option, HashSet)>>, 43 | } 44 | 45 | impl Transaction { 46 | pub fn get(&self, key: &[u8]) -> Result> { 47 | unimplemented!() 48 | } 49 | 50 | pub fn scan(self: &Arc, lower: Bound<&[u8]>, upper: Bound<&[u8]>) -> Result { 51 | unimplemented!() 52 | } 53 | 54 | pub fn put(&self, key: &[u8], value: &[u8]) { 55 | unimplemented!() 56 | } 57 | 58 | pub fn delete(&self, key: &[u8]) { 59 | unimplemented!() 60 | } 61 | 62 | pub fn commit(&self) -> Result<()> { 63 | unimplemented!() 64 | } 65 | } 66 | 67 | impl Drop for Transaction { 68 | fn drop(&mut self) {} 69 | } 70 | 71 | type SkipMapRangeIter<'a> = 72 | crossbeam_skiplist::map::Range<'a, Bytes, (Bound, Bound), Bytes, Bytes>; 73 | 74 | #[self_referencing] 75 | pub struct TxnLocalIterator { 76 | /// Stores a reference to the skipmap. 77 | map: Arc>, 78 | /// Stores a skipmap iterator that refers to the lifetime of `TxnLocalIterator` itself. 79 | #[borrows(map)] 80 | #[not_covariant] 81 | iter: SkipMapRangeIter<'this>, 82 | /// Stores the current key-value pair. 
83 | item: (Bytes, Bytes), 84 | } 85 | 86 | impl StorageIterator for TxnLocalIterator { 87 | type KeyType<'a> = &'a [u8]; 88 | 89 | fn value(&self) -> &[u8] { 90 | unimplemented!() 91 | } 92 | 93 | fn key(&self) -> &[u8] { 94 | unimplemented!() 95 | } 96 | 97 | fn is_valid(&self) -> bool { 98 | unimplemented!() 99 | } 100 | 101 | fn next(&mut self) -> Result<()> { 102 | unimplemented!() 103 | } 104 | } 105 | 106 | pub struct TxnIterator { 107 | _txn: Arc, 108 | iter: TwoMergeIterator>, 109 | } 110 | 111 | impl TxnIterator { 112 | pub fn create( 113 | txn: Arc, 114 | iter: TwoMergeIterator>, 115 | ) -> Result { 116 | unimplemented!() 117 | } 118 | } 119 | 120 | impl StorageIterator for TxnIterator { 121 | type KeyType<'a> 122 | = &'a [u8] 123 | where 124 | Self: 'a; 125 | 126 | fn value(&self) -> &[u8] { 127 | self.iter.value() 128 | } 129 | 130 | fn key(&self) -> Self::KeyType<'_> { 131 | self.iter.key() 132 | } 133 | 134 | fn is_valid(&self) -> bool { 135 | self.iter.is_valid() 136 | } 137 | 138 | fn next(&mut self) -> Result<()> { 139 | unimplemented!() 140 | } 141 | 142 | fn num_active_iterators(&self) -> usize { 143 | self.iter.num_active_iterators() 144 | } 145 | } 146 | -------------------------------------------------------------------------------- /mini-lsm-book/src/lsm-tutorial/week1-01-frozen.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | week1-01-frozen 18 | 19 | 20 | Layer 1 21 | 22 | 23 | 24 | 25 | 26 | On Disk 27 | 28 | 29 | 30 | 31 | In Memory 32 | 33 | 34 | 35 | 36 | 37 | 38 | Current 39 | 40 | 41 | 42 | 43 | 44 | 45 | key + value 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | Frozen 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | Frozen 66 | 67 | 68 | 69 | 70 | 71 | -------------------------------------------------------------------------------- /mini-lsm/src/iterators/concat_iterator.rs: 
-------------------------------------------------------------------------------- 1 | // Copyright (c) 2022-2025 Alex Chi Z 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | use std::sync::Arc; 16 | 17 | use anyhow::Result; 18 | 19 | use crate::{ 20 | key::KeySlice, 21 | table::{SsTable, SsTableIterator}, 22 | }; 23 | 24 | use super::StorageIterator; 25 | 26 | /// Concat multiple iterators ordered in key order and their key ranges do not overlap. We do not want to create the 27 | /// iterators when initializing this iterator to reduce the overhead of seeking. 
28 | pub struct SstConcatIterator { 29 | current: Option, 30 | next_sst_idx: usize, 31 | sstables: Vec>, 32 | } 33 | 34 | impl SstConcatIterator { 35 | fn check_sst_valid(sstables: &[Arc]) { 36 | for sst in sstables { 37 | assert!(sst.first_key() <= sst.last_key()); 38 | } 39 | if !sstables.is_empty() { 40 | for i in 0..(sstables.len() - 1) { 41 | assert!(sstables[i].last_key() < sstables[i + 1].first_key()); 42 | } 43 | } 44 | } 45 | 46 | pub fn create_and_seek_to_first(sstables: Vec>) -> Result { 47 | Self::check_sst_valid(&sstables); 48 | if sstables.is_empty() { 49 | return Ok(Self { 50 | current: None, 51 | next_sst_idx: 0, 52 | sstables, 53 | }); 54 | } 55 | let mut iter = Self { 56 | current: Some(SsTableIterator::create_and_seek_to_first( 57 | sstables[0].clone(), 58 | )?), 59 | next_sst_idx: 1, 60 | sstables, 61 | }; 62 | iter.move_until_valid()?; 63 | Ok(iter) 64 | } 65 | 66 | pub fn create_and_seek_to_key(sstables: Vec>, key: KeySlice) -> Result { 67 | Self::check_sst_valid(&sstables); 68 | let idx: usize = sstables 69 | .partition_point(|table| table.first_key().as_key_slice() <= key) 70 | .saturating_sub(1); 71 | if idx >= sstables.len() { 72 | return Ok(Self { 73 | current: None, 74 | next_sst_idx: sstables.len(), 75 | sstables, 76 | }); 77 | } 78 | let mut iter = Self { 79 | current: Some(SsTableIterator::create_and_seek_to_key( 80 | sstables[idx].clone(), 81 | key, 82 | )?), 83 | next_sst_idx: idx + 1, 84 | sstables, 85 | }; 86 | iter.move_until_valid()?; 87 | Ok(iter) 88 | } 89 | 90 | fn move_until_valid(&mut self) -> Result<()> { 91 | while let Some(iter) = self.current.as_mut() { 92 | if iter.is_valid() { 93 | break; 94 | } 95 | if self.next_sst_idx >= self.sstables.len() { 96 | self.current = None; 97 | } else { 98 | self.current = Some(SsTableIterator::create_and_seek_to_first( 99 | self.sstables[self.next_sst_idx].clone(), 100 | )?); 101 | self.next_sst_idx += 1; 102 | } 103 | } 104 | Ok(()) 105 | } 106 | } 107 | 108 | impl StorageIterator for 
SstConcatIterator { 109 | type KeyType<'a> = KeySlice<'a>; 110 | 111 | fn key(&self) -> KeySlice { 112 | self.current.as_ref().unwrap().key() 113 | } 114 | 115 | fn value(&self) -> &[u8] { 116 | self.current.as_ref().unwrap().value() 117 | } 118 | 119 | fn is_valid(&self) -> bool { 120 | if let Some(current) = &self.current { 121 | assert!(current.is_valid()); 122 | true 123 | } else { 124 | false 125 | } 126 | } 127 | 128 | fn next(&mut self) -> Result<()> { 129 | self.current.as_mut().unwrap().next()?; 130 | self.move_until_valid()?; 131 | Ok(()) 132 | } 133 | 134 | fn num_active_iterators(&self) -> usize { 135 | 1 136 | } 137 | } 138 | -------------------------------------------------------------------------------- /mini-lsm-mvcc/src/iterators/concat_iterator.rs: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2022-2025 Alex Chi Z 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | use std::sync::Arc; 16 | 17 | use anyhow::Result; 18 | 19 | use crate::{ 20 | key::KeySlice, 21 | table::{SsTable, SsTableIterator}, 22 | }; 23 | 24 | use super::StorageIterator; 25 | 26 | /// Concat multiple iterators ordered in key order and their key ranges do not overlap. We do not want to create the 27 | /// iterators when initializing this iterator to reduce the overhead of seeking. 
28 | pub struct SstConcatIterator { 29 | current: Option, 30 | next_sst_idx: usize, 31 | sstables: Vec>, 32 | } 33 | 34 | impl SstConcatIterator { 35 | fn check_sst_valid(sstables: &[Arc]) { 36 | for sst in sstables { 37 | assert!(sst.first_key() <= sst.last_key()); 38 | } 39 | if !sstables.is_empty() { 40 | for i in 0..(sstables.len() - 1) { 41 | assert!(sstables[i].last_key() < sstables[i + 1].first_key()); 42 | } 43 | } 44 | } 45 | 46 | pub fn create_and_seek_to_first(sstables: Vec>) -> Result { 47 | Self::check_sst_valid(&sstables); 48 | if sstables.is_empty() { 49 | return Ok(Self { 50 | current: None, 51 | next_sst_idx: 0, 52 | sstables, 53 | }); 54 | } 55 | let mut iter = Self { 56 | current: Some(SsTableIterator::create_and_seek_to_first( 57 | sstables[0].clone(), 58 | )?), 59 | next_sst_idx: 1, 60 | sstables, 61 | }; 62 | iter.move_until_valid()?; 63 | Ok(iter) 64 | } 65 | 66 | pub fn create_and_seek_to_key(sstables: Vec>, key: KeySlice) -> Result { 67 | Self::check_sst_valid(&sstables); 68 | let idx: usize = sstables 69 | .partition_point(|table| table.first_key().as_key_slice() <= key) 70 | .saturating_sub(1); 71 | if idx >= sstables.len() { 72 | return Ok(Self { 73 | current: None, 74 | next_sst_idx: sstables.len(), 75 | sstables, 76 | }); 77 | } 78 | let mut iter = Self { 79 | current: Some(SsTableIterator::create_and_seek_to_key( 80 | sstables[idx].clone(), 81 | key, 82 | )?), 83 | next_sst_idx: idx + 1, 84 | sstables, 85 | }; 86 | iter.move_until_valid()?; 87 | Ok(iter) 88 | } 89 | 90 | fn move_until_valid(&mut self) -> Result<()> { 91 | while let Some(iter) = self.current.as_mut() { 92 | if iter.is_valid() { 93 | break; 94 | } 95 | if self.next_sst_idx >= self.sstables.len() { 96 | self.current = None; 97 | } else { 98 | self.current = Some(SsTableIterator::create_and_seek_to_first( 99 | self.sstables[self.next_sst_idx].clone(), 100 | )?); 101 | self.next_sst_idx += 1; 102 | } 103 | } 104 | Ok(()) 105 | } 106 | } 107 | 108 | impl StorageIterator for 
SstConcatIterator { 109 | type KeyType<'a> = KeySlice<'a>; 110 | 111 | fn key(&self) -> KeySlice { 112 | self.current.as_ref().unwrap().key() 113 | } 114 | 115 | fn value(&self) -> &[u8] { 116 | self.current.as_ref().unwrap().value() 117 | } 118 | 119 | fn is_valid(&self) -> bool { 120 | if let Some(current) = &self.current { 121 | assert!(current.is_valid()); 122 | true 123 | } else { 124 | false 125 | } 126 | } 127 | 128 | fn next(&mut self) -> Result<()> { 129 | self.current.as_mut().unwrap().next()?; 130 | self.move_until_valid()?; 131 | Ok(()) 132 | } 133 | 134 | fn num_active_iterators(&self) -> usize { 135 | 1 136 | } 137 | } 138 | -------------------------------------------------------------------------------- /mini-lsm/src/table/bloom.rs: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2022-2025 Alex Chi Z 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | // Copyright 2021 TiKV Project Authors. Licensed under Apache-2.0. 
16 | 17 | use anyhow::{Result, bail}; 18 | use bytes::{Buf, BufMut, Bytes, BytesMut}; 19 | 20 | /// Implements a bloom filter 21 | pub struct Bloom { 22 | /// data of filter in bits 23 | pub(crate) filter: Bytes, 24 | /// number of hash functions 25 | pub(crate) k: u8, 26 | } 27 | 28 | pub trait BitSlice { 29 | fn get_bit(&self, idx: usize) -> bool; 30 | fn bit_len(&self) -> usize; 31 | } 32 | 33 | pub trait BitSliceMut { 34 | fn set_bit(&mut self, idx: usize, val: bool); 35 | } 36 | 37 | impl> BitSlice for T { 38 | fn get_bit(&self, idx: usize) -> bool { 39 | let pos = idx / 8; 40 | let offset = idx % 8; 41 | (self.as_ref()[pos] & (1 << offset)) != 0 42 | } 43 | 44 | fn bit_len(&self) -> usize { 45 | self.as_ref().len() * 8 46 | } 47 | } 48 | 49 | impl> BitSliceMut for T { 50 | fn set_bit(&mut self, idx: usize, val: bool) { 51 | let pos = idx / 8; 52 | let offset = idx % 8; 53 | if val { 54 | self.as_mut()[pos] |= 1 << offset; 55 | } else { 56 | self.as_mut()[pos] &= !(1 << offset); 57 | } 58 | } 59 | } 60 | 61 | impl Bloom { 62 | /// Decode a bloom filter 63 | pub fn decode(buf: &[u8]) -> Result { 64 | let checksum = (&buf[buf.len() - 4..buf.len()]).get_u32(); 65 | if checksum != crc32fast::hash(&buf[..buf.len() - 4]) { 66 | bail!("checksum mismatched for bloom filters"); 67 | } 68 | let filter = &buf[..buf.len() - 5]; 69 | let k = buf[buf.len() - 5]; 70 | Ok(Self { 71 | filter: filter.to_vec().into(), 72 | k, 73 | }) 74 | } 75 | 76 | /// Encode a bloom filter 77 | pub fn encode(&self, buf: &mut Vec) { 78 | let offset = buf.len(); 79 | buf.extend(&self.filter); 80 | buf.put_u8(self.k); 81 | let checksum = crc32fast::hash(&buf[offset..]); 82 | buf.put_u32(checksum); 83 | } 84 | 85 | /// Get bloom filter bits per key from entries count and FPR 86 | pub fn bloom_bits_per_key(entries: usize, false_positive_rate: f64) -> usize { 87 | let size = 88 | -1.0 * (entries as f64) * false_positive_rate.ln() / std::f64::consts::LN_2.powi(2); 89 | let locs = (size / (entries as 
f64)).ceil(); 90 | locs as usize 91 | } 92 | 93 | /// Build bloom filter from key hashes 94 | pub fn build_from_key_hashes(keys: &[u32], bits_per_key: usize) -> Self { 95 | let k = (bits_per_key as f64 * 0.69) as u32; 96 | let k = k.clamp(1, 30); 97 | let nbits = (keys.len() * bits_per_key).max(64); 98 | let nbytes = (nbits + 7) / 8; 99 | let nbits = nbytes * 8; 100 | let mut filter = BytesMut::with_capacity(nbytes); 101 | filter.resize(nbytes, 0); 102 | for h in keys { 103 | let mut h = *h; 104 | let delta = h.rotate_left(15); 105 | for _ in 0..k { 106 | let bit_pos = (h as usize) % nbits; 107 | filter.set_bit(bit_pos, true); 108 | h = h.wrapping_add(delta); 109 | } 110 | } 111 | Self { 112 | filter: filter.freeze(), 113 | k: k as u8, 114 | } 115 | } 116 | 117 | /// Check if a bloom filter may contain some data 118 | pub fn may_contain(&self, mut h: u32) -> bool { 119 | if self.k > 30 { 120 | // potential new encoding for short bloom filters 121 | true 122 | } else { 123 | let nbits = self.filter.bit_len(); 124 | let delta = h.rotate_left(15); 125 | for _ in 0..self.k { 126 | let bit_pos = h % (nbits as u32); 127 | if !self.filter.get_bit(bit_pos as usize) { 128 | return false; 129 | } 130 | h = h.wrapping_add(delta); 131 | } 132 | true 133 | } 134 | } 135 | } 136 | -------------------------------------------------------------------------------- /mini-lsm-mvcc/src/table/bloom.rs: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2022-2025 Alex Chi Z 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 
5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | // Copyright 2021 TiKV Project Authors. Licensed under Apache-2.0. 16 | 17 | use anyhow::{Result, bail}; 18 | use bytes::{Buf, BufMut, Bytes, BytesMut}; 19 | 20 | /// Implements a bloom filter 21 | pub struct Bloom { 22 | /// data of filter in bits 23 | pub(crate) filter: Bytes, 24 | /// number of hash functions 25 | pub(crate) k: u8, 26 | } 27 | 28 | pub trait BitSlice { 29 | fn get_bit(&self, idx: usize) -> bool; 30 | fn bit_len(&self) -> usize; 31 | } 32 | 33 | pub trait BitSliceMut { 34 | fn set_bit(&mut self, idx: usize, val: bool); 35 | } 36 | 37 | impl> BitSlice for T { 38 | fn get_bit(&self, idx: usize) -> bool { 39 | let pos = idx / 8; 40 | let offset = idx % 8; 41 | (self.as_ref()[pos] & (1 << offset)) != 0 42 | } 43 | 44 | fn bit_len(&self) -> usize { 45 | self.as_ref().len() * 8 46 | } 47 | } 48 | 49 | impl> BitSliceMut for T { 50 | fn set_bit(&mut self, idx: usize, val: bool) { 51 | let pos = idx / 8; 52 | let offset = idx % 8; 53 | if val { 54 | self.as_mut()[pos] |= 1 << offset; 55 | } else { 56 | self.as_mut()[pos] &= !(1 << offset); 57 | } 58 | } 59 | } 60 | 61 | impl Bloom { 62 | /// Decode a bloom filter 63 | pub fn decode(buf: &[u8]) -> Result { 64 | let checksum = (&buf[buf.len() - 4..buf.len()]).get_u32(); 65 | if checksum != crc32fast::hash(&buf[..buf.len() - 4]) { 66 | bail!("checksum mismatched for bloom filters"); 67 | } 68 | let filter = &buf[..buf.len() - 5]; 69 | let k = buf[buf.len() - 5]; 70 | Ok(Self { 71 | filter: filter.to_vec().into(), 72 | k, 73 | }) 74 | } 75 | 76 | /// 
Encode a bloom filter 77 | pub fn encode(&self, buf: &mut Vec) { 78 | let offset = buf.len(); 79 | buf.extend(&self.filter); 80 | buf.put_u8(self.k); 81 | let checksum = crc32fast::hash(&buf[offset..]); 82 | buf.put_u32(checksum); 83 | } 84 | 85 | /// Get bloom filter bits per key from entries count and FPR 86 | pub fn bloom_bits_per_key(entries: usize, false_positive_rate: f64) -> usize { 87 | let size = 88 | -1.0 * (entries as f64) * false_positive_rate.ln() / std::f64::consts::LN_2.powi(2); 89 | let locs = (size / (entries as f64)).ceil(); 90 | locs as usize 91 | } 92 | 93 | /// Build bloom filter from key hashes 94 | pub fn build_from_key_hashes(keys: &[u32], bits_per_key: usize) -> Self { 95 | let k = (bits_per_key as f64 * 0.69) as u32; 96 | let k = k.clamp(1, 30); 97 | let nbits = (keys.len() * bits_per_key).max(64); 98 | let nbytes = (nbits + 7) / 8; 99 | let nbits = nbytes * 8; 100 | let mut filter = BytesMut::with_capacity(nbytes); 101 | filter.resize(nbytes, 0); 102 | for h in keys { 103 | let mut h = *h; 104 | let delta = h.rotate_left(15); 105 | for _ in 0..k { 106 | let bit_pos = (h as usize) % nbits; 107 | filter.set_bit(bit_pos, true); 108 | h = h.wrapping_add(delta); 109 | } 110 | } 111 | Self { 112 | filter: filter.freeze(), 113 | k: k as u8, 114 | } 115 | } 116 | 117 | /// Check if a bloom filter may contain some data 118 | pub fn may_contain(&self, mut h: u32) -> bool { 119 | if self.k > 30 { 120 | // potential new encoding for short bloom filters 121 | true 122 | } else { 123 | let nbits = self.filter.bit_len(); 124 | let delta = h.rotate_left(15); 125 | for _ in 0..self.k { 126 | let bit_pos = h % (nbits as u32); 127 | if !self.filter.get_bit(bit_pos as usize) { 128 | return false; 129 | } 130 | h = h.wrapping_add(delta); 131 | } 132 | true 133 | } 134 | } 135 | } 136 | -------------------------------------------------------------------------------- /mini-lsm/src/table/builder.rs: 
-------------------------------------------------------------------------------- 1 | // Copyright (c) 2022-2025 Alex Chi Z 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | use std::path::Path; 16 | use std::sync::Arc; 17 | 18 | use anyhow::Result; 19 | use bytes::BufMut; 20 | 21 | use super::bloom::Bloom; 22 | use super::{BlockMeta, FileObject, SsTable}; 23 | use crate::block::BlockBuilder; 24 | use crate::key::{KeySlice, KeyVec}; 25 | use crate::lsm_storage::BlockCache; 26 | 27 | /// Builds an SSTable from key-value pairs. 28 | pub struct SsTableBuilder { 29 | builder: BlockBuilder, 30 | first_key: KeyVec, 31 | last_key: KeyVec, 32 | data: Vec, 33 | pub(crate) meta: Vec, 34 | block_size: usize, 35 | key_hashes: Vec, 36 | } 37 | 38 | impl SsTableBuilder { 39 | /// Create a builder based on target block size. 
40 | pub fn new(block_size: usize) -> Self { 41 | Self { 42 | data: Vec::new(), 43 | meta: Vec::new(), 44 | first_key: KeyVec::new(), 45 | last_key: KeyVec::new(), 46 | block_size, 47 | builder: BlockBuilder::new(block_size), 48 | key_hashes: Vec::new(), 49 | } 50 | } 51 | 52 | /// Adds a key-value pair to SSTable 53 | pub fn add(&mut self, key: KeySlice, value: &[u8]) { 54 | if self.first_key.is_empty() { 55 | self.first_key.set_from_slice(key); 56 | } 57 | 58 | self.key_hashes.push(farmhash::fingerprint32(key.raw_ref())); 59 | 60 | if self.builder.add(key, value) { 61 | self.last_key.set_from_slice(key); 62 | return; 63 | } 64 | 65 | // create a new block builder and append block data 66 | self.finish_block(); 67 | 68 | // add the key-value pair to the next block 69 | assert!(self.builder.add(key, value)); 70 | self.first_key.set_from_slice(key); 71 | self.last_key.set_from_slice(key); 72 | } 73 | 74 | /// Get the estimated size of the SSTable. 75 | pub fn estimated_size(&self) -> usize { 76 | self.data.len() 77 | } 78 | 79 | fn finish_block(&mut self) { 80 | let builder = std::mem::replace(&mut self.builder, BlockBuilder::new(self.block_size)); 81 | let encoded_block = builder.build().encode(); 82 | self.meta.push(BlockMeta { 83 | offset: self.data.len(), 84 | first_key: std::mem::take(&mut self.first_key).into_key_bytes(), 85 | last_key: std::mem::take(&mut self.last_key).into_key_bytes(), 86 | }); 87 | let checksum = crc32fast::hash(&encoded_block); 88 | self.data.extend(encoded_block); 89 | self.data.put_u32(checksum); 90 | } 91 | 92 | /// Builds the SSTable and writes it to the given path. Use the `FileObject` structure to manipulate the disk objects. 
93 | pub fn build( 94 | mut self, 95 | id: usize, 96 | block_cache: Option>, 97 | path: impl AsRef, 98 | ) -> Result { 99 | self.finish_block(); 100 | let mut buf = self.data; 101 | let meta_offset = buf.len(); 102 | BlockMeta::encode_block_meta(&self.meta, &mut buf); 103 | buf.put_u32(meta_offset as u32); 104 | let bloom = Bloom::build_from_key_hashes( 105 | &self.key_hashes, 106 | Bloom::bloom_bits_per_key(self.key_hashes.len(), 0.01), 107 | ); 108 | let bloom_offset = buf.len(); 109 | bloom.encode(&mut buf); 110 | buf.put_u32(bloom_offset as u32); 111 | let file = FileObject::create(path.as_ref(), buf)?; 112 | Ok(SsTable { 113 | id, 114 | file, 115 | first_key: self.meta.first().unwrap().first_key.clone(), 116 | last_key: self.meta.last().unwrap().last_key.clone(), 117 | block_meta: self.meta, 118 | block_meta_offset: meta_offset, 119 | block_cache, 120 | bloom: Some(bloom), 121 | max_ts: 0, // will be changed to latest ts in week 2 122 | }) 123 | } 124 | 125 | #[cfg(test)] 126 | pub(crate) fn build_for_test(self, path: impl AsRef) -> Result { 127 | self.build(0, None, path) 128 | } 129 | } 130 | -------------------------------------------------------------------------------- /mini-lsm-book/src/lsm-tutorial/week1-01-overview.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | week1-01-overview 13 | 14 | 15 | Layer 1 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | On Disk 26 | 27 | 28 | 29 | 30 | In Memory 31 | 32 | 33 | 34 | 35 | 36 | 37 | Mem 38 | Table 39 | 40 | 41 | 42 | 43 | 44 | 45 | Mem 46 | Table 47 | 48 | 49 | 50 | 51 | 52 | 53 | Mem 54 | Table 55 | 56 | 57 | 58 | 59 | 60 | 61 | key + value 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | Day 1: Memtables 70 | 71 | 72 | 73 | 74 | 75 | -------------------------------------------------------------------------------- /mini-lsm-book/src/lsm-tutorial/week1-02-overview.svg: 
-------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | week1-02-overview 13 | 14 | 15 | Layer 1 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | On Disk 26 | 27 | 28 | 29 | 30 | In Memory 31 | 32 | 33 | 34 | 35 | 36 | 37 | Mem 38 | Table 39 | 40 | 41 | 42 | 43 | 44 | 45 | Mem 46 | Table 47 | 48 | 49 | 50 | 51 | 52 | 53 | Mem 54 | Table 55 | 56 | 57 | 58 | 59 | 60 | 61 | key + value 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | Day 2: Merge Iterators 70 | 71 | 72 | 73 | 74 | 75 | --------------------------------------------------------------------------------