├── .cargo └── config.toml ├── .github ├── pull_request_template.md ├── semantic.yml ├── template │ ├── generate.sh │ ├── main-override.yml │ ├── pr-override.yml │ └── template.yml └── workflows │ ├── main.yml │ └── pull-request.yml ├── .gitignore ├── Cargo.lock ├── Cargo.toml ├── LICENSE ├── Makefile ├── Makefile.toml ├── README.md ├── bench ├── Cargo.toml ├── bench_kv │ └── main.rs └── etc │ ├── exhauster.toml │ ├── lsm_tree.toml │ ├── rudder.toml │ └── wheel.toml ├── client ├── Cargo.toml └── src │ ├── client.rs │ ├── error.rs │ ├── lib.rs │ ├── router.rs │ └── worker │ ├── heartbeater.rs │ └── mod.rs ├── codecov.yml ├── common ├── Cargo.toml ├── benches │ └── bench_sharded_hash_map.rs └── src │ ├── atomic.rs │ ├── channel_pool.rs │ ├── coding.rs │ ├── config.rs │ ├── context.rs │ ├── lib.rs │ ├── log.rs │ ├── notify_pool.rs │ ├── packer.rs │ ├── prometheus.rs │ ├── sharded_hash_map.rs │ ├── sync.rs │ ├── time.rs │ └── tracing_slog_drain.rs ├── etc ├── exhauster.toml ├── grafana-dashboards │ └── runkv-overview.json ├── grafana-provisioning │ ├── dashboards │ │ └── runkv-dashboards.yml │ └── datasources │ │ └── runkv-prometheus.yml ├── grafana.ini ├── lsm_tree.toml ├── prometheus.yml ├── rudder.toml └── wheel.toml ├── exhauster ├── Cargo.toml └── src │ ├── compaction_filter.rs │ ├── config.rs │ ├── error.rs │ ├── lib.rs │ ├── main.rs │ ├── partitioner.rs │ ├── service.rs │ └── worker │ ├── heartbeater.rs │ └── mod.rs ├── make ├── common.toml ├── grafana.toml ├── jaeger.toml ├── minio.toml └── prometheus.toml ├── proto ├── Cargo.toml ├── build.rs └── src │ ├── lib.rs │ └── proto │ ├── buf.yaml │ ├── common.proto │ ├── exhauster.proto │ ├── kv.proto │ ├── manifest.proto │ ├── meta.proto │ ├── prototool.yaml │ ├── rudder.proto │ └── wheel.proto ├── rudder ├── Cargo.toml └── src │ ├── config.rs │ ├── error.rs │ ├── lib.rs │ ├── main.rs │ ├── meta │ ├── mem.rs │ ├── mod.rs │ └── object.rs │ ├── service.rs │ └── worker │ ├── compaction_detector.rs │ └── mod.rs ├── run ├── rust-toolchain ├── rustfmt.toml ├── storage ├── Cargo.toml ├── bench │ ├── bench_raft_log_store │ │ └── main.rs │ └── file_cache_bench │ │ ├── README.md │ │ ├── analyze.rs │ │ ├── bench.rs │ │ ├── bpf.rs │ │ ├── main.rs │ │ ├── rate.rs │ │ └── utils.rs ├── benches │ ├── bench_block_iter.rs │ └── bench_compression.rs └── src │ ├── error.rs │ ├── lib.rs │ ├── lsm_tree │ ├── components │ │ ├── block.rs │ │ ├── block_cache.rs │ │ ├── memtable.rs │ │ ├── metrics.rs │ │ ├── mod.rs │ │ ├── skiplist │ │ │ ├── arena.rs │ │ │ ├── key.rs │ │ │ ├── list.rs │ │ │ └── mod.rs │ │ ├── sstable.rs │ │ └── sstable_store.rs │ ├── iterator │ │ ├── block_iterator.rs │ │ ├── concat_iterator.rs │ │ ├── memtable_iterator.rs │ │ ├── merge_iterator.rs │ │ ├── mod.rs │ │ ├── sstable_iterator.rs │ │ └── user_key_iterator.rs │ ├── manifest │ │ ├── error.rs │ │ ├── mod.rs │ │ └── version.rs │ └── mod.rs │ ├── object_store │ ├── mem.rs │ ├── mod.rs │ └── s3.rs │ ├── raft_log_store │ ├── block_cache.rs │ ├── entry.rs │ ├── error.rs │ ├── file.rs │ ├── log.rs │ ├── mem.rs │ ├── metrics.rs │ ├── mod.rs │ ├── queue.rs │ └── store.rs │ ├── tiered_cache │ ├── file_cache │ │ ├── alloc.rs │ │ ├── buffer.rs │ │ ├── cache.rs │ │ ├── error.rs │ │ ├── file.rs │ │ ├── meta.rs │ │ ├── metrics.rs │ │ ├── mod.rs │ │ ├── store.rs │ │ ├── test_utils.rs │ │ └── utils.rs │ └── mod.rs │ └── utils │ ├── bloom.rs │ ├── coding.rs │ ├── lru_cache.rs │ └── mod.rs ├── tests ├── Cargo.toml ├── etc │ ├── exhauster.toml │ ├── lsm_tree.toml │ ├── port.toml │ ├── rudder.toml │ └── 
wheel.toml ├── integrations │ ├── lib.rs │ ├── test_concurrent_put_get.rs │ └── test_multi_raft_group_concurrent_put_get.rs └── src │ └── lib.rs └── wheel ├── Cargo.toml └── src ├── components ├── command.rs ├── fsm.rs ├── lsm_tree.rs ├── mod.rs ├── raft_log_store.rs ├── raft_manager.rs ├── raft_network.rs └── read_only_cmd_pool.rs ├── config.rs ├── error.rs ├── lib.rs ├── main.rs ├── meta ├── mem.rs ├── mod.rs └── object.rs ├── service.rs ├── trace.rs └── worker ├── heartbeater.rs ├── mod.rs ├── raft.rs └── sstable_uploader.rs /.cargo/config.toml: -------------------------------------------------------------------------------- 1 | # Flags for all targets. 2 | [target.'cfg(all())'] 3 | rustflags = ["--cfg", "tokio_unstable"] 4 | -------------------------------------------------------------------------------- /.github/pull_request_template.md: -------------------------------------------------------------------------------- 1 | ## What changes were proposed in this pull request? 2 | 3 | (Please fill in changes proposed in this fix) 4 | 5 | ## Which issues is this PR related to? 6 | 7 | (Please list the issues related. Hint: use markdown list for better looking) 8 | 9 | ## How was this patch tested? 10 | 11 | (Please explain how this patch was tested. E.g. unit tests, integration tests) 12 | (If this patch involves UI changes, please attach a screen-shot; otherwise, remove this) 13 | 14 | ## Will this help MrCroxx run or graduate? 15 | 16 | (Run is OK, but better say graduate) 17 | -------------------------------------------------------------------------------- /.github/semantic.yml: -------------------------------------------------------------------------------- 1 | # Ref: https://github.com/zeke/semantic-pull-requests#configuration . 2 | titleAndCommits: true 3 | anyCommit: true 4 | types: 5 | - feat 6 | - fix 7 | - docs 8 | - style 9 | - refactor 10 | - perf 11 | - test 12 | - build 13 | - ci 14 | - chore 15 | - revert 16 | allowMergeCommits: true 17 | allowMergeCommits: true -------------------------------------------------------------------------------- /.github/template/generate.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e 4 | 5 | DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" 6 | cd "$DIR" 7 | 8 | # You will need to install yq >= 4.16 to use this tool. 9 | # brew install yq 10 | 11 | HEADER=""" 12 | # ================= THIS FILE IS AUTOMATICALLY GENERATED ================= 13 | # 14 | # Please run generate.sh and commit after editing the workflow templates. 15 | # 16 | # ======================================================================== 17 | """ 18 | 19 | # Generate workflow for main branch 20 | echo "$HEADER" > ../workflows/main.yml 21 | # shellcheck disable=SC2016 22 | yq ea '. as $item ireduce ({}; . * $item )' template.yml main-override.yml | yq eval '... comments=""' - >> ../workflows/main.yml 23 | echo "$HEADER" >> ../workflows/main.yml 24 | 25 | # Generate workflow for pull requests 26 | echo "$HEADER" > ../workflows/pull-request.yml 27 | # shellcheck disable=SC2016 28 | yq ea '. as $item ireduce ({}; . * $item )' template.yml pr-override.yml | yq eval '... comments=""' - >> ../workflows/pull-request.yml 29 | echo "$HEADER" >> ../workflows/pull-request.yml 30 | 31 | if [ "$1" == "--check" ] ; then 32 | if ! git diff --exit-code; then 33 | echo "Please run generate.sh and commit after editing the workflow templates." 
34 | exit 1 35 | fi 36 | fi 37 | -------------------------------------------------------------------------------- /.github/template/main-override.yml: -------------------------------------------------------------------------------- 1 | name: CI (main) 2 | 3 | on: 4 | push: 5 | branches: [main] 6 | workflow_dispatch: 7 | -------------------------------------------------------------------------------- /.github/template/pr-override.yml: -------------------------------------------------------------------------------- 1 | name: CI 2 | 3 | on: 4 | pull_request: 5 | branches: [main] 6 | 7 | concurrency: 8 | group: environment-${{ github.ref }} 9 | cancel-in-progress: true 10 | -------------------------------------------------------------------------------- /.github/template/template.yml: -------------------------------------------------------------------------------- 1 | name: 2 | 3 | on: 4 | 5 | env: 6 | RUST_TOOLCHAIN: nightly-2022-10-16 7 | CARGO_TERM_COLOR: always 8 | CACHE_KEY_SUFFIX: 20221107 9 | RUNKV_CI: true 10 | 11 | jobs: 12 | misc-check: 13 | name: misc check 14 | runs-on: ubuntu-latest 15 | steps: 16 | - name: Checkout 17 | uses: actions/checkout@v3 18 | - name: Install tools 19 | run: | 20 | wget https://github.com/mikefarah/yq/releases/download/${YQ_VERSION}/${BINARY}.tar.gz -O - | tar xz && sudo mv ${BINARY} /usr/bin/yq 21 | sudo apt install -y protobuf-compiler -o Acquire::Retries=3 22 | curl -sSL \ 23 | https://github.com/uber/prototool/releases/download/v1.8.0/prototool-$(uname -s)-$(uname -m).tar.gz | \ 24 | sudo tar -C /usr/local --strip-components 1 -xz 25 | curl -sSL \ 26 | "https://github.com/bufbuild/buf/releases/download/v${BUF_VERSION}/buf-$(uname -s)-$(uname -m).tar.gz" | \ 27 | sudo tar -xvzf - -C /usr/local --strip-components 1 28 | env: 29 | YQ_VERSION: v4.16.1 30 | BINARY: yq_linux_amd64 31 | BUF_VERSION: 1.0.0-rc6 32 | - name: Check if CI workflows are up-to-date 33 | run: | 34 | ./.github/template/generate.sh --check 35 | - name: Run ShellCheck 36 | uses: ludeeus/action-shellcheck@master 37 | - name: Check protobuf style 38 | run: | 39 | cd proto/src/proto && prototool format -d && buf lint 40 | rust-test: 41 | name: rust test with codecov 42 | runs-on: ubuntu-latest 43 | steps: 44 | - name: Checkout 45 | uses: actions/checkout@v3 46 | - name: Install rust toolchain@v1 47 | uses: actions-rs/toolchain@v1 48 | with: 49 | toolchain: ${{ env.RUST_TOOLCHAIN }} 50 | components: rustfmt, clippy, llvm-tools-preview 51 | - name: Cache Cargo home 52 | uses: actions/cache@v2 53 | id: cache 54 | with: 55 | path: | 56 | ~/.cargo/bin/ 57 | ~/.cargo/registry/index/ 58 | ~/.cargo/registry/cache/ 59 | ~/.cargo/git/db/ 60 | key: ${{ runner.os }}-cargo-${{ hashFiles('**/Cargo.lock') }}-${{ env.CACHE_KEY_SUFFIX }} 61 | - name: Install cargo-sort 62 | if: steps.cache.outputs.cache-hit != 'true' 63 | run: | 64 | cargo install cargo-sort 65 | - name: Run rust cargo-sort check 66 | run: | 67 | cargo sort -w -c 68 | - name: Run rust format check 69 | run: | 70 | cargo fmt --all -- --check 71 | - name: Run rust clippy check 72 | run: | 73 | # If new CI checks are added, the one with `--locked` must be run first. 
74 | cargo clippy --all-targets --locked -- -D warnings 75 | - if: steps.cache.outputs.cache-hit != 'true' 76 | uses: taiki-e/install-action@cargo-llvm-cov 77 | - if: steps.cache.outputs.cache-hit != 'true' 78 | uses: taiki-e/install-action@nextest 79 | - name: Run rust test with coverage 80 | run: | 81 | cargo llvm-cov nextest --lcov --output-path lcov.info 82 | - uses: codecov/codecov-action@v2 83 | -------------------------------------------------------------------------------- /.github/workflows/main.yml: -------------------------------------------------------------------------------- 1 | 2 | # ================= THIS FILE IS AUTOMATICALLY GENERATED ================= 3 | # 4 | # Please run generate.sh and commit after editing the workflow templates. 5 | # 6 | # ======================================================================== 7 | 8 | name: CI (main) 9 | on: 10 | push: 11 | branches: [main] 12 | workflow_dispatch: 13 | env: 14 | RUST_TOOLCHAIN: nightly-2022-10-16 15 | CARGO_TERM_COLOR: always 16 | CACHE_KEY_SUFFIX: 20221107 17 | RUNKV_CI: true 18 | jobs: 19 | misc-check: 20 | name: misc check 21 | runs-on: ubuntu-latest 22 | steps: 23 | - name: Checkout 24 | uses: actions/checkout@v3 25 | - name: Install tools 26 | run: | 27 | wget https://github.com/mikefarah/yq/releases/download/${YQ_VERSION}/${BINARY}.tar.gz -O - | tar xz && sudo mv ${BINARY} /usr/bin/yq 28 | sudo apt install -y protobuf-compiler -o Acquire::Retries=3 29 | curl -sSL \ 30 | https://github.com/uber/prototool/releases/download/v1.8.0/prototool-$(uname -s)-$(uname -m).tar.gz | \ 31 | sudo tar -C /usr/local --strip-components 1 -xz 32 | curl -sSL \ 33 | "https://github.com/bufbuild/buf/releases/download/v${BUF_VERSION}/buf-$(uname -s)-$(uname -m).tar.gz" | \ 34 | sudo tar -xvzf - -C /usr/local --strip-components 1 35 | env: 36 | YQ_VERSION: v4.16.1 37 | BINARY: yq_linux_amd64 38 | BUF_VERSION: 1.0.0-rc6 39 | - name: Check if CI workflows are up-to-date 40 | run: | 41 | ./.github/template/generate.sh --check 42 | - name: Run ShellCheck 43 | uses: ludeeus/action-shellcheck@master 44 | - name: Check protobuf style 45 | run: "cd proto/src/proto && prototool format -d && buf lint \n" 46 | rust-test: 47 | name: rust test with codecov 48 | runs-on: ubuntu-latest 49 | steps: 50 | - name: Checkout 51 | uses: actions/checkout@v3 52 | - name: Install rust toolchain@v1 53 | uses: actions-rs/toolchain@v1 54 | with: 55 | toolchain: ${{ env.RUST_TOOLCHAIN }} 56 | components: rustfmt, clippy, llvm-tools-preview 57 | - name: Cache Cargo home 58 | uses: actions/cache@v2 59 | id: cache 60 | with: 61 | path: | 62 | ~/.cargo/bin/ 63 | ~/.cargo/registry/index/ 64 | ~/.cargo/registry/cache/ 65 | ~/.cargo/git/db/ 66 | key: ${{ runner.os }}-cargo-${{ hashFiles('**/Cargo.lock') }}-${{ env.CACHE_KEY_SUFFIX }} 67 | - name: Install cargo-sort 68 | if: steps.cache.outputs.cache-hit != 'true' 69 | run: | 70 | cargo install cargo-sort 71 | - name: Run rust cargo-sort check 72 | run: | 73 | cargo sort -w -c 74 | - name: Run rust format check 75 | run: | 76 | cargo fmt --all -- --check 77 | - name: Run rust clippy check 78 | run: | 79 | # If new CI checks are added, the one with `--locked` must be run first. 
80 | cargo clippy --all-targets --locked -- -D warnings 81 | - if: steps.cache.outputs.cache-hit != 'true' 82 | uses: taiki-e/install-action@cargo-llvm-cov 83 | - if: steps.cache.outputs.cache-hit != 'true' 84 | uses: taiki-e/install-action@nextest 85 | - name: Run rust test with coverage 86 | run: | 87 | cargo llvm-cov nextest --lcov --output-path lcov.info 88 | - uses: codecov/codecov-action@v2 89 | 90 | # ================= THIS FILE IS AUTOMATICALLY GENERATED ================= 91 | # 92 | # Please run generate.sh and commit after editing the workflow templates. 93 | # 94 | # ======================================================================== 95 | 96 | -------------------------------------------------------------------------------- /.github/workflows/pull-request.yml: -------------------------------------------------------------------------------- 1 | 2 | # ================= THIS FILE IS AUTOMATICALLY GENERATED ================= 3 | # 4 | # Please run generate.sh and commit after editing the workflow templates. 5 | # 6 | # ======================================================================== 7 | 8 | name: CI 9 | on: 10 | pull_request: 11 | branches: [main] 12 | env: 13 | RUST_TOOLCHAIN: nightly-2022-10-16 14 | CARGO_TERM_COLOR: always 15 | CACHE_KEY_SUFFIX: 20221107 16 | RUNKV_CI: true 17 | jobs: 18 | misc-check: 19 | name: misc check 20 | runs-on: ubuntu-latest 21 | steps: 22 | - name: Checkout 23 | uses: actions/checkout@v3 24 | - name: Install tools 25 | run: | 26 | wget https://github.com/mikefarah/yq/releases/download/${YQ_VERSION}/${BINARY}.tar.gz -O - | tar xz && sudo mv ${BINARY} /usr/bin/yq 27 | sudo apt install -y protobuf-compiler -o Acquire::Retries=3 28 | curl -sSL \ 29 | https://github.com/uber/prototool/releases/download/v1.8.0/prototool-$(uname -s)-$(uname -m).tar.gz | \ 30 | sudo tar -C /usr/local --strip-components 1 -xz 31 | curl -sSL \ 32 | "https://github.com/bufbuild/buf/releases/download/v${BUF_VERSION}/buf-$(uname -s)-$(uname -m).tar.gz" | \ 33 | sudo tar -xvzf - -C /usr/local --strip-components 1 34 | env: 35 | YQ_VERSION: v4.16.1 36 | BINARY: yq_linux_amd64 37 | BUF_VERSION: 1.0.0-rc6 38 | - name: Check if CI workflows are up-to-date 39 | run: | 40 | ./.github/template/generate.sh --check 41 | - name: Run ShellCheck 42 | uses: ludeeus/action-shellcheck@master 43 | - name: Check protobuf style 44 | run: "cd proto/src/proto && prototool format -d && buf lint \n" 45 | rust-test: 46 | name: rust test with codecov 47 | runs-on: ubuntu-latest 48 | steps: 49 | - name: Checkout 50 | uses: actions/checkout@v3 51 | - name: Install rust toolchain@v1 52 | uses: actions-rs/toolchain@v1 53 | with: 54 | toolchain: ${{ env.RUST_TOOLCHAIN }} 55 | components: rustfmt, clippy, llvm-tools-preview 56 | - name: Cache Cargo home 57 | uses: actions/cache@v2 58 | id: cache 59 | with: 60 | path: | 61 | ~/.cargo/bin/ 62 | ~/.cargo/registry/index/ 63 | ~/.cargo/registry/cache/ 64 | ~/.cargo/git/db/ 65 | key: ${{ runner.os }}-cargo-${{ hashFiles('**/Cargo.lock') }}-${{ env.CACHE_KEY_SUFFIX }} 66 | - name: Install cargo-sort 67 | if: steps.cache.outputs.cache-hit != 'true' 68 | run: | 69 | cargo install cargo-sort 70 | - name: Run rust cargo-sort check 71 | run: | 72 | cargo sort -w -c 73 | - name: Run rust format check 74 | run: | 75 | cargo fmt --all -- --check 76 | - name: Run rust clippy check 77 | run: | 78 | # If new CI checks are added, the one with `--locked` must be run first. 
79 | cargo clippy --all-targets --locked -- -D warnings 80 | - if: steps.cache.outputs.cache-hit != 'true' 81 | uses: taiki-e/install-action@cargo-llvm-cov 82 | - if: steps.cache.outputs.cache-hit != 'true' 83 | uses: taiki-e/install-action@nextest 84 | - name: Run rust test with coverage 85 | run: | 86 | cargo llvm-cov nextest --lcov --output-path lcov.info 87 | - uses: codecov/codecov-action@v2 88 | concurrency: 89 | group: environment-${{ github.ref }} 90 | cancel-in-progress: true 91 | 92 | # ================= THIS FILE IS AUTOMATICALLY GENERATED ================= 93 | # 94 | # Please run generate.sh and commit after editing the workflow templates. 95 | # 96 | # ======================================================================== 97 | 98 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | /target 2 | 3 | .vscode 4 | .idea 5 | 6 | /.run 7 | /.bin 8 | 9 | .DS_Store 10 | 11 | perf.data* 12 | flamegraph.svg 13 | 14 | *.log 15 | -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [workspace] 2 | members = [ 3 | "bench", 4 | "client", 5 | "common", 6 | "exhauster", 7 | "proto", 8 | "rudder", 9 | "storage", 10 | "tests", 11 | "wheel", 12 | ] 13 | 14 | [profile.bench] 15 | debug = true 16 | 17 | [profile.release] 18 | debug = true 19 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright 2022 MrCroxx 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 4 | 5 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 6 | 7 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
1 | SHELL := /bin/bash
2 | .PHONY: proto
3 |
4 | fmt:
5 | cargo sort -w && cargo fmt --all && cargo clippy --all-targets --all-features && cargo clippy --all-targets
6 |
7 | fmt_check:
8 | cargo sort -c -w && cargo fmt --all -- --check && cargo clippy --all-targets --all-features --locked -- -D warnings && cargo clippy --all-targets --locked -- -D warnings
9 |
10 | clean:
11 | cargo clean
12 |
13 | check:
14 | cargo check --tests
15 |
16 | test:
17 | cargo nextest run --features deadlock
18 |
19 | proto:
20 | cd proto/src/proto && prototool format -w && buf lint
21 |
22 | proto_check:
23 | cd proto/src/proto && prototool format -d && buf lint
24 |
25 | update_ci:
26 | cd .github/template && ./generate.sh
27 |
28 | bench_kv:
29 | RUNKV_METRICS=true RUST_BACKTRACE=1 cargo run --release --package runkv-bench --bin bench_kv
--------------------------------------------------------------------------------
/Makefile.toml:
--------------------------------------------------------------------------------
1 | extend = [
2 | { path = "make/common.toml" },
3 | { path = "make/minio.toml" },
4 | { path = "make/jaeger.toml" },
5 | { path = "make/prometheus.toml" },
6 | { path = "make/grafana.toml" },
7 | ]
8 |
9 | [env]
10 | RUST_BACKTRACE = 1
11 | CARGO_MAKE_EXTEND_WORKSPACE_MAKEFILE = true
12 |
13 | [config]
14 | skip_core_tasks = true
15 |
16 | [tasks.clean-all]
17 | category = "Misc"
18 | description = "Clean all downloaded binaries by deleting the .run folder."
19 | dependencies = ["prepare"]
20 | script = '''
21 | #!@duckscript
22 | rm -rf "${PREFIX}"
23 | '''
24 |
25 | [tasks.clean-data]
26 | category = "Misc"
27 | description = "Clean data by deleting files in the .run/data and .run/log folders."
28 | dependencies = ["prepare"]
29 | script = '''
30 | #!/bin/bash
31 | set -e
32 | rm -rf "${PREFIX_DATA}"
33 | rm -rf "${PREFIX_LOG}"
34 | '''
35 |
36 | [tasks.d]
37 | alias = "dev"
38 |
39 | [tasks.dev]
40 | category = "Develop"
41 | description = "Start development environment."
42 | dependencies = ["pre-dev", "setup-minio"]
43 | script = '''
44 | #!@duckscript
45 | echo success
46 | '''
47 |
48 | [tasks.pre-dev]
49 | category = "Develop"
50 | description = "Download necessary tools and build required components."
51 | dependencies = [
52 | "clean-data",
53 | "download-minio",
54 | "download-mcli",
55 | "download-jaeger",
56 | "download-prometheus",
57 | "download-grafana",
58 | ]
59 | script = '''
60 | #!/bin/bash
61 | # run minio
62 | tmux new -d -s runkv-minio ${PREFIX_BIN}/minio server --address 0.0.0.0:9000 --console-address 0.0.0.0:9090 ${PREFIX_DATA}
63 | # run jaeger
64 | tmux new -d -s runkv-jaeger ${PREFIX_BIN}/jaeger
65 | # run prometheus
66 | tmux new -d -s runkv-prometheus ${PREFIX_BIN}/prometheus/prometheus --config.file=etc/prometheus.yml --web.listen-address=0.0.0.0:9091 --storage.tsdb.path=${PREFIX_DATA}/prometheus-data
67 | # run grafana
68 | tmux new -d -s runkv-grafana ${PREFIX_BIN}/grafana/bin/grafana-server -homepath .run/bin/grafana -config etc/grafana.ini
69 | sleep 2
70 | '''
71 |
72 | [tasks.setup-minio]
73 | category = "Misc"
74 | description = "Set up a local minio alias and test bucket with mcli."
75 | dependencies = ["prepare"]
76 | script = '''
77 | #!/bin/bash
78 | set -e
79 | ${PREFIX_BIN}/mcli alias set local http://127.0.0.1:9000 minioadmin minioadmin
80 | ${PREFIX_BIN}/mcli mb --quiet local/test
81 | '''
82 |
83 | [tasks.k]
84 | alias = "kill"
85 |
86 | [tasks.kill]
87 | category = "Develop"
88 | description = "Kill development environment."
89 | script = '''
90 | #!/bin/bash
91 | tmux send-keys -t runkv-minio C-c
92 | tmux send-keys -t runkv-jaeger C-c
93 | tmux send-keys -t runkv-prometheus C-c
94 | tmux send-keys -t runkv-grafana C-c
95 | sleep 2
96 | '''
97 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # RunKV
2 |
3 | ![main](https://github.com/MrCroxx/RunKV/actions/workflows/main.yml/badge.svg)
4 | [![codecov](https://codecov.io/gh/MrCroxx/RunKV/branch/main/graph/badge.svg?token=LKT7JWROUW)](https://codecov.io/gh/MrCroxx/RunKV)
5 |
6 | **Still a work in progress.**
7 |
8 | **TOY ONLY!**
9 |
10 | RunKV is an experimental key-value storage engine for OLTP workloads based on S3 and EBS. The goal is to reduce storage costs while keeping the performance penalty tolerable.
11 |
12 | *This is my master's graduation project. Better gonna run, or I must run.*
13 |
--------------------------------------------------------------------------------
/bench/Cargo.toml:
--------------------------------------------------------------------------------
1 | [package]
2 | name = "runkv-bench"
3 | version = "0.1.0"
4 | edition = "2021"
5 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
6 |
7 | [dependencies]
8 | anyhow = "1.0"
9 | bytes = "1"
10 | bytesize = "1.1.0"
11 | clap = { version = "3.1.6", features = ["derive"] }
12 | env_logger = "*"
13 | futures = "0.3"
14 | itertools = "0.10.3"
15 | lazy_static = "1.4.0"
16 | rand = "0.8.5"
17 | runkv-client = { path = "../client" }
18 | runkv-common = { path = "../common" }
19 | runkv-exhauster = { path = "../exhauster" }
20 | runkv-proto = { path = "../proto" }
21 | runkv-rudder = { path = "../rudder" }
22 | runkv-storage = { path = "../storage" }
23 | runkv-tests = { path = "../tests" }
24 | runkv-wheel = { path = "../wheel" }
25 | tempfile = "3"
26 | test-log = "0.2.10"
27 | tokio = { version = "1", features = [
28 | "rt-multi-thread",
29 | "sync",
30 | "macros",
31 | "time",
32 | "tracing",
33 | ] }
34 | toml = "0.4.2"
35 | tonic = "0.6.2"
36 | tracing = "0.1"
37 |
38 | [target.'cfg(not(target_env = "msvc"))'.dependencies]
39 | tikv-jemallocator = "0.4.3"
40 |
41 | [features]
42 | tracing = ["runkv-wheel/tracing"]
43 | deadlock = [
44 | "runkv-tests/deadlock",
45 | "runkv-storage/deadlock",
46 | "runkv-wheel/deadlock",
47 | ]
48 | console = ["tokio/tracing", "runkv-common/console"]
49 | trace-notify-pool = ["runkv-common/trace-notify-pool"]
50 | verbose-release-log = [
51 | "tracing/release_max_level_trace",
52 | "runkv-common/verbose-release-log",
53 | "runkv-exhauster/verbose-release-log",
54 | "runkv-rudder/verbose-release-log",
55 | "runkv-storage/verbose-release-log",
56 | "runkv-tests/verbose-release-log",
57 | "runkv-wheel/verbose-release-log",
58 | ]
59 |
60 | [[bin]]
61 | name = "bench_kv"
62 | path = "bench_kv/main.rs"
63 |
--------------------------------------------------------------------------------
/bench/bench_kv/main.rs:
--------------------------------------------------------------------------------
1 | #[cfg(not(target_env = "msvc"))]
2 | use
tikv_jemallocator::Jemalloc; 3 | 4 | #[cfg(not(target_env = "msvc"))] 5 | #[global_allocator] 6 | static GLOBAL: Jemalloc = Jemalloc; 7 | 8 | use clap::Parser; 9 | use runkv_tests::{run, Args, Options}; 10 | 11 | const RUDDER_CONFIG_PATH: &str = "bench/etc/rudder.toml"; 12 | const WHEEL_CONFIG_PATH: &str = "bench/etc/wheel.toml"; 13 | const EXHAUSTER_CONFIG_PATH: &str = "bench/etc/exhauster.toml"; 14 | const LSM_TREE_CONFIG_PATH: &str = "bench/etc/lsm_tree.toml"; 15 | 16 | const RUDDER_NODE_ID: u64 = 10000; 17 | const WHEEL_NODE_ID_BASE: u64 = 0; 18 | const EXHAUSTER_NODE_ID_BASE: u64 = 100; 19 | 20 | const RUDDER_PORT: u16 = 12300; 21 | const WHEEL_PORT_BASE: u16 = 12300; 22 | const WHEEL_PROMETHEUS_PORT_BASE: u16 = 9890; 23 | const EXHAUSTER_PORT_BASE: u16 = 12400; 24 | 25 | #[tokio::main] 26 | async fn main() { 27 | let args = Args::parse(); 28 | println!("{:#?}", args); 29 | 30 | let options = Options { 31 | log: true, 32 | rudder_config_path: RUDDER_CONFIG_PATH.to_string(), 33 | wheel_config_path: WHEEL_CONFIG_PATH.to_string(), 34 | exhauster_config_path: EXHAUSTER_CONFIG_PATH.to_string(), 35 | lsm_tree_config_path: LSM_TREE_CONFIG_PATH.to_string(), 36 | rudder_node_id: RUDDER_NODE_ID, 37 | wheel_node_id_base: WHEEL_NODE_ID_BASE, 38 | exhauster_node_id_base: EXHAUSTER_NODE_ID_BASE, 39 | rudder_port: RUDDER_PORT, 40 | wheel_port_base: WHEEL_PORT_BASE, 41 | wheel_prometheus_port_base: WHEEL_PROMETHEUS_PORT_BASE, 42 | exhauster_port_base: EXHAUSTER_PORT_BASE, 43 | }; 44 | println!("{:#?}", options); 45 | 46 | run(args, options).await; 47 | } 48 | -------------------------------------------------------------------------------- /bench/etc/exhauster.toml: -------------------------------------------------------------------------------- 1 | id = 0 2 | host = "127.0.0.1" 3 | port = 0 4 | data_path = "data" 5 | meta_path = "meta" 6 | heartbeat_interval = "1 s" 7 | 8 | [rudder] 9 | id = 1 10 | host = "127.0.0.1" 11 | port = 0 12 | 13 | [minio] 14 | url = 'minio://minioadmin:minioadmin@127.0.0.1:9000/runkv' 15 | 16 | [s3] 17 | bucket = "runkv" 18 | 19 | [buffer] 20 | write_buffer_capacity = "64 MiB" 21 | 22 | [cache] 23 | block_cache_capacity = "512 MiB" 24 | meta_cache_capacity = "256 MiB" 25 | -------------------------------------------------------------------------------- /bench/etc/lsm_tree.toml: -------------------------------------------------------------------------------- 1 | [lsm_tree] 2 | l1_capacity = "32 MiB" 3 | level_multiplier = 2 4 | 5 | trigger_l0_compaction_ssts = 4 6 | trigger_l0_compaction_interval = "500 ms" 7 | trigger_lmax_compaction_interval = "5 s" 8 | trigger_compaction_interval = "2 s" 9 | 10 | sstable_capacity = "4 MiB" 11 | block_capacity = "1 MiB" 12 | restart_interval = 2 13 | bloom_false_positive = 0.1 14 | 15 | compaction_pin_ttl = "15 s" 16 | 17 | [[lsm_tree.levels_options]] 18 | compaction_strategy = "Overlap" 19 | compression_algorithm = "None" 20 | 21 | [[lsm_tree.levels_options]] 22 | compaction_strategy = "NonOverlap" 23 | compression_algorithm = "None" 24 | 25 | [[lsm_tree.levels_options]] 26 | compaction_strategy = "NonOverlap" 27 | compression_algorithm = "None" 28 | 29 | [[lsm_tree.levels_options]] 30 | compaction_strategy = "NonOverlap" 31 | compression_algorithm = "Lz4" 32 | -------------------------------------------------------------------------------- /bench/etc/rudder.toml: -------------------------------------------------------------------------------- 1 | id = 0 2 | host = "127.0.0.1" 3 | port = 0 4 | data_path = "data" 5 | meta_path = "meta" 6 | 
health_timeout = "10 s"
7 |
8 | # [minio]
9 | # url = 'minio://minioadmin:minioadmin@127.0.0.1:9000/runkv'
10 |
11 | # [s3]
12 | # bucket = "runkv"
13 |
14 | [cache]
15 | block_cache_capacity = "0 B"
16 | meta_cache_capacity = "64 kiB"
17 |
--------------------------------------------------------------------------------
/bench/etc/wheel.toml:
--------------------------------------------------------------------------------
1 | id = 0
2 | host = "127.0.0.1"
3 | port = 0
4 | log = ".run/log/"
5 | data_path = "data"
6 | meta_path = "meta"
7 | poll_interval = "100ms"
8 | heartbeat_interval = "100ms"
9 |
10 | [rudder]
11 | id = 1
12 | host = "127.0.0.1"
13 | port = 0
14 |
15 | # [minio]
16 | # url = 'minio://minioadmin:minioadmin@127.0.0.1:9000/runkv'
17 |
18 | # [s3]
19 | # bucket = "runkv"
20 |
21 | [buffer]
22 | write_buffer_capacity = "4 MiB"
23 |
24 | [cache]
25 | block_cache_capacity = "64 MiB"
26 | meta_cache_capacity = "64 MiB"
27 |
28 | [raft_log_store]
29 | log_dir_path = "{/path/to/log/dir}"
30 | log_file_capacity = "64 MiB"
31 | block_cache_capacity = "256 MiB"
32 | persist = "{persist}"
33 |
34 | [prometheus]
35 | host = "127.0.0.1"
36 | port = 0
37 |
38 | [tiered_cache]
39 | type = "FileCache"
40 | [tiered_cache.args]
41 | dir = "/path/to/file/cache/dir"
42 | capacity = "1 GiB"
43 | total_buffer_capacity = "1 GiB"
44 | cache_file_fallocate_unit = "256 MiB"
45 | cache_meta_fallocate_unit = "64 MiB"
46 | cache_file_max_write_size = "4 MiB"
--------------------------------------------------------------------------------
/client/Cargo.toml:
--------------------------------------------------------------------------------
1 | [package]
2 | name = "runkv-client"
3 | version = "0.1.0"
4 | edition = "2021"
5 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
6 |
7 | [dependencies]
8 | anyhow = "1.0"
9 | async-trait = "0.1"
10 | itertools = "0.10.3"
11 | parking_lot = "0.12"
12 | runkv-common = { path = "../common" }
13 | runkv-proto = { path = "../proto" }
14 | thiserror = "1.0"
15 | tokio = { version = "1", features = [
16 | "rt-multi-thread",
17 | "sync",
18 | "macros",
19 | "time",
20 | ] }
21 | tonic = "0.6.2"
22 | tracing = "0.1"
23 |
--------------------------------------------------------------------------------
/client/src/error.rs:
--------------------------------------------------------------------------------
1 | use tonic::Status;
2 |
3 | #[derive(thiserror::Error, Debug)]
4 | pub enum Error {
5 | #[error("rpc status error: {0}")]
6 | RpcStatus(#[from] Status),
7 | #[error("kv error: {0}")]
8 | KvError(#[from] KvError),
9 | #[error("config error: {0}")]
10 | ConfigError(String),
11 | #[error("other: {0}")]
12 | Other(String),
13 | }
14 |
15 | impl Error {
16 | pub fn err(e: impl Into<Box<dyn std::error::Error>>) -> Error {
17 | Error::Other(e.into().to_string())
18 | }
19 |
20 | pub fn config_err(e: impl Into<Box<dyn std::error::Error>>) -> Error {
21 | Error::ConfigError(e.into().to_string())
22 | }
23 |
24 | pub fn redirect(&self) -> bool {
25 | matches!(self, Self::KvError(KvError::Redirect))
26 | }
27 | }
28 |
29 | #[derive(thiserror::Error, Debug)]
30 | pub enum KvError {
31 | #[error("temporarily no leader for key: {0:?}")]
32 | TemporarilyNoLeader(Vec<u8>),
33 | #[error("valid leader changed, need redirect")]
34 | Redirect,
35 | }
36 |
37 | pub type Result<T> = std::result::Result<T, Error>;
38 |
--------------------------------------------------------------------------------
/client/src/lib.rs:
--------------------------------------------------------------------------------
1 | pub mod client;
2 | pub mod error;
3 | pub mod router;
4 | pub mod worker;
5 |
--------------------------------------------------------------------------------
/client/src/router.rs:
--------------------------------------------------------------------------------
1 | use std::collections::{BTreeMap, HashMap};
2 | use std::sync::Arc;
3 |
4 | use itertools::Itertools;
5 | use parking_lot::RwLock;
6 | use runkv_proto::meta::{KeyRange, KeyRangeInfo};
7 |
8 | fn _is_overlap(r1: &KeyRange, r2: &KeyRange) -> bool {
9 | !(r1.start_key > r2.end_key || r1.end_key < r2.start_key)
10 | }
11 |
12 | fn in_range(key: &[u8], range: &KeyRange) -> bool {
13 | key >= &range.start_key[..] && key < &range.end_key[..]
14 | }
15 |
16 | pub struct LeaderInfo {
17 | pub node: u64,
18 | pub group: u64,
19 | pub raft_node: u64,
20 | }
21 |
22 | struct RouterCore {
23 | /// { key range -> raft group id }
24 | key_range_groups: BTreeMap<KeyRange, u64>,
25 | /// { raft group id -> [raft node id] }
26 | group_raft_nodes: HashMap<u64, Vec<u64>>,
27 | /// { raft group id -> leader raft node id }
28 | group_leader: HashMap<u64, u64>,
29 | /// { raft node id -> node id }
30 | raft_nodes: HashMap<u64, u64>,
31 | }
32 |
33 | #[derive(Clone)]
34 | pub struct Router {
35 | core: Arc<RwLock<RouterCore>>,
36 | }
37 |
38 | impl Default for Router {
39 | fn default() -> Self {
40 | Self {
41 | core: Arc::new(RwLock::new(RouterCore {
42 | key_range_groups: BTreeMap::default(),
43 | group_raft_nodes: HashMap::default(),
44 | group_leader: HashMap::default(),
45 | raft_nodes: HashMap::default(),
46 | })),
47 | }
48 | }
49 | }
50 |
51 | impl Router {
52 | pub fn leader(&self, key: &[u8]) -> Option<LeaderInfo> {
53 | let core = self.core.read();
54 | for (key_range, &group) in core.key_range_groups.iter() {
55 | if in_range(key, key_range) {
56 | if let Some(&leader) = core.group_leader.get(&group) {
57 | let node = core.raft_nodes.get(&leader).copied().unwrap();
58 | return Some(LeaderInfo {
59 | node,
60 | group,
61 | raft_node: leader,
62 | });
63 | }
64 | return None;
65 | }
66 | }
67 | None
68 | }
69 |
70 | pub fn update_key_ranges(&self, key_range_infos: Vec<KeyRangeInfo>) {
71 | let mut updated = RouterCore {
72 | key_range_groups: BTreeMap::default(),
73 | group_raft_nodes: HashMap::default(),
74 | group_leader: HashMap::default(),
75 | raft_nodes: HashMap::default(),
76 | };
77 | for KeyRangeInfo {
78 | group,
79 | key_range,
80 | raft_nodes,
81 | leader,
82 | } in key_range_infos
83 | {
84 | let key_range = key_range.unwrap();
85 |
86 | updated.key_range_groups.insert(key_range, group);
87 | updated
88 | .group_raft_nodes
89 | .insert(group, raft_nodes.keys().copied().collect_vec());
90 | updated.group_leader.insert(group, leader);
91 | for (raft_node, node) in raft_nodes {
92 | updated.raft_nodes.insert(raft_node, node);
93 | }
94 | }
95 | let mut core = self.core.write();
96 | *core = updated;
97 | }
98 | }
99 |
100 | #[cfg(test)]
101 | mod tests {
102 | use super::*;
103 |
104 | fn is_send_sync_clone<T: Send + Sync + Clone>() {}
105 |
106 | #[test]
107 | fn ensure_send_sync_clone() {
108 | is_send_sync_clone::<Router>();
109 | }
110 | }
111 |
--------------------------------------------------------------------------------
/client/src/worker/heartbeater.rs:
--------------------------------------------------------------------------------
1 | use std::time::Duration;
2 |
3 | use async_trait::async_trait;
4 | use runkv_common::channel_pool::ChannelPool;
5 | use runkv_common::config::Node;
6 | use runkv_common::Worker;
7 | use runkv_proto::rudder::control_service_client::ControlServiceClient;
8 | use runkv_proto::rudder::RouterRequest;
9 | use tonic::Request;
10 | use
tracing::warn; 11 | 12 | use crate::error::{Error, Result}; 13 | use crate::router::Router; 14 | 15 | pub struct HeartbeaterOptions { 16 | pub rudder: u64, 17 | pub heartbeat_interval: Duration, 18 | 19 | pub router: Router, 20 | pub channel_pool: ChannelPool, 21 | } 22 | 23 | pub struct Heartbeater { 24 | rudder: u64, 25 | heartbeat_interval: Duration, 26 | 27 | router: Router, 28 | channel_pool: ChannelPool, 29 | } 30 | 31 | impl Heartbeater { 32 | pub fn new(options: HeartbeaterOptions) -> Self { 33 | Self { 34 | rudder: options.rudder, 35 | heartbeat_interval: options.heartbeat_interval, 36 | 37 | router: options.router, 38 | channel_pool: options.channel_pool, 39 | } 40 | } 41 | 42 | async fn run_inner(&mut self) -> Result<()> { 43 | loop { 44 | tokio::time::sleep(self.heartbeat_interval).await; 45 | let channel = self 46 | .channel_pool 47 | .get(self.rudder) 48 | .await 49 | .map_err(Error::err)?; 50 | let mut client = ControlServiceClient::new(channel); 51 | let rsp = client 52 | .router(Request::new(RouterRequest::default())) 53 | .await? 54 | .into_inner(); 55 | self.router.update_key_ranges(rsp.key_ranges); 56 | for (node, endpoint) in rsp.wheels { 57 | self.channel_pool 58 | .put_node(Node { 59 | id: node, 60 | host: endpoint.host, 61 | port: endpoint.port as u16, 62 | }) 63 | .await; 64 | } 65 | } 66 | } 67 | } 68 | 69 | #[async_trait] 70 | impl Worker for Heartbeater { 71 | async fn run(&mut self) -> anyhow::Result<()> { 72 | // TODO: Gracefully kill. 73 | loop { 74 | match self.run_inner().await { 75 | Ok(_) => {} 76 | Err(e) => warn!("error occur when heartbeater running: {}", e), 77 | } 78 | } 79 | } 80 | } 81 | -------------------------------------------------------------------------------- /client/src/worker/mod.rs: -------------------------------------------------------------------------------- 1 | pub mod heartbeater; 2 | -------------------------------------------------------------------------------- /codecov.yml: -------------------------------------------------------------------------------- 1 | coverage: 2 | status: 3 | project: 4 | default: 5 | threshold: 5% 6 | patch: off 7 | ignore: 8 | - "**/bench" 9 | - "**/benches" -------------------------------------------------------------------------------- /common/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "runkv-common" 3 | version = "0.1.0" 4 | edition = "2021" 5 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html 6 | 7 | [dependencies] 8 | anyhow = "1.0" 9 | async-trait = "0.1" 10 | bincode = "1.3.3" 11 | bytes = "1" 12 | bytesize = { version = "1.1.0", features = ["serde"] } 13 | chrono = "0.4" 14 | clap = { version = "3.1.6", features = ["derive"] } 15 | console-subscriber = { version = "0.1.6", optional = true } 16 | futures = "0.3" 17 | http = "0.2.6" 18 | humantime = "2.1.0" 19 | humantime-serde = "1.1.1" 20 | hyper = { version = "^0.14", features = ["server", "http1", "tcp"] } 21 | isahc = "1" # isahc is the http client used for tracing. Always set it as the same version as opentelemetry-jaeger's. 
22 | itertools = "0.10.3"
23 | lazy_static = "1.4.0"
24 | opentelemetry = { version = "0.17", features = ["rt-tokio", "trace"] }
25 | opentelemetry-jaeger = { version = "0.16", features = [
26 | "rt-tokio",
27 | "collector_client",
28 | "isahc",
29 | "isahc_collector_client",
30 | ] }
31 | ouroboros = "0.15.0"
32 | parking_lot = "0.12"
33 | prometheus = "0.13.0"
34 | rand = "0.8.5"
35 | serde = "1.0"
36 | serde_derive = "1.0"
37 | slog = "2.7"
38 | tokio = { version = "1", features = ["rt-multi-thread", "sync"] }
39 | toml = "0.4.2"
40 | tonic = "0.6.2"
41 | tracing = "0.1"
42 | tracing-appender = "0.2"
43 | tracing-opentelemetry = "0.17"
44 | tracing-subscriber = { version = "0.3", features = ["env-filter"] }
45 |
46 | [dev-dependencies]
47 | criterion = { version = "0.3", features = ["async", "async_tokio"] }
48 | env_logger = "*"
49 | test-log = "0.2.10"
50 |
51 | [features]
52 | console = ["console-subscriber"]
53 | trace-notify-pool = []
54 | tracing = []
55 | verbose-release-log = ["tracing/release_max_level_trace"]
56 |
57 | [[bench]]
58 | name = "bench_sharded_hash_map"
59 | harness = false
60 | # Uncomment this line if you are generating a flame graph.
61 | # debug = true
62 |
--------------------------------------------------------------------------------
/common/benches/bench_sharded_hash_map.rs:
--------------------------------------------------------------------------------
1 | use std::collections::HashMap;
2 | use std::sync::Arc;
3 | use std::time::Duration;
4 |
5 | use criterion::{criterion_group, criterion_main, Criterion};
6 | use itertools::Itertools;
7 | use parking_lot::RwLock;
8 | use runkv_common::sharded_hash_map::ShardedHashMap;
9 |
10 | const CONCURRENCY: u64 = 1024;
11 | const SLEEP: Duration = Duration::from_micros(200);
12 |
13 | fn sharded_hash_map_concurrent_put_get(shards: u16) {
14 | let map = ShardedHashMap::new(shards);
15 |
16 | let job = |map: ShardedHashMap<u64, u64>, i: u64, total: u64| {
17 | {
18 | assert_eq!(map.insert(i, i), None);
19 | }
20 | {
21 | let read = map.read(&i);
22 | assert_eq!(read.get(), Some(&i));
23 | std::thread::sleep(SLEEP);
24 | drop(read);
25 | }
26 | {
27 | let mut write = map.write(&i);
28 | assert_eq!(write.get(), Some(&i));
29 | *write.get_mut().unwrap() += total;
30 | assert_eq!(write.get(), Some(&(i + total)));
31 | std::thread::sleep(SLEEP);
32 | drop(write);
33 | }
34 | {
35 | let read = map.read(&i);
36 | assert_eq!(read.get(), Some(&(i + total)));
37 | std::thread::sleep(SLEEP);
38 | drop(read);
39 | }
40 | };
41 |
42 | let handles = (0..CONCURRENCY)
43 | .into_iter()
44 | .map(|i| {
45 | let map_clone = map.clone();
46 | std::thread::spawn(move || job(map_clone, i, CONCURRENCY))
47 | })
48 | .collect_vec();
49 |
50 | for handle in handles {
51 | handle.join().unwrap();
52 | }
53 | }
54 |
55 | fn hash_map_concurrent_put_get() {
56 | let map = Arc::new(RwLock::new(HashMap::default()));
57 |
58 | let job = |map: Arc<RwLock<HashMap<u64, u64>>>, i: u64, total: u64| {
59 | {
60 | assert_eq!(map.write().insert(i, i), None);
61 | }
62 | {
63 | let read = map.read();
64 | assert_eq!(read.get(&i), Some(&i));
65 | std::thread::sleep(SLEEP);
66 | drop(read);
67 | }
68 | {
69 | let mut write = map.write();
70 | assert_eq!(write.get(&i), Some(&i));
71 | *write.get_mut(&i).unwrap() += total;
72 | assert_eq!(write.get(&i), Some(&(i + total)));
73 | std::thread::sleep(SLEEP);
74 | drop(write);
75 | }
76 | {
77 | let read = map.read();
78 | assert_eq!(read.get(&i), Some(&(i + total)));
79 | std::thread::sleep(SLEEP);
80 | drop(read);
81 | }
82 | };
83 |
84 | let handles = (0..CONCURRENCY)
85 | .into_iter()
86 | .map(|i| {
87 | let map_clone = map.clone();
88 | std::thread::spawn(move || job(map_clone, i, CONCURRENCY))
89 | })
90 | .collect_vec();
91 |
92 | for handle in handles {
93 | handle.join().unwrap();
94 | }
95 | }
96 |
97 | fn bench_hash_map_concurrent_put_get(c: &mut Criterion) {
98 | let mut group = c.benchmark_group("10 samples");
99 | group.sample_size(10);
100 |
101 | group.bench_function("hash map concurrent put get", |b| {
102 | b.iter(hash_map_concurrent_put_get)
103 | });
104 |
105 | group.bench_function("sharded hash map concurrent put get - 1 shard(s)", |b| {
106 | b.iter(|| sharded_hash_map_concurrent_put_get(1))
107 | });
108 |
109 | group.bench_function("sharded hash map concurrent put get - 16 shard(s)", |b| {
110 | b.iter(|| sharded_hash_map_concurrent_put_get(16))
111 | });
112 |
113 | group.bench_function("sharded hash map concurrent put get - 64 shard(s)", |b| {
114 | b.iter(|| sharded_hash_map_concurrent_put_get(64))
115 | });
116 |
117 | group.bench_function("sharded hash map concurrent put get - 256 shard(s)", |b| {
118 | b.iter(|| sharded_hash_map_concurrent_put_get(256))
119 | });
120 |
121 | group.bench_function("sharded hash map concurrent put get - 1024 shard(s)", |b| {
122 | b.iter(|| sharded_hash_map_concurrent_put_get(1024))
123 | });
124 |
125 | group.finish();
126 | }
127 |
128 | criterion_group!(benches, bench_hash_map_concurrent_put_get);
129 | criterion_main!(benches);
130 |
--------------------------------------------------------------------------------
/common/src/atomic.rs:
--------------------------------------------------------------------------------
1 | #[macro_export]
2 | macro_rules! may_advance_atomic {
3 | ($atomic:expr, $val:expr) => {
4 | let mut old = $atomic.load(Ordering::Relaxed);
5 | while $val > old {
6 | match $atomic.compare_exchange_weak(old, $val, Ordering::SeqCst, Ordering::Relaxed) {
7 | Ok(_) => break,
8 | Err(v) => old = v,
9 | }
10 | }
11 | };
12 | }
13 |
--------------------------------------------------------------------------------
/common/src/channel_pool.rs:
--------------------------------------------------------------------------------
1 | use std::collections::BTreeMap;
2 | use std::sync::Arc;
3 |
4 | use tokio::sync::Mutex;
5 | use tonic::transport::{Channel, Endpoint};
6 |
7 | use crate::config::Node;
8 |
9 | struct ChannelPoolCore {
10 | endpoints: BTreeMap<u64, Endpoint>,
11 | channels: BTreeMap<u64, Channel>,
12 | }
13 |
14 | #[derive(Clone)]
15 | pub struct ChannelPool {
16 | core: Arc<Mutex<ChannelPoolCore>>,
17 | }
18 |
19 | fn endpoint(node: &Node) -> Endpoint {
20 | Endpoint::from_shared(format!("http://{}:{}", node.host, node.port)).unwrap()
21 | }
22 |
23 | impl Default for ChannelPool {
24 | fn default() -> Self {
25 | Self::with_nodes(vec![])
26 | }
27 | }
28 |
29 | impl ChannelPool {
30 | pub fn with_nodes(nodes: Vec<Node>) -> Self {
31 | Self {
32 | core: Arc::new(Mutex::new(ChannelPoolCore {
33 | endpoints: BTreeMap::from_iter(
34 | nodes.into_iter().map(|node| (node.id, endpoint(&node))),
35 | ),
36 | channels: BTreeMap::default(),
37 | })),
38 | }
39 | }
40 |
41 | pub async fn put_node(&self, node: Node) {
42 | let mut guard = self.core.lock().await;
43 | guard.endpoints.insert(node.id, endpoint(&node));
44 | }
45 |
46 | pub async fn get(&self, node: u64) -> anyhow::Result<Channel> {
47 | let mut guard = self.core.lock().await;
48 | if let Some(channel) = guard.channels.get(&node) {
49 | return Ok(channel.clone());
50 | }
51 | if let Some(endpoint) = guard.endpoints.get(&node) {
52 | let channel = endpoint.connect().await?;
53 | guard.channels.insert(node, channel.clone());
54 | return Ok(channel);
55 | }
56 | Err(anyhow::anyhow!("endpoint of node {} not found", node))
57 | }
58 |
59 | pub async fn release(&self, node: u64) -> anyhow::Result<()> {
60 | let mut guard = self.core.lock().await;
61 | match guard.channels.remove(&node) {
62 | Some(_) => Ok(()),
63 | None => Err(anyhow::anyhow!("channel to node {} not exists", node)),
64 | }
65 | }
66 | }
67 |
--------------------------------------------------------------------------------
/common/src/coding.rs:
--------------------------------------------------------------------------------
1 | use bytes::{Buf, BufMut};
2 | use serde::Deserialize;
3 |
4 | pub trait BytesSerde<'de>: serde::Serialize + serde::Deserialize<'de> + Sized {
5 | fn encode_to_vec(&self) -> anyhow::Result<Vec<u8>> {
6 | bincode::serialize(self).map_err(|e| anyhow::anyhow!("bincode serialize error: {}", e))
7 | }
8 |
9 | fn decode(slice: &'de [u8]) -> anyhow::Result<Self> {
10 | bincode::deserialize(slice).map_err(|e| anyhow::anyhow!("bincode deserialize error: {}", e))
11 | }
12 | }
13 |
14 | #[derive(Deserialize, Clone, Copy, Debug)]
15 | pub enum CompressionAlgorithm {
16 | None,
17 | Lz4,
18 | }
19 |
20 | impl CompressionAlgorithm {
21 | pub fn encode(&self, buf: &mut impl BufMut) {
22 | let v = match self {
23 | Self::None => 0,
24 | Self::Lz4 => 1,
25 | };
26 | buf.put_u8(v);
27 | }
28 |
29 | pub fn decode(buf: &mut impl Buf) -> Result<Self, anyhow::Error> {
30 | match buf.get_u8() {
31 | 0 => Ok(Self::None),
32 | 1 => Ok(Self::Lz4),
33 | _ => Err(anyhow::anyhow!("not valid compression algorithm")),
34 | }
35 | }
36 | }
37 |
38 | impl From<CompressionAlgorithm> for u8 {
39 | fn from(ca: CompressionAlgorithm) -> Self {
40 | match ca {
41 | CompressionAlgorithm::None => 0,
42 | CompressionAlgorithm::Lz4 => 1,
43 | }
44 | }
45 | }
46 |
47 | impl From<CompressionAlgorithm> for u64 {
48 | fn from(ca: CompressionAlgorithm) -> Self {
49 | match ca {
50 | CompressionAlgorithm::None => 0,
51 | CompressionAlgorithm::Lz4 => 1,
52 | }
53 | }
54 | }
55 |
56 | impl TryFrom<u8> for CompressionAlgorithm {
57 | type Error = anyhow::Error;
58 | fn try_from(v: u8) -> core::result::Result<Self, Self::Error> {
59 | match v {
60 | 0 => Ok(Self::None),
61 | 1 => Ok(Self::Lz4),
62 | _ => Err(anyhow::anyhow!("not valid compression algorithm")),
63 | }
64 | }
65 | }
66 |
--------------------------------------------------------------------------------
/common/src/config.rs:
--------------------------------------------------------------------------------
1 | use std::str::FromStr;
2 |
3 | use serde::Deserialize;
4 |
5 | use crate::coding::CompressionAlgorithm;
6 |
7 | #[derive(Deserialize, Clone, Copy, PartialEq, Eq, Debug)]
8 | pub enum LevelCompactionStrategy {
9 | Overlap,
10 | NonOverlap,
11 | }
12 |
13 | #[derive(Deserialize, Clone, Debug)]
14 | pub struct LevelOptions {
15 | pub compaction_strategy: LevelCompactionStrategy,
16 | pub compression_algorithm: CompressionAlgorithm,
17 | }
18 |
19 | #[derive(Deserialize, Clone, Default, Debug)]
20 | pub struct LsmTreeConfig {
21 | pub l1_capacity: String,
22 | pub level_multiplier: usize,
23 | pub trigger_l0_compaction_ssts: usize,
24 | pub trigger_l0_compaction_interval: String,
25 | pub trigger_lmax_compaction_interval: String,
26 | pub trigger_compaction_interval: String,
27 | pub sstable_capacity: String,
28 | pub block_capacity: String,
29 | pub restart_interval: usize,
30 | pub bloom_false_positive: f64,
31 | pub compaction_pin_ttl: String,
32 | pub levels_options: Vec<LevelOptions>,
33 | }
34 |
35 | impl FromStr for LsmTreeConfig {
36 | type Err = anyhow::Error;
37 |
38 | fn from_str(s: &str) -> Result<Self, Self::Err> {
39 | let c = toml::from_str(s)?;
40 | Ok(c)
41 | }
42 | }
43 |
44 | // TODO: Fill me.
45 | #[derive(Deserialize, Clone, Debug)]
46 | pub struct S3Config {
47 | pub bucket: String,
48 | }
49 |
50 | #[derive(Deserialize, Clone, Debug)]
51 | pub struct MinioConfig {
52 | pub url: String,
53 | }
54 |
55 | #[derive(Deserialize, Clone, Debug)]
56 | pub struct CacheConfig {
57 | pub block_cache_capacity: String,
58 | pub meta_cache_capacity: String,
59 | }
60 |
61 | #[derive(Deserialize, Clone, Debug)]
62 | pub struct Node {
63 | pub id: u64,
64 | pub host: String,
65 | pub port: u16,
66 | }
67 |
68 | #[derive(Deserialize, Clone, Debug)]
69 | pub struct PrometheusConfig {
70 | pub host: String,
71 | pub port: u16,
72 | }
73 |
74 | #[cfg(test)]
75 | mod tests {
76 |
77 | use test_log::test;
78 |
79 | use super::*;
80 |
81 | #[test]
82 | fn lsm_tree_config_serde() {
83 | let s = r#"
84 | l1_capacity = "1 MiB"
85 | level_multiplier = 10
86 |
87 | trigger_l0_compaction_ssts = 4
88 | trigger_l0_compaction_interval = "1 s"
89 | trigger_lmax_compaction_interval = "10 s"
90 | trigger_compaction_interval = "5 s"
91 |
92 | sstable_capacity = "64 KiB"
93 | block_capacity = "4 KiB"
94 | restart_interval = 2
95 | bloom_false_positive = 0.1
96 |
97 | compaction_pin_ttl = "15 s"
98 |
99 | [[levels_options]]
100 | compaction_strategy = "Overlap"
101 | compression_algorithm = "None"
102 |
103 | [[levels_options]]
104 | compaction_strategy = "NonOverlap"
105 | compression_algorithm = "None"
106 |
107 | [[levels_options]]
108 | compaction_strategy = "NonOverlap"
109 | compression_algorithm = "None"
110 |
111 | [[levels_options]]
112 | compaction_strategy = "NonOverlap"
113 | compression_algorithm = "None"
114 |
115 | [[levels_options]]
116 | compaction_strategy = "NonOverlap"
117 | compression_algorithm = "Lz4"
118 |
119 | [[levels_options]]
120 | compaction_strategy = "NonOverlap"
121 | compression_algorithm = "Lz4"
122 |
123 | [[levels_options]]
124 | compaction_strategy = "NonOverlap"
125 | compression_algorithm = "Lz4""#;
126 | LsmTreeConfig::from_str(s).unwrap();
127 | }
128 | }
129 |
--------------------------------------------------------------------------------
/common/src/context.rs:
--------------------------------------------------------------------------------
1 | use crate::coding::BytesSerde;
2 |
3 | #[derive(serde::Serialize, serde::Deserialize, Clone, Debug)]
4 | pub struct Context {
5 | pub span_id: u64,
6 | pub request_id: u64,
7 | }
8 |
9 | impl<'de> BytesSerde<'de> for Context {}
10 |
--------------------------------------------------------------------------------
/common/src/lib.rs:
--------------------------------------------------------------------------------
1 | pub mod atomic;
2 | pub mod channel_pool;
3 | pub mod coding;
4 | pub mod config;
5 | pub mod context;
6 | pub mod log;
7 | pub mod notify_pool;
8 | pub mod packer;
9 | pub mod prometheus;
10 | pub mod sharded_hash_map;
11 | pub mod sync;
12 | pub mod time;
13 | pub mod tracing_slog_drain;
14 |
15 | use async_trait::async_trait;
16 |
17 | #[async_trait]
18 | pub trait Worker: Sync + Send + 'static {
19 | async fn run(&mut self) -> anyhow::Result<()>;
20 | }
21 |
22 | pub type BoxedWorker = Box<dyn Worker>;
23 |
--------------------------------------------------------------------------------
/common/src/log.rs:
--------------------------------------------------------------------------------
1 | use isahc::config::Configurable;
2 | use tracing_subscriber::filter::Targets;
3 | use tracing_subscriber::fmt::format::FmtSpan;
4 | use tracing_subscriber::layer::{Layer, SubscriberExt};
5 | use tracing_subscriber::util::SubscriberInitExt;
6 |
7 | pub struct LogGuard {
8 | _file_appender_guard: Option<tracing_appender::non_blocking::WorkerGuard>,
9 | pub jaeger_enabled: bool,
10 | pub tokio_console_enabled: bool,
11 | }
12 |
13 | impl Drop for LogGuard {
14 | fn drop(&mut self) {
15 | if self.jaeger_enabled {
16 | opentelemetry::global::shutdown_tracer_provider();
17 | }
18 | }
19 | }
20 |
21 | pub fn init_runkv_logger(service: &str, id: u64, log_path: &str) -> LogGuard {
22 | let tokio_console_enabled = cfg!(feature = "console");
23 | let jaeger_enabled = cfg!(feature = "tracing");
24 |
25 | if tokio_console_enabled {
26 | #[cfg(feature = "console")]
27 | {
28 | console_subscriber::init();
29 | return LogGuard {
30 | _file_appender_guard: None,
31 | jaeger_enabled,
32 | tokio_console_enabled,
33 | };
34 | }
35 | }
36 |
37 | let (file_appender, file_appender_guard) = tracing_appender::non_blocking(
38 | tracing_appender::rolling::daily(log_path, format!("runkv-{}-{}.log", service, id)),
39 | );
40 |
41 | let guard = LogGuard {
42 | _file_appender_guard: Some(file_appender_guard),
43 | jaeger_enabled,
44 | tokio_console_enabled,
45 | };
46 |
47 | let fmt_layer = {
48 | let runkv_log_level = if cfg!(feature = "verbose-release-log") || cfg!(debug_assertions) {
49 | tracing::Level::DEBUG
50 | } else {
51 | tracing::Level::INFO
52 | };
53 |
54 | // Configure RunKV's own crates to log at `runkv_log_level`, and restrict third-party crates.
55 | let filter = Targets::new()
56 | // Enable trace for most modules.
57 | .with_target("runkv_common", runkv_log_level)
58 | .with_target("runkv_storage", runkv_log_level)
59 | .with_target("runkv_rudder", runkv_log_level)
60 | .with_target("runkv_wheel", runkv_log_level)
61 | .with_target("runkv_exhauster", runkv_log_level)
62 | .with_target("runkv_tests", runkv_log_level)
63 | .with_target("openraft::raft", tracing::Level::INFO)
64 | .with_target("raft", tracing::Level::INFO)
65 | .with_target("events", tracing::Level::WARN);
66 |
67 | tracing_subscriber::fmt::layer()
68 | .with_span_events(FmtSpan::ACTIVE)
69 | .with_target(true)
70 | .with_level(true)
71 | .with_writer(file_appender)
72 | .with_ansi(false)
73 | .with_filter(filter)
74 | };
75 |
76 | if jaeger_enabled {
77 | opentelemetry::global::set_text_map_propagator(opentelemetry_jaeger::Propagator::new());
78 |
79 | // Configure RunKV's own crates to log at TRACE level, and ignore all third-party crates.
80 | let filter = Targets::new()
81 | // Enable trace for most modules.
82 | .with_target("runkv_common", tracing::Level::TRACE) 83 | .with_target("runkv_storage", tracing::Level::TRACE) 84 | .with_target("runkv_rudder", tracing::Level::TRACE) 85 | .with_target("runkv_wheel", tracing::Level::TRACE) 86 | .with_target("runkv_exhauster", tracing::Level::TRACE) 87 | .with_target("runkv_tests", tracing::Level::TRACE) 88 | .with_target("openraft::raft", tracing::Level::TRACE) 89 | .with_target("raft", tracing::Level::TRACE) 90 | .with_target("events", tracing::Level::WARN); 91 | 92 | let tracer = opentelemetry_jaeger::new_pipeline() 93 | // TODO: use UDP tracing in production environment 94 | .with_collector_endpoint("http://127.0.0.1:14268/api/traces") 95 | // TODO: change service name to compute-{port} 96 | .with_service_name(service) 97 | // disable proxy 98 | .with_http_client(isahc::HttpClient::builder().proxy(None).build().unwrap()) 99 | .install_batch(opentelemetry::runtime::Tokio) 100 | .unwrap(); 101 | 102 | let opentelemetry_layer = tracing_opentelemetry::layer() 103 | .with_tracer(tracer) 104 | .with_filter(filter); 105 | 106 | tracing_subscriber::registry() 107 | .with(opentelemetry_layer) 108 | .init(); 109 | } else { 110 | tracing_subscriber::registry().with(fmt_layer).init(); 111 | } 112 | 113 | guard 114 | } 115 | -------------------------------------------------------------------------------- /common/src/packer.rs: -------------------------------------------------------------------------------- 1 | use std::sync::Arc; 2 | 3 | use parking_lot::Mutex; 4 | use tokio::sync::oneshot; 5 | 6 | const DEFAULT_QUEUE_CAPACITY: usize = 64; 7 | 8 | pub struct Item 9 | where 10 | T: 'static, 11 | R: 'static, 12 | { 13 | pub data: T, 14 | pub notifier: Option>, 15 | } 16 | 17 | struct PackerCore 18 | where 19 | T: 'static, 20 | R: 'static, 21 | { 22 | queue: Mutex>>, 23 | } 24 | 25 | pub struct Packer 26 | where 27 | T: 'static, 28 | R: 'static, 29 | { 30 | default_queue_capacity: usize, 31 | 32 | core: Arc>, 33 | } 34 | 35 | impl Clone for Packer 36 | where 37 | T: 'static, 38 | R: 'static, 39 | { 40 | fn clone(&self) -> Self { 41 | Self { 42 | default_queue_capacity: self.default_queue_capacity, 43 | core: self.core.clone(), 44 | } 45 | } 46 | } 47 | 48 | impl Default for Packer 49 | where 50 | T: 'static, 51 | R: 'static, 52 | { 53 | fn default() -> Self { 54 | Self::new(DEFAULT_QUEUE_CAPACITY) 55 | } 56 | } 57 | 58 | impl Packer 59 | where 60 | T: 'static, 61 | R: 'static, 62 | { 63 | pub fn new(default_queue_capacity: usize) -> Self { 64 | Self { 65 | default_queue_capacity, 66 | core: Arc::new(PackerCore { 67 | queue: Mutex::new(Vec::with_capacity(default_queue_capacity)), 68 | }), 69 | } 70 | } 71 | 72 | pub fn append(&self, data: T, notifier: Option>) -> bool { 73 | let mut queue = self.core.queue.lock(); 74 | let is_leader = queue.is_empty(); 75 | queue.push(Item { data, notifier }); 76 | is_leader 77 | } 78 | 79 | pub fn package(&self) -> Vec> { 80 | let mut queue = self.core.queue.lock(); 81 | let mut package = Vec::with_capacity(self.default_queue_capacity); 82 | std::mem::swap(&mut package, &mut (*queue)); 83 | package 84 | } 85 | } 86 | 87 | #[cfg(test)] 88 | mod tests { 89 | use super::*; 90 | 91 | fn is_send_sync_clone() {} 92 | 93 | #[test] 94 | fn ensure_send_sync_clone() { 95 | is_send_sync_clone::>(); 96 | } 97 | } 98 | -------------------------------------------------------------------------------- /common/src/prometheus.rs: -------------------------------------------------------------------------------- 1 | use std::net::SocketAddr; 2 | 3 | 
use http::header::CONTENT_TYPE; 4 | use http::{Request, Response}; 5 | use hyper::service::{make_service_fn, service_fn}; 6 | use hyper::{Body, Error, Server}; 7 | use prometheus::{Encoder, TextEncoder}; 8 | use tracing::{error, info}; 9 | 10 | pub struct DefaultPrometheusExporter; 11 | 12 | impl DefaultPrometheusExporter { 13 | pub fn init(addr: SocketAddr) { 14 | tokio::spawn(async move { 15 | info!("Prometheus service is set up on http://{}", addr); 16 | if let Err(e) = Server::bind(&addr) 17 | .serve(make_service_fn(|_| async move { 18 | Ok::<_, Error>(service_fn(Self::serve)) 19 | })) 20 | .await 21 | { 22 | error!("Prometheus service error: {}", e); 23 | } 24 | }); 25 | } 26 | 27 | async fn serve(_request: Request) -> anyhow::Result> { 28 | let encoder = TextEncoder::new(); 29 | let mut buffer = Vec::with_capacity(4096); 30 | let metrics = prometheus::gather(); 31 | encoder.encode(&metrics, &mut buffer).unwrap(); 32 | let response = Response::builder() 33 | .status(200) 34 | .header(CONTENT_TYPE, encoder.format_type()) 35 | .body(Body::from(buffer))?; 36 | Ok(response) 37 | } 38 | } 39 | -------------------------------------------------------------------------------- /common/src/sync.rs: -------------------------------------------------------------------------------- 1 | use std::sync::atomic::{AtomicUsize, Ordering}; 2 | use std::sync::Arc; 3 | 4 | struct TicketLockCore { 5 | head: AtomicUsize, 6 | tail: AtomicUsize, 7 | } 8 | 9 | #[derive(Clone)] 10 | pub struct TicketLock { 11 | core: Arc, 12 | } 13 | 14 | impl Default for TicketLock { 15 | fn default() -> Self { 16 | Self { 17 | core: Arc::new(TicketLockCore { 18 | head: AtomicUsize::new(0), 19 | tail: AtomicUsize::new(0), 20 | }), 21 | } 22 | } 23 | } 24 | 25 | impl TicketLock { 26 | pub fn acquire(&self) -> usize { 27 | let ticket = self.core.head.fetch_add(1, Ordering::SeqCst); 28 | while ticket != self.core.tail.load(Ordering::Acquire) {} 29 | ticket 30 | } 31 | 32 | pub async fn async_acquire(&self) -> usize { 33 | let ticket = self.core.head.fetch_add(1, Ordering::SeqCst); 34 | while ticket != self.core.tail.load(Ordering::Acquire) { 35 | tokio::task::yield_now().await; 36 | } 37 | ticket 38 | } 39 | 40 | pub fn release(&self) { 41 | self.core.tail.fetch_add(1, Ordering::Release); 42 | } 43 | } 44 | 45 | #[cfg(test)] 46 | mod tests { 47 | use std::time::Duration; 48 | 49 | use itertools::Itertools; 50 | use parking_lot::Mutex; 51 | use rand::Rng; 52 | use test_log::test; 53 | 54 | use super::*; 55 | 56 | #[test] 57 | fn test_ticket_lock() { 58 | let lock = TicketLock::default(); 59 | loop { 60 | let results = Arc::new(Mutex::new(vec![])); 61 | 62 | let handles = (0..100) 63 | .into_iter() 64 | .map(|_| { 65 | let lock_clone = lock.clone(); 66 | let results_clone = results.clone(); 67 | std::thread::spawn(move || { 68 | let mut rng = rand::thread_rng(); 69 | std::thread::sleep(Duration::from_millis(rng.gen_range(10..100))); 70 | let ticket = lock_clone.acquire(); 71 | results_clone.lock().push(ticket); 72 | lock_clone.release(); 73 | ticket 74 | }) 75 | }) 76 | .collect_vec(); 77 | 78 | let tickets = handles 79 | .into_iter() 80 | .map(|handle| handle.join().unwrap()) 81 | .collect_vec(); 82 | let mut ordered = true; 83 | for (i, t) in tickets.into_iter().enumerate() { 84 | if i != t { 85 | ordered = false; 86 | break; 87 | } 88 | } 89 | if ordered { 90 | continue; 91 | } 92 | let results = Arc::try_unwrap(results).unwrap().into_inner(); 93 | for (i, r) in results.into_iter().enumerate() { 94 | assert_eq!(i, r); 95 | } 96 | 
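// The push order matched the ticket numbers for a contended run, i.e. acquisition was FIFO-fair; stop retrying.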
break; 97 | } 98 | } 99 | 100 | #[test(tokio::test(flavor = "multi_thread", worker_threads = 10))] 101 | async fn test_ticket_lock_async() { 102 | let lock = TicketLock::default(); 103 | loop { 104 | let results = Arc::new(Mutex::new(vec![])); 105 | 106 | let futures = (0..100) 107 | .into_iter() 108 | .map(|_| { 109 | let lock_clone = lock.clone(); 110 | let results_clone = results.clone(); 111 | async move { 112 | let mut rng = rand::thread_rng(); 113 | tokio::time::sleep(Duration::from_millis(rng.gen_range(10..100))).await; 114 | let ticket = lock_clone.async_acquire().await; 115 | results_clone.lock().push(ticket); 116 | lock_clone.release(); 117 | ticket 118 | } 119 | }) 120 | .collect_vec(); 121 | let tickets = futures::future::join_all(futures).await; 122 | let mut ordered = true; 123 | for (i, t) in tickets.into_iter().enumerate() { 124 | if i != t { 125 | ordered = false; 126 | break; 127 | } 128 | } 129 | if ordered { 130 | continue; 131 | } 132 | let results = Arc::try_unwrap(results).unwrap().into_inner(); 133 | for (i, r) in results.into_iter().enumerate() { 134 | assert_eq!(i, r); 135 | } 136 | break; 137 | } 138 | } 139 | } 140 | -------------------------------------------------------------------------------- /common/src/time.rs: -------------------------------------------------------------------------------- 1 | use std::time::{Duration, SystemTime, UNIX_EPOCH}; 2 | 3 | lazy_static::lazy_static! { 4 | /// 2021-03-09T00:00:00Z. 5 | static ref RUNKV_UNIX_DATE_EPOCH: SystemTime = SystemTime::UNIX_EPOCH + Duration::from_secs(1_615_248_000); 6 | } 7 | 8 | pub fn timestamp() -> u64 { 9 | SystemTime::now() 10 | .duration_since(UNIX_EPOCH) 11 | .unwrap() 12 | .as_millis() as u64 13 | } 14 | 15 | pub fn rtimestamp() -> u64 { 16 | RUNKV_UNIX_DATE_EPOCH.elapsed().unwrap().as_millis() as u64 17 | } 18 | 19 | #[cfg(test)] 20 | mod tests { 21 | use chrono::{Local, TimeZone, Utc}; 22 | use test_log::test; 23 | 24 | use super::*; 25 | 26 | #[test] 27 | fn test_singularity_system_time() { 28 | let utc = Utc.ymd(2021, 3, 9).and_hms(0, 0, 0); 29 | let runkv_dt = Local.from_utc_datetime(&utc.naive_utc()); 30 | let runkv_st = SystemTime::from(runkv_dt); 31 | assert_eq!(runkv_st, *RUNKV_UNIX_DATE_EPOCH); 32 | } 33 | } 34 | -------------------------------------------------------------------------------- /common/src/tracing_slog_drain.rs: -------------------------------------------------------------------------------- 1 | pub struct TracingSlogDrain; 2 | 3 | macro_rules! tracing_event { 4 | ($level:expr, $msg:expr, $filepath:expr, $namespace:expr, $lineno:expr) => { 5 | match $level { 6 | tracing::Level::ERROR => tracing::error!( 7 | code.filepath = $filepath, 8 | code.namespace = $namespace, 9 | code.lineno = $lineno, 10 | "{}", 11 | $msg 12 | ), 13 | tracing::Level::WARN => tracing::warn!( 14 | code.filepath = $filepath, 15 | code.namespace = $namespace, 16 | code.lineno = $lineno, 17 | "{}", 18 | $msg 19 | ), 20 | tracing::Level::INFO => tracing::info!("{}", $msg), 21 | tracing::Level::DEBUG => tracing::debug!("{}", $msg), 22 | tracing::Level::TRACE => tracing::trace!( 23 | code.filepath = $filepath, 24 | code.namespace = $namespace, 25 | code.lineno = $lineno, 26 | "{}", 27 | $msg 28 | ), 29 | } 30 | }; 31 | } 32 | 33 | fn level(level: slog::Level) -> tracing::Level { 34 | match level { 35 | // There is no `Critical` level in `tracing`.
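// `Critical` is therefore mapped to `ERROR`, the closest level `tracing` offers.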
36 | slog::Level::Critical => tracing::Level::ERROR, 37 | slog::Level::Error => tracing::Level::ERROR, 38 | slog::Level::Warning => tracing::Level::WARN, 39 | slog::Level::Info => tracing::Level::INFO, 40 | slog::Level::Debug => tracing::Level::DEBUG, 41 | slog::Level::Trace => tracing::Level::TRACE, 42 | } 43 | } 44 | 45 | struct KvSerializer { 46 | writer: W, 47 | } 48 | 49 | impl KvSerializer { 50 | fn new(writer: W) -> Self { 51 | Self { writer } 52 | } 53 | 54 | fn into_inner(self) -> W { 55 | self.writer 56 | } 57 | 58 | fn write(&mut self, arg: &std::fmt::Arguments) -> slog::Result { 59 | write!(self.writer, "{}", arg)?; 60 | Ok(()) 61 | } 62 | } 63 | 64 | impl slog::Serializer for KvSerializer { 65 | fn emit_arguments(&mut self, key: slog::Key, val: &std::fmt::Arguments) -> slog::Result { 66 | write!(self.writer, " {}={}", key, val)?; 67 | Ok(()) 68 | } 69 | } 70 | 71 | impl slog::Drain for TracingSlogDrain { 72 | type Ok = (); 73 | 74 | type Err = slog::Never; 75 | 76 | fn log( 77 | &self, 78 | record: &slog::Record, 79 | values: &slog::OwnedKVList, 80 | ) -> std::result::Result { 81 | use slog::KV; 82 | 83 | let writer = std::io::Cursor::new(Vec::new()); 84 | let mut serializer = KvSerializer::new(writer); 85 | 86 | serializer.write(record.msg()).unwrap(); 87 | values.serialize(record, &mut serializer).unwrap(); 88 | 89 | let buf = serializer.into_inner().into_inner(); 90 | let s = String::from_utf8_lossy(&buf); 91 | 92 | let level = level(record.level()); 93 | 94 | let location = record.location(); 95 | 96 | tracing_event!( 97 | level, 98 | s.as_ref(), 99 | location.file, 100 | location.module, 101 | location.line 102 | ); 103 | Ok(()) 104 | } 105 | } 106 | -------------------------------------------------------------------------------- /etc/exhauster.toml: -------------------------------------------------------------------------------- 1 | id = 201 2 | host = "127.0.0.1" 3 | port = 12501 4 | data_path = "data" 5 | meta_path = "meta" 6 | heartbeat_interval = "1 s" 7 | 8 | [rudder] 9 | id = 1 10 | host = "127.0.0.1" 11 | port = 12300 12 | 13 | [minio] 14 | url = 'minio://minioadmin:minioadmin@127.0.0.1:9000/runkv' 15 | 16 | [s3] 17 | bucket = "runkv" 18 | 19 | [buffer] 20 | write_buffer_capacity = "64 MiB" 21 | 22 | [cache] 23 | block_cache_capacity = "512 MiB" 24 | meta_cache_capacity = "256 MiB" 25 | -------------------------------------------------------------------------------- /etc/grafana-provisioning/dashboards/runkv-dashboards.yml: -------------------------------------------------------------------------------- 1 | apiVersion: 1 2 | 3 | providers: 4 | - name: 'runkv-overview' 5 | orgId: 1 6 | folder: 'runkv' 7 | folderUid: '' 8 | type: file 9 | options: 10 | path: etc/grafana-dashboards/runkv-overview.json 11 | -------------------------------------------------------------------------------- /etc/grafana-provisioning/datasources/runkv-prometheus.yml: -------------------------------------------------------------------------------- 1 | apiVersion: 1 2 | deleteDatasources: 3 | - name: risedev-prometheus 4 | datasources: 5 | - name: risedev-prometheus 6 | type: prometheus 7 | access: proxy 8 | url: http://127.0.0.1:9091 9 | withCredentials: false 10 | isDefault: false 11 | tlsAuth: false 12 | tlsAuthWithCACert: false 13 | version: 1 14 | editable: true 15 | isDefault: true -------------------------------------------------------------------------------- /etc/lsm_tree.toml: -------------------------------------------------------------------------------- 1 | [lsm_tree] 2 | 
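# Capacities below are human-readable byte sizes (e.g. "64 KiB") and intervals are duration strings (e.g. "1 s").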
l1_capacity = "1 MiB" 3 | level_multiplier = 10 4 | 5 | trigger_l0_compaction_ssts = 4 6 | trigger_l0_compaction_interval = "1 s" 7 | trigger_lmax_compaction_interval = "10 s" 8 | trigger_compaction_interval = "5 s" 9 | 10 | sstable_capacity = "64 KiB" 11 | block_capacity = "4 KiB" 12 | restart_interval = 2 13 | bloom_false_positive = 0.1 14 | 15 | compaction_pin_ttl = "15 s" 16 | 17 | [[lsm_tree.levels_options]] 18 | compaction_strategy = "Overlap" 19 | compression_algorithm = "None" 20 | 21 | [[lsm_tree.levels_options]] 22 | compaction_strategy = "NonOverlap" 23 | compression_algorithm = "None" 24 | 25 | [[lsm_tree.levels_options]] 26 | compaction_strategy = "NonOverlap" 27 | compression_algorithm = "None" 28 | 29 | [[lsm_tree.levels_options]] 30 | compaction_strategy = "NonOverlap" 31 | compression_algorithm = "None" 32 | 33 | [[lsm_tree.levels_options]] 34 | compaction_strategy = "NonOverlap" 35 | compression_algorithm = "Lz4" 36 | 37 | [[lsm_tree.levels_options]] 38 | compaction_strategy = "NonOverlap" 39 | compression_algorithm = "Lz4" 40 | 41 | [[lsm_tree.levels_options]] 42 | compaction_strategy = "NonOverlap" 43 | compression_algorithm = "Lz4" -------------------------------------------------------------------------------- /etc/prometheus.yml: -------------------------------------------------------------------------------- 1 | global: 2 | scrape_interval: 15s 3 | evaluation_interval: 15s 4 | scrape_configs: 5 | - job_name: "prometheus-runkv" 6 | scrape_interval: 1s 7 | static_configs: 8 | - targets: 9 | [ 10 | "127.0.0.1:9890", 11 | "127.0.0.1:9891", 12 | "127.0.0.1:9892", 13 | "127.0.0.1:9893", 14 | "127.0.0.1:9894", 15 | "127.0.0.1:9895", 16 | "127.0.0.1:9896", 17 | "127.0.0.1:9897", 18 | "127.0.0.1:9898", 19 | "127.0.0.1:9899", 20 | ] 21 | -------------------------------------------------------------------------------- /etc/rudder.toml: -------------------------------------------------------------------------------- 1 | id = 1 2 | host = "127.0.0.1" 3 | port = 12301 4 | data_path = "data" 5 | meta_path = "meta" 6 | health_timeout = "10 s" 7 | 8 | [minio] 9 | url = 'minio://minioadmin:minioadmin@127.0.0.1:9000/runkv' 10 | 11 | [s3] 12 | bucket = "runkv" 13 | 14 | [cache] 15 | block_cache_capacity = "0 B" 16 | meta_cache_capacity = "256 MiB" 17 | 18 | [lsm_tree] 19 | trigger_l0_compaction_ssts = 4 20 | trigger_l0_compaction_interval = "1 s" 21 | trigger_compaction_interval = "5 s" 22 | 23 | sstable_capacity = "64 KiB" 24 | block_capacity = "4 KiB" 25 | restart_interval = 2 26 | bloom_false_positive = 0.1 27 | 28 | compaction_pin_ttl = "15 s" 29 | 30 | [[lsm_tree.levels_options]] 31 | compaction_strategy = "Overlap" 32 | compression_algorithm = "None" 33 | 34 | [[lsm_tree.levels_options]] 35 | compaction_strategy = "NonOverlap" 36 | compression_algorithm = "None" 37 | 38 | [[lsm_tree.levels_options]] 39 | compaction_strategy = "NonOverlap" 40 | compression_algorithm = "None" 41 | 42 | [[lsm_tree.levels_options]] 43 | compaction_strategy = "NonOverlap" 44 | compression_algorithm = "None" 45 | 46 | [[lsm_tree.levels_options]] 47 | compaction_strategy = "NonOverlap" 48 | compression_algorithm = "Lz4" 49 | 50 | [[lsm_tree.levels_options]] 51 | compaction_strategy = "NonOverlap" 52 | compression_algorithm = "Lz4" 53 | 54 | [[lsm_tree.levels_options]] 55 | compaction_strategy = "NonOverlap" 56 | compression_algorithm = "Lz4" 57 | -------------------------------------------------------------------------------- /etc/wheel.toml: 
-------------------------------------------------------------------------------- 1 | id = 101 2 | host = "127.0.0.1" 3 | port = 12401 4 | log = ".run/log/" 5 | data_path = "data" 6 | meta_path = "meta" 7 | poll_interval = "100ms" 8 | heartbeat_interval = "100ms" 9 | 10 | [rudder] 11 | id = 1 12 | host = "127.0.0.1" 13 | port = 12301 14 | 15 | [minio] 16 | url = 'minio://minioadmin:minioadmin@127.0.0.1:9000/runkv' 17 | 18 | [s3] 19 | bucket = "runkv" 20 | 21 | [buffer] 22 | write_buffer_capacity = "64 MiB" 23 | 24 | [cache] 25 | block_cache_capacity = "512 MiB" 26 | meta_cache_capacity = "256 MiB" 27 | 28 | [raft_log_store] 29 | log_dir_path = "/path/to/log/dir" 30 | log_file_capacity = "64 MiB" 31 | block_cache_capacity = "256 MiB" 32 | persist = "sync" 33 | 34 | [lsm_tree] 35 | l1_capacity = "1 MiB" 36 | level_multiplier = 10 37 | 38 | trigger_l0_compaction_ssts = 4 39 | trigger_l0_compaction_interval = "1 s" 40 | trigger_compaction_interval = "5 s" 41 | 42 | sstable_capacity = "64 KiB" 43 | block_capacity = "4 KiB" 44 | restart_interval = 2 45 | bloom_false_positive = 0.1 46 | 47 | compaction_pin_ttl = "15 s" 48 | 49 | [[lsm_tree.levels_options]] 50 | compaction_strategy = "Overlap" 51 | compression_algorithm = "None" 52 | 53 | [[lsm_tree.levels_options]] 54 | compaction_strategy = "NonOverlap" 55 | compression_algorithm = "None" 56 | 57 | [[lsm_tree.levels_options]] 58 | compaction_strategy = "NonOverlap" 59 | compression_algorithm = "None" 60 | 61 | [[lsm_tree.levels_options]] 62 | compaction_strategy = "NonOverlap" 63 | compression_algorithm = "None" 64 | 65 | [[lsm_tree.levels_options]] 66 | compaction_strategy = "NonOverlap" 67 | compression_algorithm = "Lz4" 68 | 69 | [[lsm_tree.levels_options]] 70 | compaction_strategy = "NonOverlap" 71 | compression_algorithm = "Lz4" 72 | 73 | [[lsm_tree.levels_options]] 74 | compaction_strategy = "NonOverlap" 75 | compression_algorithm = "Lz4" 76 | 77 | [tiered_cache] 78 | type = "FileCache" 79 | [tiered_cache.args] 80 | dir = "/path/to/file/cache/dir" 81 | capacity = "256 MiB" 82 | total_buffer_capacity = "64 MiB" 83 | cache_file_fallocate_unit = "64 MiB" 84 | cache_meta_fallocate_unit = "16 MiB" 85 | cache_file_max_write_size = "2 MiB" -------------------------------------------------------------------------------- /exhauster/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "runkv-exhauster" 3 | version = "0.1.0" 4 | edition = "2021" 5 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html 6 | 7 | [dependencies] 8 | anyhow = "1.0" 9 | async-trait = "0.1" 10 | bytes = "1" 11 | bytesize = { version = "1.1.0", features = ["serde"] } 12 | clap = { version = "3.1.6", features = ["derive"] } 13 | humantime = "2.1.0" 14 | humantime-serde = "1.1.1" 15 | itertools = "0.10.3" 16 | parking_lot = "0.12" 17 | prost = "0.9" 18 | runkv-common = { path = "../common" } 19 | runkv-proto = { path = "../proto" } 20 | runkv-storage = { path = "../storage" } 21 | serde = "1.0" 22 | serde_derive = "1.0" 23 | thiserror = "1.0" 24 | tokio = { version = "1", features = [ 25 | "rt-multi-thread", 26 | "sync", 27 | "macros", 28 | "time", 29 | ] } 30 | toml = "0.4.2" 31 | tonic = "0.6.2" 32 | tracing = "0.1" 33 | tracing-subscriber = "0.3" 34 | 35 | [target.'cfg(not(target_env = "msvc"))'.dependencies] 36 | tikv-jemallocator = "0.4.3" 37 | 38 | [dev-dependencies] 39 | env_logger = "*" 40 | test-log = "0.2.10" 41 | 42 | [features] 43 | verbose-release-log = 
["tracing/release_max_level_trace"] 44 | -------------------------------------------------------------------------------- /exhauster/src/compaction_filter.rs: -------------------------------------------------------------------------------- 1 | use bytes::Bytes; 2 | 3 | pub trait CompactionFilter { 4 | /// Keep the key value pair if `filter` returns true. 5 | fn filter(&mut self, key: &[u8], value: Option<&[u8]>, sequence: u64) -> bool; 6 | } 7 | 8 | pub struct DefaultCompactionFilter { 9 | last_key: Bytes, 10 | watermark: u64, 11 | _remove_tombstone: bool, 12 | } 13 | 14 | impl DefaultCompactionFilter { 15 | pub fn new(watermark: u64, remove_tombstone: bool) -> Self { 16 | Self { 17 | last_key: Bytes::default(), 18 | watermark, 19 | _remove_tombstone: remove_tombstone, 20 | } 21 | } 22 | } 23 | 24 | impl CompactionFilter for DefaultCompactionFilter { 25 | fn filter(&mut self, key: &[u8], _value: Option<&[u8]>, sequence: u64) -> bool { 26 | let mut retain = true; 27 | // TODO: Handle `remove_tombstone`. 28 | if key == self.last_key && sequence < self.watermark { 29 | retain = false; 30 | } 31 | self.last_key = Bytes::copy_from_slice(key); 32 | retain 33 | } 34 | } 35 | 36 | #[cfg(test)] 37 | mod tests { 38 | 39 | use test_log::test; 40 | 41 | use super::*; 42 | 43 | #[test] 44 | fn test_default_compaction_filter() { 45 | #[allow(clippy::type_complexity)] 46 | let dataset: Vec<(&[u8], Option<&[u8]>, u64, bool)> = vec![ 47 | (b"k1", Some(b"v1-20"), 20, true), 48 | (b"k1", Some(b"v1-10"), 10, true), 49 | (b"k1", Some(b"v1-1"), 1, false), 50 | (b"k2", None, 1, true), 51 | (b"k3", Some(b"v3-100"), 100, true), 52 | (b"k3", None, 15, true), 53 | (b"k3", None, 8, false), 54 | (b"k3", Some(b"v3-100"), 100, true), 55 | (b"k4", None, 100, true), 56 | (b"k4", Some(b"v4-20"), 20, true), 57 | (b"k4", Some(b"v4-8"), 8, false), 58 | (b"k4", None, 1, false), 59 | ]; 60 | let mut filter = DefaultCompactionFilter::new(10, false); 61 | for data in dataset { 62 | assert_eq!(filter.filter(data.0, data.1, data.2), data.3) 63 | } 64 | } 65 | } 66 | -------------------------------------------------------------------------------- /exhauster/src/config.rs: -------------------------------------------------------------------------------- 1 | use runkv_common::config::{CacheConfig, MinioConfig, Node, S3Config}; 2 | use serde::Deserialize; 3 | 4 | #[derive(Deserialize, Clone, Debug)] 5 | pub struct ExhausterConfig { 6 | pub id: u64, 7 | pub host: String, 8 | pub port: u16, 9 | pub data_path: String, 10 | pub meta_path: String, 11 | pub heartbeat_interval: String, 12 | pub rudder: Node, 13 | pub s3: Option, 14 | pub minio: Option, 15 | pub cache: CacheConfig, 16 | } 17 | -------------------------------------------------------------------------------- /exhauster/src/error.rs: -------------------------------------------------------------------------------- 1 | pub type Result = std::result::Result; 2 | 3 | pub fn err(e: impl Into>) -> anyhow::Error { 4 | anyhow::anyhow!("error: {}", e.into()) 5 | } 6 | 7 | pub fn config_err(e: impl Into>) -> anyhow::Error { 8 | anyhow::anyhow!("config error: {}", e.into()) 9 | } 10 | -------------------------------------------------------------------------------- /exhauster/src/lib.rs: -------------------------------------------------------------------------------- 1 | #![allow(clippy::box_default)] 2 | 3 | pub mod compaction_filter; 4 | pub mod config; 5 | pub mod error; 6 | pub mod partitioner; 7 | pub mod service; 8 | pub mod worker; 9 | 10 | use std::sync::Arc; 11 | 12 | use 
bytesize::ByteSize; 13 | use config::ExhausterConfig; 14 | use error::{config_err, err, Result}; 15 | use runkv_common::channel_pool::ChannelPool; 16 | use runkv_common::BoxedWorker; 17 | use runkv_proto::common::Endpoint as PbEndpoint; 18 | use runkv_proto::exhauster::exhauster_service_server::ExhausterServiceServer; 19 | use runkv_storage::components::{ 20 | BlockCache, LsmTreeMetrics, LsmTreeMetricsRef, SstableStore, SstableStoreOptions, 21 | SstableStoreRef, 22 | }; 23 | use runkv_storage::tiered_cache::TieredCache; 24 | use runkv_storage::{MemObjectStore, ObjectStoreRef, S3ObjectStore}; 25 | use service::{Exhauster, ExhausterOptions}; 26 | use tonic::transport::Server; 27 | use tracing::info; 28 | use worker::heartbeater::{Heartbeater, HeartbeaterOptions}; 29 | 30 | pub async fn bootstrap_exhauster( 31 | config: &ExhausterConfig, 32 | exhauster: Exhauster, 33 | workers: Vec, 34 | ) -> Result<()> { 35 | let addr_str = format!("{}:{}", config.host, config.port); 36 | 37 | for mut worker in workers.into_iter() { 38 | tokio::spawn(async move { worker.run().await }); 39 | } 40 | 41 | Server::builder() 42 | .add_service(ExhausterServiceServer::new(exhauster)) 43 | .serve(addr_str.parse().map_err(config_err)?) 44 | .await 45 | .map_err(err) 46 | } 47 | 48 | pub async fn build_exhauster(config: &ExhausterConfig) -> Result<(Exhauster, Vec)> { 49 | let object_store = build_object_store(config).await; 50 | build_exhauster_with_object_store(config, object_store).await 51 | } 52 | 53 | pub async fn build_exhauster_with_object_store( 54 | config: &ExhausterConfig, 55 | object_store: ObjectStoreRef, 56 | ) -> Result<(Exhauster, Vec)> { 57 | let lsm_tree_metrics = Arc::new(LsmTreeMetrics::new(config.id)); 58 | 59 | let sstable_store = build_sstable_store(config, object_store, lsm_tree_metrics)?; 60 | 61 | let options = ExhausterOptions { 62 | node: config.id, 63 | sstable_store, 64 | // TODO: Restore from persistent store. 65 | sstable_sequential_id: 1, 66 | }; 67 | 68 | let channel_pool = build_channel_pool(config); 69 | 70 | let heartbeater_options = HeartbeaterOptions { 71 | node_id: config.id, 72 | endpoint: PbEndpoint { 73 | host: config.host.clone(), 74 | port: config.port as u32, 75 | }, 76 | channel_pool, 77 | rudder_node_id: config.rudder.id, 78 | heartbeat_interval: config 79 | .heartbeat_interval 80 | .parse::()? 
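// `heartbeat_interval` is a human-readable duration string such as "1 s" (see etc/exhauster.toml).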
81 | .into(), 82 | }; 83 | let heartbeater = Box::new(Heartbeater::new(heartbeater_options)); 84 | 85 | let exhauster = Exhauster::new(options); 86 | 87 | Ok((exhauster, vec![heartbeater])) 88 | } 89 | 90 | async fn build_object_store(config: &ExhausterConfig) -> ObjectStoreRef { 91 | if let Some(c) = &config.s3 { 92 | info!("s3 config found, create s3 object store"); 93 | Arc::new(S3ObjectStore::new(c.bucket.clone()).await) 94 | } else if let Some(c) = &config.minio { 95 | info!("minio config found, create minio object store"); 96 | Arc::new(S3ObjectStore::new_with_minio(&c.url).await) 97 | } else { 98 | info!("no object store config found, create default memory object store"); 99 | Arc::new(MemObjectStore::default()) 100 | } 101 | } 102 | 103 | fn build_sstable_store( 104 | config: &ExhausterConfig, 105 | object_store: ObjectStoreRef, 106 | metrics: LsmTreeMetricsRef, 107 | ) -> Result { 108 | let block_cache = BlockCache::new(0, metrics); 109 | let sstable_store_options = SstableStoreOptions { 110 | path: config.data_path.clone(), 111 | object_store, 112 | block_cache, 113 | meta_cache_capacity: config 114 | .cache 115 | .meta_cache_capacity 116 | .parse::() 117 | .map_err(config_err)? 118 | .0 as usize, 119 | tiered_cache: TieredCache::none(), 120 | }; 121 | let sstable_store = SstableStore::new(sstable_store_options); 122 | Ok(Arc::new(sstable_store)) 123 | } 124 | 125 | fn build_channel_pool(config: &ExhausterConfig) -> ChannelPool { 126 | ChannelPool::with_nodes(vec![config.rudder.clone()]) 127 | } 128 | -------------------------------------------------------------------------------- /exhauster/src/main.rs: -------------------------------------------------------------------------------- 1 | #[cfg(not(target_env = "msvc"))] 2 | use tikv_jemallocator::Jemalloc; 3 | 4 | #[cfg(not(target_env = "msvc"))] 5 | #[global_allocator] 6 | static GLOBAL: Jemalloc = Jemalloc; 7 | 8 | use std::fs::read_to_string; 9 | 10 | use clap::Parser; 11 | use runkv_exhauster::config::ExhausterConfig; 12 | use runkv_exhauster::error::{config_err, Result}; 13 | use runkv_exhauster::{bootstrap_exhauster, build_exhauster}; 14 | use tracing::info; 15 | use tracing_subscriber::FmtSubscriber; 16 | 17 | #[derive(Parser, Debug)] 18 | struct Args { 19 | #[clap(short, long, default_value = "etc/exhauster.toml")] 20 | config_file_path: String, 21 | } 22 | 23 | #[tokio::main] 24 | async fn main() -> Result<()> { 25 | let subscriber = FmtSubscriber::new(); 26 | tracing::subscriber::set_global_default(subscriber)?; 27 | 28 | let args = Args::parse(); 29 | info!("args: {:?}", args); 30 | 31 | let config: ExhausterConfig = 32 | toml::from_str(&read_to_string(&args.config_file_path)?).map_err(config_err)?; 33 | info!("config: {:?}", config); 34 | 35 | let (exhauster, workers) = build_exhauster(&config).await?; 36 | bootstrap_exhauster(&config, exhauster, workers).await 37 | } 38 | -------------------------------------------------------------------------------- /exhauster/src/partitioner.rs: -------------------------------------------------------------------------------- 1 | use bytes::Bytes; 2 | 3 | pub trait Partitioner: Send + Sync + 'static { 4 | /// Finish building current sstable if returns true. 
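/// (For example, `DefaultPartitioner` below returns `true` the first time a key reaches each pre-sorted partition point.)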
5 | fn partition(&mut self, key: &[u8], value: Option<&[u8]>, sequence: u64) -> bool; 6 | } 7 | 8 | pub type BoxedPartitioner = Box; 9 | 10 | pub struct DefaultPartitioner { 11 | partition_points: Vec, 12 | offset: usize, 13 | } 14 | 15 | impl DefaultPartitioner { 16 | pub fn new(mut partition_points: Vec) -> Self { 17 | partition_points.sort(); 18 | Self { 19 | partition_points, 20 | offset: 0, 21 | } 22 | } 23 | } 24 | 25 | impl Partitioner for DefaultPartitioner { 26 | fn partition(&mut self, key: &[u8], _value: Option<&[u8]>, _sequence: u64) -> bool { 27 | if self.offset >= self.partition_points.len() { 28 | return false; 29 | } 30 | if key >= self.partition_points[self.offset] { 31 | self.offset += 1; 32 | return true; 33 | } 34 | false 35 | } 36 | } 37 | 38 | #[derive(Default)] 39 | pub struct NoPartitioner; 40 | 41 | impl Partitioner for NoPartitioner { 42 | fn partition(&mut self, _key: &[u8], _value: Option<&[u8]>, _sequence: u64) -> bool { 43 | false 44 | } 45 | } 46 | -------------------------------------------------------------------------------- /exhauster/src/worker/heartbeater.rs: -------------------------------------------------------------------------------- 1 | use std::time::Duration; 2 | 3 | use async_trait::async_trait; 4 | use runkv_common::channel_pool::ChannelPool; 5 | use runkv_common::Worker; 6 | use runkv_proto::common::Endpoint as PbEndpoint; 7 | use runkv_proto::rudder::rudder_service_client::RudderServiceClient; 8 | use runkv_proto::rudder::{heartbeat_request, ExhausterHeartbeatRequest, HeartbeatRequest}; 9 | use tonic::Request; 10 | use tracing::warn; 11 | 12 | use crate::error::{err, Result}; 13 | 14 | pub struct HeartbeaterOptions { 15 | pub node_id: u64, 16 | pub endpoint: PbEndpoint, 17 | pub channel_pool: ChannelPool, 18 | pub rudder_node_id: u64, 19 | pub heartbeat_interval: Duration, 20 | } 21 | 22 | pub struct Heartbeater { 23 | node_id: u64, 24 | endpoint: PbEndpoint, 25 | channel_pool: ChannelPool, 26 | rudder_node_id: u64, 27 | heartbeat_interval: Duration, 28 | } 29 | 30 | impl Heartbeater { 31 | pub fn new(options: HeartbeaterOptions) -> Self { 32 | Self { 33 | node_id: options.node_id, 34 | endpoint: options.endpoint, 35 | channel_pool: options.channel_pool, 36 | rudder_node_id: options.rudder_node_id, 37 | heartbeat_interval: options.heartbeat_interval, 38 | } 39 | } 40 | 41 | async fn run_inner(&mut self) -> Result<()> { 42 | tokio::time::sleep(self.heartbeat_interval).await; 43 | let req = HeartbeatRequest { 44 | node_id: self.node_id, 45 | endpoint: Some(self.endpoint.clone()), 46 | heartbeat_message: Some(heartbeat_request::HeartbeatMessage::ExhausterHeartbeat( 47 | ExhausterHeartbeatRequest {}, 48 | )), 49 | }; 50 | let request = Request::new(req); 51 | let mut client = RudderServiceClient::new( 52 | self.channel_pool 53 | .get(self.rudder_node_id) 54 | .await 55 | .map_err(err)?, 56 | ); 57 | let _rsp = client.heartbeat(request).await?.into_inner(); 58 | Ok(()) 59 | } 60 | } 61 | 62 | #[async_trait] 63 | impl Worker for Heartbeater { 64 | async fn run(&mut self) -> anyhow::Result<()> { 65 | // TODO: Gracefully kill. 
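// One possible shape (a sketch only, not the current implementation):
//
//     tokio::select! {
//         _ = &mut shutdown_rx => break,
//         result = self.run_inner() => { /* log errors as below */ }
//     }
//
// where `shutdown_rx` would be a hypothetical shutdown receiver passed in via `HeartbeaterOptions`.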
66 | loop { 67 | match self.run_inner().await { 68 | Ok(_) => {} 69 | Err(e) => { 70 | warn!("error occur when heartbeater running: {}", e); 71 | } 72 | } 73 | } 74 | } 75 | } 76 | -------------------------------------------------------------------------------- /exhauster/src/worker/mod.rs: -------------------------------------------------------------------------------- 1 | pub mod heartbeater; 2 | -------------------------------------------------------------------------------- /make/common.toml: -------------------------------------------------------------------------------- 1 | [env] 2 | OS = { source = "${CARGO_MAKE_RUST_TARGET_OS}", mapping = { linux = "linux", macos = "darwin" } } 3 | ARCH = { source = "${CARGO_MAKE_RUST_TARGET_ARCH}", mapping = { x86_64 = "amd64", aarch64 = "arm64" } } 4 | SYSTEM = "${OS}-${ARCH}" 5 | SYSTEM_AMD64 = "${OS}-amd64" # some components do not support darwin-arm64 for now, use amd64 for fallback 6 | PREFIX = "${PWD}/.run" 7 | PREFIX_USR_BIN = "${PWD}/.bin" 8 | PREFIX_BIN = "${PREFIX}/bin" 9 | PREFIX_CONFIG = "${PREFIX}/config" 10 | PREFIX_DATA = "${PREFIX}/data" 11 | PREFIX_LOG = "${PREFIX}/log" 12 | PREFIX_TMP = "${PREFIX}/tmp" 13 | 14 | [tasks.prepare] 15 | private = true 16 | category = "Misc" 17 | description = "Create .run folder for temporyary files and data." 18 | script = ''' 19 | #!@duckscript 20 | echo "Using ${PREFIX} as base folder." 21 | mkdir "${PREFIX}" 22 | mkdir "${PREFIX_BIN}" 23 | mkdir "${PREFIX_TMP}" 24 | mkdir "${PREFIX_DATA}" 25 | mkdir "${PREFIX_LOG}" 26 | ''' 27 | -------------------------------------------------------------------------------- /make/grafana.toml: -------------------------------------------------------------------------------- 1 | extend = "common.toml" 2 | 3 | [env] 4 | GRAFANA_SYSTEM = "${SYSTEM_AMD64}" 5 | GRAFANA_DOWNLOAD_PATH = "${PREFIX_TMP}/grafana.tar.gz" 6 | GRAFANA_VERSION = "8.5.1" 7 | GRAFANA_RELEASE = "grafana-${GRAFANA_VERSION}" 8 | GRAFANA_DOWNLOAD_TAR_GZ = "https://dl.grafana.com/oss/release/${GRAFANA_RELEASE}.${GRAFANA_SYSTEM}.tar.gz" 9 | 10 | [tasks.download-grafana] 11 | category = "Grafana" 12 | dependencies = ["prepare"] 13 | description = "Download and extract Grafana" 14 | script = ''' 15 | #!/bin/bash 16 | set -e 17 | if [ -d "${PREFIX_BIN}/grafana" ]; then 18 | exit 0 19 | fi 20 | echo "Grafana Server not found, downloading" 21 | curl -fL -o "${GRAFANA_DOWNLOAD_PATH}" "${GRAFANA_DOWNLOAD_TAR_GZ}" 22 | tar -xf "${GRAFANA_DOWNLOAD_PATH}" -C "${PREFIX_TMP}" 23 | mv "${PREFIX_TMP}/${GRAFANA_RELEASE}" "${PREFIX_BIN}/grafana" 24 | echo "grafana download success" 25 | ''' 26 | -------------------------------------------------------------------------------- /make/jaeger.toml: -------------------------------------------------------------------------------- 1 | extend = "common.toml" 2 | 3 | [env] 4 | JAEGER_SYSTEM = "${SYSTEM}" 5 | JAEGER_DOWNLOAD_TAR = "https://github.com/jaegertracing/jaeger/releases/download/v1.33.0/jaeger-1.33.0-${JAEGER_SYSTEM}.tar.gz" 6 | 7 | [tasks.download-jaeger] 8 | category = "Tracing" 9 | dependencies = ["prepare"] 10 | description = "Download and extract Jaeger." 
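# Like the MinIO and Prometheus tasks, this is a no-op if the binary already exists under ${PREFIX_BIN}; otherwise it downloads into ${PREFIX_TMP} and moves the binary into place.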
11 | script = ''' 12 | #!/bin/bash 13 | set -e 14 | if [ -f "${PREFIX_BIN}/jaeger" ]; then 15 | exit 0 16 | fi 17 | echo "Jaeger not found, downloading" 18 | curl -fL -o "${PREFIX_TMP}/jaeger.tar.gz" "${JAEGER_DOWNLOAD_TAR}" 19 | tar -C "${PREFIX_TMP}/" -xzf "${PREFIX_TMP}/jaeger.tar.gz" 20 | chmod +x "${PREFIX_TMP}/jaeger-1.33.0-${JAEGER_SYSTEM}/jaeger-all-in-one" 21 | mv "${PREFIX_TMP}/jaeger-1.33.0-${JAEGER_SYSTEM}/jaeger-all-in-one" "${PREFIX_BIN}/jaeger" 22 | 23 | "${PREFIX_BIN}/jaeger" version 24 | ''' 25 | -------------------------------------------------------------------------------- /make/minio.toml: -------------------------------------------------------------------------------- 1 | extend = "common.toml" 2 | 3 | [env] 4 | MINIO_SYSTEM = "${SYSTEM}" 5 | MCLI_DOWNLOAD_BIN = "https://dl.min.io/client/mc/release/${MINIO_SYSTEM}/mc" 6 | MINIO_DOWNLOAD_BIN = "https://dl.min.io/server/minio/release/${MINIO_SYSTEM}/minio" 7 | 8 | [tasks.download-minio] 9 | category = "MinIO" 10 | dependencies = ["prepare"] 11 | description = "Download and extract MinIO." 12 | script = ''' 13 | #!/bin/bash 14 | set -e 15 | if [ -f "${PREFIX_BIN}/minio" ]; then 16 | exit 0 17 | fi 18 | echo "MinIO Server not found, downloading" 19 | curl -fL -o "${PREFIX_TMP}/minio" "${MINIO_DOWNLOAD_BIN}" 20 | chmod +x "${PREFIX_TMP}/minio" 21 | mv "${PREFIX_TMP}/minio" "${PREFIX_BIN}/minio" 22 | 23 | "${PREFIX_BIN}/minio" --version 24 | ''' 25 | 26 | [tasks.download-mcli] 27 | category = "MinIO" 28 | dependencies = ["prepare"] 29 | description = "Download and extract MinIO Client." 30 | script = ''' 31 | #!/bin/bash 32 | set -e 33 | if [ -f "${PREFIX_BIN}/mcli" ]; then 34 | exit 0 35 | fi 36 | echo "MinIO Client not found, downloading" 37 | curl -fL -o "${PREFIX_TMP}/mcli" "${MCLI_DOWNLOAD_BIN}" 38 | chmod +x "${PREFIX_TMP}/mcli" 39 | mv "${PREFIX_TMP}/mcli" "${PREFIX_BIN}/mcli" 40 | 41 | "${PREFIX_BIN}/mcli" --version 42 | ''' 43 | -------------------------------------------------------------------------------- /make/prometheus.toml: -------------------------------------------------------------------------------- 1 | extend = "common.toml" 2 | 3 | [env] 4 | PROMETHEUS_SYSTEM = "${SYSTEM}" 5 | PROMETHEUS_DOWNLOAD_PATH = "${PREFIX_TMP}/prometheus.tar.gz" 6 | PROMETHEUS_VERSION = "2.32.1" 7 | PROMETHEUS_RELEASE = "prometheus-${PROMETHEUS_VERSION}.${PROMETHEUS_SYSTEM}" 8 | PROMETHEUS_DOWNLOAD_TAR_GZ = "https://github.com/prometheus/prometheus/releases/download/v${PROMETHEUS_VERSION}/${PROMETHEUS_RELEASE}.tar.gz" 9 | 10 | [tasks.download-prometheus] 11 | category = "Metrics" 12 | dependencies = ["prepare"] 13 | description = "Download and extract Prometheus" 14 | script = ''' 15 | #!/bin/bash 16 | set -e 17 | if [ -d "${PREFIX_BIN}/prometheus" ]; then 18 | exit 0 19 | fi 20 | echo "Prometheus not found, downloading" 21 | curl -fL -o "${PROMETHEUS_DOWNLOAD_PATH}" "${PROMETHEUS_DOWNLOAD_TAR_GZ}" 22 | tar -xf "${PROMETHEUS_DOWNLOAD_PATH}" -C "${PREFIX_TMP}" 23 | mv "${PREFIX_TMP}/${PROMETHEUS_RELEASE}" "${PREFIX_BIN}/prometheus" 24 | ''' 25 | -------------------------------------------------------------------------------- /proto/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "runkv-proto" 3 | version = "0.1.0" 4 | edition = "2021" 5 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html 6 | 7 | [dependencies] 8 | anyhow = "1.0" 9 | bytes = "1" 10 | prost = "0.9" 11 | runkv-common = { path = "../common" } 12 | 
serde = "1.0" 13 | serde_derive = "1.0" 14 | tonic = "0.6.2" 15 | 16 | [build-dependencies] 17 | prost-build = "0.9" 18 | tonic-build = "0.6.2" 19 | -------------------------------------------------------------------------------- /proto/build.rs: -------------------------------------------------------------------------------- 1 | fn main() { 2 | tonic_build::configure() 3 | .type_attribute(".", "#[derive(serde::Serialize, serde::Deserialize)]") 4 | .compile( 5 | &[ 6 | "src/proto/common.proto", 7 | "src/proto/manifest.proto", 8 | "src/proto/meta.proto", 9 | "src/proto/rudder.proto", 10 | "src/proto/wheel.proto", 11 | "src/proto/exhauster.proto", 12 | "src/proto/kv.proto", 13 | ], 14 | &["src/proto"], 15 | ) 16 | .unwrap() 17 | } 18 | -------------------------------------------------------------------------------- /proto/src/lib.rs: -------------------------------------------------------------------------------- 1 | pub mod common { 2 | #![allow(clippy::all)] 3 | tonic::include_proto!("common"); 4 | } 5 | 6 | pub mod manifest { 7 | #![allow(clippy::all)] 8 | tonic::include_proto!("manifest"); 9 | } 10 | 11 | pub mod meta { 12 | #![allow(clippy::all)] 13 | tonic::include_proto!("meta"); 14 | 15 | impl Eq for KeyRange {} 16 | 17 | impl PartialOrd for KeyRange { 18 | fn partial_cmp(&self, other: &Self) -> Option { 19 | Some(self.cmp(&other)) 20 | } 21 | } 22 | 23 | impl Ord for KeyRange { 24 | fn cmp(&self, other: &Self) -> std::cmp::Ordering { 25 | self.start_key.cmp(&other.start_key) 26 | } 27 | } 28 | } 29 | 30 | pub mod rudder { 31 | #![allow(clippy::all)] 32 | tonic::include_proto!("rudder"); 33 | } 34 | 35 | pub mod wheel { 36 | #![allow(clippy::all)] 37 | tonic::include_proto!("wheel"); 38 | } 39 | 40 | pub mod exhauster { 41 | #![allow(clippy::all)] 42 | tonic::include_proto!("exhauster"); 43 | } 44 | 45 | pub mod kv { 46 | #![allow(clippy::all)] 47 | tonic::include_proto!("kv"); 48 | 49 | use runkv_common::coding::BytesSerde; 50 | 51 | impl<'de> BytesSerde<'de> for KvRequest {} 52 | impl<'de> BytesSerde<'de> for KvResponse {} 53 | 54 | impl KvRequest { 55 | pub fn r#type(&self) -> Type { 56 | let mut r#type = Type::TNone; 57 | for op in self.ops.iter() { 58 | match op.r#type() { 59 | OpType::None => {} 60 | OpType::Get => match r#type { 61 | Type::TNone => r#type = Type::TGet, 62 | Type::TGet | Type::TTxn => {} 63 | _ => r#type = Type::TTxn, 64 | }, 65 | OpType::Put => match r#type { 66 | Type::TNone => r#type = Type::TPut, 67 | Type::TPut | Type::TTxn => {} 68 | _ => r#type = Type::TTxn, 69 | }, 70 | OpType::Delete => match r#type { 71 | Type::TNone => r#type = Type::TDelete, 72 | Type::TDelete | Type::TTxn => {} 73 | _ => r#type = Type::TTxn, 74 | }, 75 | OpType::Snapshot => match r#type { 76 | Type::TNone => r#type = Type::TSnapshot, 77 | Type::TSnapshot | Type::TTxn => {} 78 | _ => r#type = Type::TTxn, 79 | }, 80 | } 81 | } 82 | r#type 83 | } 84 | 85 | pub fn is_read_only(&self) -> bool { 86 | for op in self.ops.iter() { 87 | match op.r#type() { 88 | OpType::None | OpType::Get | OpType::Snapshot => {} 89 | OpType::Put | OpType::Delete => return false, 90 | } 91 | } 92 | true 93 | } 94 | } 95 | } 96 | -------------------------------------------------------------------------------- /proto/src/proto/buf.yaml: -------------------------------------------------------------------------------- 1 | version: v1 2 | lint: 3 | use: 4 | - DEFAULT 5 | except: 6 | - ENUM_VALUE_PREFIX 7 | - ENUM_ZERO_VALUE_SUFFIX 8 | - PACKAGE_VERSION_SUFFIX 9 | 10 | # We guarantee that every file is one package. 
So this check isn't necessary. 11 | - DIRECTORY_SAME_PACKAGE 12 | - PACKAGE_DIRECTORY_MATCH 13 | -------------------------------------------------------------------------------- /proto/src/proto/common.proto: -------------------------------------------------------------------------------- 1 | syntax = "proto3"; 2 | 3 | package common; 4 | 5 | message Endpoint { 6 | string host = 1; 7 | uint32 port = 2; 8 | } 9 | -------------------------------------------------------------------------------- /proto/src/proto/exhauster.proto: -------------------------------------------------------------------------------- 1 | syntax = "proto3"; 2 | 3 | package exhauster; 4 | 5 | import "manifest.proto"; 6 | 7 | message CompactionRequest { 8 | repeated uint64 sst_ids = 1; 9 | uint64 watermark = 2; 10 | uint64 sstable_capacity = 3; 11 | uint64 block_capacity = 4; 12 | uint64 restart_interval = 5; 13 | double bloom_false_positive = 6; 14 | uint64 compression_algorithm = 7; 15 | bool remove_tombstone = 8; 16 | repeated bytes partition_points = 9; 17 | } 18 | 19 | message CompactionResponse { 20 | repeated manifest.SstableInfo old_sst_infos = 1; 21 | repeated manifest.SstableInfo new_sst_infos = 2; 22 | } 23 | 24 | service ExhausterService { 25 | rpc Compaction(CompactionRequest) returns (CompactionResponse); 26 | } 27 | -------------------------------------------------------------------------------- /proto/src/proto/kv.proto: -------------------------------------------------------------------------------- 1 | syntax = "proto3"; 2 | 3 | package kv; 4 | 5 | enum ErrCode { 6 | OK = 0; 7 | REDIRECT = 1; 8 | } 9 | 10 | enum OpType { 11 | NONE = 0; 12 | GET = 1; // (key[, sequence]) -> (value) 13 | PUT = 2; // (key, value) -> () 14 | DELETE = 3; // (key) -> () 15 | SNAPSHOT = 4; // () -> (sequence) 16 | } 17 | 18 | enum Type { 19 | T_NONE = 0; 20 | T_GET = 1; 21 | T_PUT = 2; 22 | T_DELETE = 3; 23 | T_SNAPSHOT = 4; 24 | T_TXN = 5; 25 | } 26 | 27 | message Op { 28 | OpType type = 1; 29 | bytes key = 2; 30 | bytes value = 3; 31 | uint64 sequence = 4; 32 | } 33 | 34 | message KvRequest { 35 | repeated Op ops = 1; 36 | // target raft node id 37 | uint64 target = 2; 38 | } 39 | 40 | message KvResponse { 41 | repeated Op ops = 1; 42 | ErrCode err = 2; 43 | } 44 | 45 | service KvService { 46 | rpc Kv(KvRequest) returns (KvResponse); 47 | } 48 | -------------------------------------------------------------------------------- /proto/src/proto/manifest.proto: -------------------------------------------------------------------------------- 1 | syntax = "proto3"; 2 | 3 | package manifest; 4 | 5 | enum SstableOp { 6 | INSERT = 0; 7 | DELETE = 1; 8 | } 9 | 10 | message SstableDiff { 11 | uint64 id = 1; 12 | uint64 level = 2; 13 | SstableOp op = 3; 14 | uint64 data_size = 4; 15 | } 16 | 17 | message VersionDiff { 18 | uint64 id = 1; 19 | repeated SstableDiff sstable_diffs = 2; 20 | } 21 | 22 | message SstableInfo { 23 | uint64 id = 1; 24 | uint64 data_size = 2; 25 | } 26 | -------------------------------------------------------------------------------- /proto/src/proto/meta.proto: -------------------------------------------------------------------------------- 1 | syntax = "proto3"; 2 | 3 | package meta; 4 | 5 | import "common.proto"; 6 | 7 | // Assume [`KeyRange`] does not overlaps. 
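// That is, ranges are expected to be disjoint; rudder rejects overlapping ranges (see `ControlError::KeyRangeOverlaps`).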
8 | message KeyRange { 9 | bytes start_key = 1; 10 | bytes end_key = 2; 11 | } 12 | 13 | message KeyRangeInfo { 14 | // raft group id 15 | uint64 group = 1; 16 | // key range 17 | meta.KeyRange key_range = 2; 18 | // { raft node id -> node id } 19 | map raft_nodes = 3; 20 | // leader raft node id 21 | // Used by query router info. 22 | uint64 leader = 4; 23 | } 24 | 25 | message WheelMeta { 26 | uint64 id = 1; 27 | KeyRange key_range = 2; 28 | common.Endpoint endpoint = 3; 29 | } 30 | -------------------------------------------------------------------------------- /proto/src/proto/prototool.yaml: -------------------------------------------------------------------------------- 1 | protoc: 2 | version: 3.17.3 3 | lint: 4 | group: google -------------------------------------------------------------------------------- /proto/src/proto/rudder.proto: -------------------------------------------------------------------------------- 1 | syntax = "proto3"; 2 | 3 | package rudder; 4 | 5 | import "common.proto"; 6 | import "manifest.proto"; 7 | import "meta.proto"; 8 | 9 | message RaftState { 10 | bool is_leader = 1; 11 | } 12 | 13 | message WheelHeartbeatRequest { 14 | uint64 watermark = 1; 15 | uint64 next_version_id = 2; 16 | map raft_states = 3; 17 | } 18 | 19 | message WheelHeartbeatResponse { 20 | repeated manifest.VersionDiff version_diffs = 1; 21 | } 22 | 23 | message ExhausterHeartbeatRequest {} 24 | 25 | message ExhausterHeartbeatResponse {} 26 | 27 | // TODO: Add status report. 28 | message HeartbeatRequest { 29 | uint64 node_id = 1; 30 | common.Endpoint endpoint = 2; 31 | oneof heartbeat_message { 32 | WheelHeartbeatRequest wheel_heartbeat = 3; 33 | ExhausterHeartbeatRequest exhauster_heartbeat = 4; 34 | } 35 | } 36 | 37 | message HeartbeatResponse { 38 | oneof heartbeat_message { 39 | WheelHeartbeatResponse wheel_heartbeat = 1; 40 | ExhausterHeartbeatResponse exhauster_heartbeat = 2; 41 | } 42 | } 43 | 44 | message InsertL0Request { 45 | uint64 node_id = 1; 46 | repeated manifest.SstableInfo sst_infos = 2; 47 | uint64 next_version_id = 3; 48 | } 49 | 50 | message InsertL0Response { 51 | repeated manifest.VersionDiff version_diffs = 1; 52 | } 53 | 54 | message TsoRequest {} 55 | 56 | message TsoResponse { 57 | uint32 timestamp = 1; 58 | } 59 | 60 | service RudderService { 61 | // Called by `wheel` and `exhauster`. 62 | rpc Heartbeat(HeartbeatRequest) returns (HeartbeatResponse); 63 | // Called by `wheel` when sstable uploader finish upload new L0 sstable to S3. 64 | rpc InsertL0(InsertL0Request) returns (InsertL0Response); 65 | // TODO: Implement transaction. 
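// The timestamp oracle below is presumably the building block for that; cf. `timestamp`/`timestamp_fetch_add` on rudder's `MetaStore` trait.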
66 | rpc Tso(TsoRequest) returns (TsoResponse); 67 | } 68 | 69 | // ***** Control Service ***** 70 | 71 | message AddWheelsRequest { 72 | // { node id -> endpoint } 73 | map wheels = 1; 74 | } 75 | 76 | message AddWheelsResponse {} 77 | 78 | message AddKeyRangesRequest { 79 | repeated meta.KeyRangeInfo key_ranges = 1; 80 | } 81 | 82 | message AddKeyRangesResponse {} 83 | 84 | message RouterRequest {} 85 | 86 | message RouterResponse { 87 | repeated meta.KeyRangeInfo key_ranges = 1; 88 | map wheels = 2; 89 | } 90 | 91 | service ControlService { 92 | rpc AddWheels(AddWheelsRequest) returns (AddWheelsResponse); 93 | rpc AddKeyRanges(AddKeyRangesRequest) returns (AddKeyRangesResponse); 94 | rpc Router(RouterRequest) returns (RouterResponse); 95 | } 96 | -------------------------------------------------------------------------------- /proto/src/proto/wheel.proto: -------------------------------------------------------------------------------- 1 | syntax = "proto3"; 2 | 3 | package wheel; 4 | 5 | import "common.proto"; 6 | import "meta.proto"; 7 | 8 | // ***** Inner Service ***** 9 | 10 | message AddWheelsRequest { 11 | // { node id -> endpoint } 12 | map wheels = 1; 13 | } 14 | 15 | message AddWheelsResponse {} 16 | 17 | message AddKeyRangesRequest { 18 | repeated meta.KeyRangeInfo key_ranges = 1; 19 | } 20 | 21 | message AddKeyRangesResponse {} 22 | 23 | service WheelService { 24 | rpc AddWheels(AddWheelsRequest) returns (AddWheelsResponse); 25 | rpc AddKeyRanges(AddKeyRangesRequest) returns (AddKeyRangesResponse); 26 | // TODO: Implement them. 27 | // rpc SyncEndpoints(SyncEndpointsRequest) returns (SyncEndpointsResponse); 28 | // rpc SyncKeyRanges(SyncKeyRangesRequest) returns (SyncKeyRangesResponse); 29 | 30 | } 31 | 32 | // ***** Raft Service ***** 33 | 34 | message RaftRequest { 35 | bytes data = 1; 36 | } 37 | 38 | message RaftResponse {} 39 | 40 | service RaftService { 41 | rpc Raft(RaftRequest) returns (RaftResponse); 42 | } 43 | -------------------------------------------------------------------------------- /rudder/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "runkv-rudder" 3 | version = "0.1.0" 4 | edition = "2021" 5 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html 6 | 7 | [dependencies] 8 | anyhow = "1.0" 9 | async-trait = "0.1" 10 | base64 = "0.13" 11 | bytes = "1" 12 | bytesize = { version = "1.1.0", features = ["serde"] } 13 | clap = { version = "3.1.6", features = ["derive"] } 14 | futures = "0.3" 15 | humantime = "2.1.0" 16 | humantime-serde = "1.1.1" 17 | itertools = "0.10.3" 18 | parking_lot = "0.12" 19 | prost = "0.9" 20 | rand = "0.8.5" 21 | runkv-common = { path = "../common" } 22 | runkv-proto = { path = "../proto" } 23 | runkv-storage = { path = "../storage" } 24 | serde = "1.0" 25 | serde_derive = "1.0" 26 | thiserror = "1.0" 27 | tokio = { version = "1", features = [ 28 | "rt-multi-thread", 29 | "sync", 30 | "macros", 31 | "time", 32 | ] } 33 | toml = "0.4.2" 34 | tonic = "0.6.2" 35 | tracing = "0.1" 36 | tracing-subscriber = "0.3" 37 | 38 | [target.'cfg(not(target_env = "msvc"))'.dependencies] 39 | tikv-jemallocator = "0.4.3" 40 | 41 | [dev-dependencies] 42 | env_logger = "*" 43 | test-log = "0.2.10" 44 | 45 | [features] 46 | verbose-release-log = ["tracing/release_max_level_trace"] 47 | -------------------------------------------------------------------------------- /rudder/src/config.rs: 
-------------------------------------------------------------------------------- 1 | use runkv_common::config::{CacheConfig, LsmTreeConfig, MinioConfig, S3Config}; 2 | use serde::Deserialize; 3 | 4 | #[derive(Deserialize, Clone, Debug)] 5 | pub struct RudderConfig { 6 | pub id: u64, 7 | pub host: String, 8 | pub port: u16, 9 | pub data_path: String, 10 | pub meta_path: String, 11 | pub health_timeout: String, 12 | pub s3: Option, 13 | pub minio: Option, 14 | pub cache: CacheConfig, 15 | pub lsm_tree: LsmTreeConfig, 16 | } 17 | -------------------------------------------------------------------------------- /rudder/src/error.rs: -------------------------------------------------------------------------------- 1 | use runkv_proto::common::Endpoint; 2 | use runkv_proto::meta::KeyRange; 3 | use tonic::Status; 4 | 5 | #[derive(thiserror::Error, Debug)] 6 | pub enum Error { 7 | #[error("storage error: {0}")] 8 | StorageError(#[from] runkv_storage::Error), 9 | #[error("invalid watermark: [current: {0}] [new: {1}]")] 10 | InvalidWatermark(u64, u64), 11 | #[error("transport error: {0}")] 12 | TransportError(#[from] tonic::transport::Error), 13 | #[error("rpc status error: {0}")] 14 | RpcStatus(#[from] Status), 15 | #[error("config error: {0}")] 16 | ConfigError(String), 17 | #[error("control error: {0}")] 18 | ControlError(#[from] ControlError), 19 | #[error("other: {0}")] 20 | Other(String), 21 | } 22 | 23 | impl Error { 24 | pub fn err(e: impl Into>) -> Error { 25 | Error::Other(e.into().to_string()) 26 | } 27 | 28 | pub fn config_err(e: impl Into>) -> Error { 29 | Error::ConfigError(e.into().to_string()) 30 | } 31 | } 32 | 33 | #[derive(thiserror::Error, Debug)] 34 | pub enum ControlError { 35 | #[error("node already exists: [node: {node}] [origin endpoint: {origin:?}] [given endpoint: {given:?}]")] 36 | NodeAlreadyExists { 37 | node: u64, 38 | origin: Endpoint, 39 | given: Endpoint, 40 | }, 41 | #[error("node not exists: {0}")] 42 | NodeNotExists(u64), 43 | #[error("group already exists: {0}")] 44 | GroupAlreadyExists(u64), 45 | #[error("group not exists: {0}")] 46 | GroupNotExists(u64), 47 | #[error("raft node already exists: {0}")] 48 | RaftNodeAlreadyExists(u64), 49 | #[error("raft node not exists: {0}")] 50 | RaftNodeNotExists(u64), 51 | #[error("key range overlaps: [{0:?}] [{1:?}]")] 52 | KeyRangeOverlaps(KeyRange, KeyRange), 53 | } 54 | 55 | pub type Result = std::result::Result; 56 | -------------------------------------------------------------------------------- /rudder/src/main.rs: -------------------------------------------------------------------------------- 1 | #[cfg(not(target_env = "msvc"))] 2 | use tikv_jemallocator::Jemalloc; 3 | 4 | #[cfg(not(target_env = "msvc"))] 5 | #[global_allocator] 6 | static GLOBAL: Jemalloc = Jemalloc; 7 | 8 | use std::fs::read_to_string; 9 | 10 | use clap::Parser; 11 | use runkv_rudder::config::RudderConfig; 12 | use runkv_rudder::error::{Error, Result}; 13 | use runkv_rudder::{bootstrap_rudder, build_rudder}; 14 | use tracing::info; 15 | use tracing_subscriber::FmtSubscriber; 16 | 17 | #[derive(Parser, Debug)] 18 | struct Args { 19 | #[clap(short, long, default_value = "etc/rudder.toml")] 20 | config_file_path: String, 21 | } 22 | 23 | #[tokio::main] 24 | async fn main() -> Result<()> { 25 | let subscriber = FmtSubscriber::new(); 26 | tracing::subscriber::set_global_default(subscriber).map_err(Error::err)?; 27 | 28 | let args = Args::parse(); 29 | info!("args: {:?}", args); 30 | 31 | let config: RudderConfig = 32 | 
toml::from_str(&read_to_string(&args.config_file_path).map_err(Error::err)?) 33 | .map_err(Error::config_err)?; 34 | info!("config: {:?}", config); 35 | 36 | let (rudder, workers) = build_rudder(&config).await?; 37 | bootstrap_rudder(&config, rudder, workers).await 38 | } 39 | -------------------------------------------------------------------------------- /rudder/src/meta/mod.rs: -------------------------------------------------------------------------------- 1 | use std::collections::{BTreeMap, HashMap}; 2 | use std::sync::Arc; 3 | use std::time::{Duration, SystemTime}; 4 | 5 | use async_trait::async_trait; 6 | use runkv_proto::common::Endpoint; 7 | use runkv_proto::meta::{KeyRange, KeyRangeInfo}; 8 | use runkv_proto::rudder::RaftState; 9 | 10 | use crate::error::Result; 11 | 12 | pub mod mem; 13 | #[allow(dead_code)] 14 | pub mod object; 15 | 16 | #[async_trait] 17 | pub trait MetaStore: Send + Sync + 'static { 18 | /// Add new wheels. 19 | async fn add_wheels(&self, wheels: HashMap<u64, Endpoint>) -> Result<()>; 20 | 21 | /// Get all wheel ids. 22 | async fn wheels(&self) -> Result<Vec<u64>>; 23 | 24 | /// Add new key ranges. 25 | async fn add_key_ranges(&self, key_ranges: Vec<KeyRangeInfo>) -> Result<()>; 26 | 27 | /// Get all key range infos. 28 | async fn all_key_range_infos(&self) -> Result<Vec<KeyRangeInfo>>; 29 | 30 | /// Update raft states. 31 | async fn update_raft_states(&self, raft_states: HashMap<u64, RaftState>) -> Result<()>; 32 | 33 | /// Update exhauster meta. 34 | async fn update_exhauster(&self, node_id: u64, endpoint: Endpoint) -> Result<()>; 35 | 36 | /// Randomly pick an available exhauster. 37 | async fn pick_exhauster(&self, live: Duration) -> Result<Option<Endpoint>>; 38 | 39 | /// Get all responsible key ranges, grouped by raft group. 40 | async fn all_group_key_ranges(&self) -> Result<BTreeMap<u64, Vec<KeyRange>>>; 41 | 42 | /// Get all responsible key ranges. 43 | async fn all_key_ranges(&self) -> Result<Vec<KeyRange>>; 44 | 45 | /// Pin sstables to prevent them from being compacted. 46 | /// 47 | /// Returns `true` if there are no conflicts and the given sstables are pinned. 48 | async fn pin_sstables(&self, sst_ids: &[u64], time: SystemTime) -> Result<bool>; 49 | 50 | /// Unpin sstables regardless of whether they were pinned before. 51 | async fn unpin_sstables(&self, sst_ids: &[u64]) -> Result<()>; 52 | 53 | /// Check if sstables are pinned. Returns a vector of pinned statuses. 54 | async fn is_sstables_pinned(&self, sst_ids: &[u64], time: SystemTime) -> Result<Vec<bool>>; 55 | 56 | /// Get the current timestamp. 57 | async fn timestamp(&self) -> Result<u32>; 58 | 59 | /// Fetch the current timestamp and advance it by `add`. 60 | async fn timestamp_fetch_add(&self, add: u32) -> Result<u32>; 61 | } 62 | 63 | pub type MetaStoreRef = Arc<dyn MetaStore>; 64 | 65 | fn is_overlap(r1: &KeyRange, r2: &KeyRange) -> bool { 66 | !(r1.start_key > r2.end_key || r1.end_key < r2.start_key) 67 | } 68 | 69 | fn _in_range(key: &[u8], range: &KeyRange) -> bool { 70 | key >= &range.start_key[..] && key < &range.end_key[..] 71 | } 72 | -------------------------------------------------------------------------------- /rudder/src/meta/object.rs: -------------------------------------------------------------------------------- 1 | use runkv_storage::ObjectStoreRef; 2 | 3 | use crate::error::{Error, Result}; 4 | 5 | pub struct ObjectMetaStore { 6 | object_store: ObjectStoreRef, 7 | path: String, 8 | } 9 | 10 | // TODO: Impl me.
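// A possible layout (sketch only): serialize each piece of meta state (e.g. with `BytesSerde`)
// and store it under `{path}/{base64(key)}` via the `put`/`get`/`remove` helpers below, e.g.
//
//     store.put(b"wheels", encoded_wheels).await?;
//
// where `encoded_wheels` is a hypothetical serialized map of wheel endpoints.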
11 | // #[async_trait] 12 | // impl MetaStore for ObjectMetaStore {} 13 | 14 | impl ObjectMetaStore { 15 | pub fn new(object_store: ObjectStoreRef, path: String) -> Self { 16 | Self { object_store, path } 17 | } 18 | 19 | async fn put(&self, key: &[u8], value: Vec) -> Result<()> { 20 | self.object_store 21 | .put(&self.key(key), value) 22 | .await 23 | .map_err(Error::StorageError) 24 | } 25 | 26 | async fn get(&self, key: &[u8]) -> Result>> { 27 | self.object_store 28 | .get(&self.key(key)) 29 | .await 30 | .map_err(Error::StorageError) 31 | } 32 | 33 | async fn remove(&self, key: &[u8]) -> Result<()> { 34 | self.object_store 35 | .remove(&self.key(key)) 36 | .await 37 | .map_err(Error::StorageError) 38 | } 39 | 40 | fn key(&self, key: &[u8]) -> String { 41 | format!("{}/{}", self.path, base64::encode(key)) 42 | } 43 | } 44 | 45 | #[cfg(test)] 46 | mod tests { 47 | 48 | use std::sync::Arc; 49 | 50 | use runkv_storage::MemObjectStore; 51 | use test_log::test; 52 | 53 | use super::*; 54 | 55 | #[test(tokio::test)] 56 | async fn test_crud() { 57 | let object_store = Arc::new(MemObjectStore::default()); 58 | let store = ObjectMetaStore::new(object_store, "meta-test".to_string()); 59 | let key = b"test-key".to_vec(); 60 | let value = b"test-value".to_vec(); 61 | store.put(&key, value.clone()).await.unwrap(); 62 | let fetched_value = store.get(&key).await.unwrap().unwrap(); 63 | assert_eq!(fetched_value, value); 64 | store.remove(&key).await.unwrap(); 65 | assert!(store.get(&key).await.unwrap().is_none()); 66 | } 67 | } 68 | -------------------------------------------------------------------------------- /rudder/src/worker/mod.rs: -------------------------------------------------------------------------------- 1 | pub mod compaction_detector; 2 | -------------------------------------------------------------------------------- /run: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [ -z "$(which tmux)" ]; then 4 | echo "tmux is required, please install manually." 5 | exit 0 6 | fi 7 | 8 | if [ -z "$(which cargo-make)" ]; then 9 | echo "Installing cargo-make..." 
10 | cargo install cargo-make --version "^0.35" 11 | fi 12 | 13 | if [ $# -eq 0 ] || [ "$1" == "-h" ] || [ "$1" == "--help" ]; then 14 | makers --list-all-steps 15 | exit 0 16 | fi 17 | 18 | makers --no-workspace "$@" -------------------------------------------------------------------------------- /rust-toolchain: -------------------------------------------------------------------------------- 1 | nightly-2022-10-16 2 | -------------------------------------------------------------------------------- /rustfmt.toml: -------------------------------------------------------------------------------- 1 | comment_width = 120 2 | format_code_in_doc_comments = true 3 | format_macro_bodies = true 4 | format_macro_matchers = true 5 | normalize_comments = true 6 | normalize_doc_attributes = true 7 | imports_granularity = "Module" 8 | group_imports = "StdExternalCrate" 9 | reorder_imports = true 10 | tab_spaces = 4 11 | wrap_comments = true 12 | -------------------------------------------------------------------------------- /storage/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "runkv-storage" 3 | version = "0.1.0" 4 | edition = "2021" 5 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html 6 | 7 | [dependencies] 8 | anyhow = "1.0" 9 | async-recursion = "1.0.0" 10 | async-stream = "0.3.3" 11 | async-trait = "0.1" 12 | aws-config = "0.8" 13 | aws-endpoint = "0.8" 14 | aws-sdk-s3 = "0.8" 15 | aws-smithy-http = "0.38" 16 | aws-smithy-types = "0.38" 17 | aws-types = { version = "0.8", features = ["hardcoded-credentials"] } 18 | bcc = { version = "0.0.33", optional = true } 19 | bitvec = "1" 20 | bytes = "1" 21 | bytesize = "1.1.0" 22 | clap = { version = "3.1.6", features = ["derive"] } 23 | crc32fast = "1.3.2" 24 | farmhash = "1.1.5" 25 | futures = "0.3" 26 | futures-async-stream = "0.2" 27 | hdrhistogram = "7" 28 | isahc = { version = "1", default-features = false, optional = true } 29 | itertools = "0.10.3" 30 | lazy_static = "1.4.0" 31 | libc = "0.2" 32 | lz4 = "1.23.1" 33 | moka = { version = "0.7", features = ["future"] } 34 | nix = { version = "0.24.1", features = ["fs"] } 35 | opentelemetry = { version = "0.17", optional = true, features = ["rt-tokio"] } 36 | opentelemetry-jaeger = { version = "0.16", optional = true, features = [ 37 | "rt-tokio", 38 | "collector_client", 39 | "isahc", 40 | "isahc_collector_client", 41 | ] } 42 | parking_lot = "0.12" 43 | prometheus = "0.13.0" 44 | rand = "0.8.5" 45 | rand_chacha = "0.3.1" 46 | rangemap = "1.0.2" 47 | runkv-common = { path = "../common" } 48 | runkv-proto = { path = "../proto" } 49 | serde = "1.0" 50 | serde_derive = "1.0" 51 | tempfile = "3" 52 | thiserror = "1.0" 53 | tokio = { version = "1", features = [ 54 | "rt-multi-thread", 55 | "sync", 56 | "macros", 57 | "time", 58 | "fs", 59 | "signal", 60 | ] } 61 | tracing = "0.1" 62 | tracing-opentelemetry = { version = "0.17", optional = true } 63 | tracing-subscriber = { version = "0.3.16", features = [ 64 | "fmt", 65 | "parking_lot", 66 | "std", 67 | "time", 68 | ], optional = true } 69 | 70 | [dev-dependencies] 71 | criterion = { version = "0.3", features = ["async", "async_tokio"] } 72 | env_logger = "*" 73 | test-log = "0.2.10" 74 | 75 | [target.'cfg(target_os = "linux")'.dev-dependencies] 76 | fiemap = "0.1.1" 77 | 78 | [features] 79 | deadlock = [] 80 | bpf = ["bcc"] 81 | trace = [ 82 | "isahc", 83 | "opentelemetry", 84 | "opentelemetry-jaeger", 85 | "tracing-opentelemetry", 86 | 
"tracing-subscriber", 87 | "tracing/release_max_level_trace", 88 | ] 89 | verbose-release-log = ["tracing/release_max_level_trace"] 90 | 91 | [[bench]] 92 | name = "bench_block_iter" 93 | harness = false 94 | 95 | [[bench]] 96 | name = "bench_compression" 97 | harness = false 98 | 99 | [[bin]] 100 | name = "bench_raft_log_store" 101 | path = "bench/bench_raft_log_store/main.rs" 102 | 103 | [[bin]] 104 | name = "file-cache-bench" 105 | path = "bench/file_cache_bench/main.rs" 106 | -------------------------------------------------------------------------------- /storage/bench/file_cache_bench/README.md: -------------------------------------------------------------------------------- 1 | ## Usage 2 | 3 | ```bash 4 | sudo docker stop `sudo docker ps | grep jaeger | awk '{print $1}'` || true && \ 5 | sudo docker run --rm -d -p6831:6831/udp -p16686:16686 -p14268:14268 --name jaeger jaegertracing/all-in-one:latest && \ 6 | sudo rm -rf /data/filecache && \ 7 | cargo build --bin file-cache-bench --features "bpf trace" --release && \ 8 | sudo ./target/release/file-cache-bench -p /data/filecache --capacity 10240 --total-buffer-capacity 1024 --w-rate 100 --r-rate 100 --concurrency 8 --time 60 --slow 5 9 | ``` 10 | 11 | ## Output Examples 12 | 13 | ```plain 14 | Event { 15 | magic: 16045690984833335023, 16 | sid: 56298568754921497, 17 | vfs_read_enter_ts: 255073825749145, 18 | vfs_read_leave_ts: 255073841493162, 19 | ext4_file_read_iter_enter_ts: 255073825750206, 20 | ext4_file_read_iter_leave_ts: 255073841492771, 21 | iomap_dio_rw_enter_ts: 255073836674011, 22 | iomap_dio_rw_leave_ts: 255073841492271, 23 | filemap_write_and_wait_range_enter_ts: 255073836674320, 24 | filemap_write_and_wait_range_leave_ts: 255073836674622, 25 | } 26 | vfs_read | 15.744ms | ================================================== 27 | ext4_file_read_iter | 15.743ms | ================================================= 28 | iomap_dio_rw | 4.818ms | =============== 29 | filemap_write_and_wait_range | 302.000ns | = 30 | ``` 31 | 32 | ```plain 33 | Total: 34 | disk total iops: 10835.1 35 | disk total throughput: 1.3 GiB/s 36 | disk read iops: 4379.6 37 | disk read throughput: 543.1 MiB/s 38 | disk write iops: 6455.4 39 | disk write throughput: 780.3 MiB/s 40 | insert iops: 788.6/s 41 | insert throughput: 788.6 MiB/s 42 | insert lat p50: 2us 43 | insert lat p90: 5us 44 | insert lat p99: 12us 45 | get iops: 656.6/s 46 | get miss: 6.06% 47 | get hit lat p50: 9087us 48 | get hit lat p90: 23551us 49 | get hit lat p99: 31487us 50 | get miss lat p50: 16us 51 | get miss lat p90: 36us 52 | get miss lat p99: 563us 53 | flush iops: 253.0/s 54 | flush throughput: 770.6 MiB/s 55 | ``` -------------------------------------------------------------------------------- /storage/bench/file_cache_bench/rate.rs: -------------------------------------------------------------------------------- 1 | // Copyright 2022 Singularity Data 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | 15 | use std::time::{Duration, Instant}; 16 | 17 | pub struct RateLimiter { 18 | capacity: f64, 19 | quota: f64, 20 | 21 | last: Instant, 22 | } 23 | 24 | impl RateLimiter { 25 | pub fn new(capacity: f64) -> Self { 26 | Self { 27 | capacity, 28 | quota: 0.0, 29 | last: Instant::now(), 30 | } 31 | } 32 | 33 | pub fn consume(&mut self, weight: f64) -> Option { 34 | let now = Instant::now(); 35 | let refill = now.duration_since(self.last).as_secs_f64() * self.capacity; 36 | self.last = now; 37 | self.quota = f64::min(self.quota + refill, self.capacity); 38 | self.quota -= weight; 39 | if self.quota >= 0.0 { 40 | return None; 41 | } 42 | let wait = Duration::from_secs_f64((-self.quota) / self.capacity); 43 | Some(wait) 44 | } 45 | } 46 | -------------------------------------------------------------------------------- /storage/benches/bench_block_iter.rs: -------------------------------------------------------------------------------- 1 | use bytes::BufMut; 2 | use criterion::{criterion_group, criterion_main, BenchmarkId, Criterion}; 3 | use runkv_common::coding::CompressionAlgorithm; 4 | use runkv_storage::components::{Block, BlockBuilder, BlockBuilderOptions, BlockHolder}; 5 | use runkv_storage::iterator::{BlockIterator, Seek}; 6 | 7 | const TABLES_PER_SSTABLE: u32 = 10; 8 | const KEYS_PER_TABLE: u64 = 100; 9 | const RESTART_INTERVAL: usize = 16; 10 | const BLOCK_CAPACITY: usize = TABLES_PER_SSTABLE as usize * KEYS_PER_TABLE as usize * 64; 11 | 12 | fn block_iter_next(block: Box) { 13 | let mut iter = BlockIterator::new(BlockHolder::from_owned_block(block)); 14 | iter.seek(Seek::First).unwrap(); 15 | while iter.is_valid() { 16 | iter.next().unwrap(); 17 | } 18 | } 19 | 20 | fn block_iter_prev(block: Box) { 21 | let mut iter = BlockIterator::new(BlockHolder::from_owned_block(block)); 22 | iter.seek(Seek::Last).unwrap(); 23 | while iter.is_valid() { 24 | iter.prev().unwrap(); 25 | } 26 | } 27 | 28 | fn bench_block_iter(c: &mut Criterion) { 29 | let block = Box::new(build_block(TABLES_PER_SSTABLE, KEYS_PER_TABLE)); 30 | 31 | println!("block size: {}", block.len()); 32 | 33 | c.bench_with_input( 34 | BenchmarkId::new( 35 | format!( 36 | "block - iter next - {} tables * {} keys", 37 | TABLES_PER_SSTABLE, KEYS_PER_TABLE 38 | ), 39 | "", 40 | ), 41 | &block, 42 | |b, block| { 43 | b.iter(|| block_iter_next(block.clone())); 44 | }, 45 | ); 46 | 47 | c.bench_with_input( 48 | BenchmarkId::new( 49 | format!( 50 | "block - iter prev - {} tables * {} keys", 51 | TABLES_PER_SSTABLE, KEYS_PER_TABLE 52 | ), 53 | "", 54 | ), 55 | &block, 56 | |b, block| { 57 | b.iter(|| block_iter_prev(block.clone())); 58 | }, 59 | ); 60 | 61 | let mut iter = BlockIterator::new(BlockHolder::from_owned_block(block)); 62 | iter.seek(Seek::First).unwrap(); 63 | for t in 1..=TABLES_PER_SSTABLE { 64 | for i in 1..=KEYS_PER_TABLE { 65 | assert_eq!(iter.key(), key(t, i).to_vec()); 66 | assert_eq!(iter.value(), value(i).to_vec()); 67 | iter.next().unwrap(); 68 | } 69 | } 70 | assert!(!iter.is_valid()); 71 | } 72 | 73 | criterion_group!(benches, bench_block_iter); 74 | criterion_main!(benches); 75 | 76 | fn build_block(t: u32, i: u64) -> Block { 77 | let options = BlockBuilderOptions { 78 | capacity: BLOCK_CAPACITY, 79 | compression_algorithm: CompressionAlgorithm::None, 80 | restart_interval: RESTART_INTERVAL, 81 | }; 82 | let mut builder = BlockBuilder::new(options); 83 | for tt in 1..=t { 84 | for ii in 1..=i { 85 | builder.add(&key(tt, ii), &value(ii)); 86 | } 87 | } 88 | let data = builder.build(); 89 | 
Block::decode(&data[..]).unwrap() 90 | } 91 | 92 | fn key(t: u32, i: u64) -> Vec { 93 | let mut buf = Vec::new(); 94 | buf.put_u8(b't'); 95 | buf.put_u32(t); 96 | buf.put_u64(i); 97 | buf 98 | } 99 | 100 | fn value(i: u64) -> Vec { 101 | let mut buf = Vec::new(); 102 | buf.put_u64(i); 103 | buf 104 | } 105 | -------------------------------------------------------------------------------- /storage/benches/bench_compression.rs: -------------------------------------------------------------------------------- 1 | use std::io::Write; 2 | 3 | use bytes::BufMut; 4 | use criterion::{criterion_group, criterion_main, BenchmarkId, Criterion}; 5 | use rand::prelude::StdRng; 6 | use rand::{Rng, SeedableRng}; 7 | 8 | const TABLES_PER_SSTABLE: u32 = 10; 9 | const KEYS_PER_TABLE: u64 = 100; 10 | 11 | fn gen_dataset(vsize: usize) -> Vec> { 12 | let mut dataset = vec![]; 13 | let mut rng = StdRng::seed_from_u64(0); 14 | for t in 1..=TABLES_PER_SSTABLE { 15 | for i in 1..=KEYS_PER_TABLE { 16 | let mut v = vec![0; vsize]; 17 | rng.fill(&mut v[..]); 18 | let mut buf = vec![]; 19 | buf.put_u32(t); 20 | buf.put_u64(i); 21 | buf.put_slice(&v); 22 | dataset.push(buf) 23 | } 24 | } 25 | dataset 26 | } 27 | 28 | fn gen_data(dataset: &[Vec]) -> Vec { 29 | let mut data = vec![]; 30 | for entry in dataset.iter() { 31 | data.put_slice(entry); 32 | } 33 | data 34 | } 35 | 36 | fn block_compression(data: Vec) -> Vec { 37 | let mut encoder = lz4::EncoderBuilder::new().level(4).build(vec![]).unwrap(); 38 | encoder.write_all(&data).unwrap(); 39 | let (buf, result) = encoder.finish(); 40 | result.unwrap(); 41 | buf 42 | } 43 | 44 | fn stream_compression(dataset: Vec>) -> Vec { 45 | let buf = vec![]; 46 | let mut encoder = lz4::EncoderBuilder::new().level(4).build(buf).unwrap(); 47 | for entry in dataset { 48 | encoder.write_all(&entry).unwrap(); 49 | } 50 | let (buf, result) = encoder.finish(); 51 | result.unwrap(); 52 | buf 53 | } 54 | 55 | fn bench_compression(c: &mut Criterion) { 56 | for vsize in [8, 16, 32, 64] { 57 | let dataset = gen_dataset(vsize); 58 | let data = gen_data(&dataset); 59 | 60 | c.bench_with_input( 61 | BenchmarkId::new(format!("buffer - vsize: {}B", vsize), ""), 62 | &dataset, 63 | |b, dataset| b.iter(|| gen_data(dataset)), 64 | ); 65 | 66 | c.bench_with_input( 67 | BenchmarkId::new(format!("block compression - vsize: {}B", vsize), ""), 68 | &data, 69 | |b, data| b.iter(|| block_compression(data.clone())), 70 | ); 71 | 72 | c.bench_with_input( 73 | BenchmarkId::new(format!("stream compression - vsize: {}B", vsize), ""), 74 | &dataset, 75 | |b, dataset| b.iter(|| stream_compression(dataset.clone())), 76 | ); 77 | 78 | let uncompressed = data.len(); 79 | let block_compressed = block_compression(data).len(); 80 | let stream_compressed = stream_compression(dataset).len(); 81 | 82 | println!("uncompressed size: {}", uncompressed); 83 | println!( 84 | "block compressed size: {}, rate: {:.3}", 85 | block_compressed, 86 | block_compressed as f64 / uncompressed as f64 87 | ); 88 | println!( 89 | "stream compressed size: {}, rate: {:.3}", 90 | stream_compressed, 91 | stream_compressed as f64 / uncompressed as f64 92 | ); 93 | } 94 | } 95 | 96 | criterion_group!(benches, bench_compression); 97 | criterion_main!(benches); 98 | -------------------------------------------------------------------------------- /storage/src/error.rs: -------------------------------------------------------------------------------- 1 | use crate::manifest::ManifestError; 2 | use crate::object_store::ObjectStoreError; 3 | use 
crate::raft_log_store::error::RaftLogStoreError; 4 | use crate::tiered_cache::TieredCacheError; 5 | 6 | #[derive(thiserror::Error, Debug)] 7 | pub enum Error { 8 | #[error("encode error: {0}")] 9 | EncodeError(String), 10 | #[error("decode error: {0}")] 11 | DecodeError(String), 12 | #[error("object store error: {0}")] 13 | ObjectStoreError(#[from] ObjectStoreError), 14 | #[error("manifest error: {0}")] 15 | ManifestError(#[from] ManifestError), 16 | #[error("io error: {0}")] 17 | IoError(#[from] std::io::Error), 18 | #[error("raft log store error: {0}")] 19 | RaftLogStoreError(#[from] RaftLogStoreError), 20 | #[error("tiered cache error: {0}")] 21 | TieredCacheError(#[from] TieredCacheError), 22 | #[error("other: {0}")] 23 | Other(String), 24 | } 25 | 26 | impl Error { 27 | pub fn err(e: impl Into>) -> Self { 28 | Self::Other(e.into().to_string()) 29 | } 30 | 31 | pub fn encode_error(e: impl Into>) -> Self { 32 | Self::EncodeError(e.into().to_string()) 33 | } 34 | 35 | pub fn decode_error(e: impl Into>) -> Self { 36 | Self::DecodeError(e.into().to_string()) 37 | } 38 | } 39 | 40 | pub type Result = std::result::Result; 41 | -------------------------------------------------------------------------------- /storage/src/lib.rs: -------------------------------------------------------------------------------- 1 | #![feature(drain_filter)] 2 | #![feature(assert_matches)] 3 | #![feature(generators, generator_trait)] 4 | #![feature(stmt_expr_attributes)] 5 | #![feature(proc_macro_hygiene)] 6 | #![feature(trait_alias)] 7 | #![feature(let_chains)] 8 | #![feature(allocator_api)] 9 | #![feature(lint_reasons)] 10 | #![feature(build_hasher_simple_hash_one)] 11 | #![feature(strict_provenance)] 12 | 13 | mod error; 14 | mod lsm_tree; 15 | mod object_store; 16 | pub mod raft_log_store; 17 | pub mod tiered_cache; 18 | pub mod utils; 19 | 20 | pub use error::*; 21 | pub use lsm_tree::*; 22 | pub use object_store::*; 23 | -------------------------------------------------------------------------------- /storage/src/lsm_tree/components/metrics.rs: -------------------------------------------------------------------------------- 1 | use std::sync::Arc; 2 | 3 | use lazy_static::lazy_static; 4 | 5 | lazy_static! 
{ 6 | static ref INTERNAL_OPS_COUNTER_VEC: prometheus::CounterVec = 7 | prometheus::register_counter_vec!( 8 | "lsm_tree_internal_ops_counter_vec", 9 | "lsm_tree_internal_ops_counter_vec", 10 | &["op", "node"], 11 | ) 12 | .unwrap(); 13 | static ref INTERNAL_GAUGE_VEC: prometheus::GaugeVec = prometheus::register_gauge_vec!( 14 | "lsm_tree_internal_gauge_vec", 15 | "lsm_tree_internal_gauge_vec", 16 | &["type", "node"], 17 | ) 18 | .unwrap(); 19 | static ref BLOCK_CACHE_LATENCY_HISTOGRAM_VEC: prometheus::HistogramVec = 20 | prometheus::register_histogram_vec!( 21 | "lsm_tree_block_cache_latency_histogram_vec", 22 | "lsm tree block cache latency histogram vec", 23 | &["op", "node"], 24 | vec![0.00001, 0.0001, 0.0002, 0.0005, 0.001, 0.002, 0.005, 0.01, 0.02, 0.05, 0.1] 25 | ) 26 | .unwrap(); 27 | } 28 | 29 | pub struct LsmTreeMetrics { 30 | pub rotate_memtable_counter: prometheus::Counter, 31 | pub flush_memtable_counter: prometheus::Counter, 32 | 33 | pub active_memtable_size_gauge: prometheus::Gauge, 34 | 35 | pub block_cache_get_latency_histogram: prometheus::Histogram, 36 | pub block_cache_insert_latency_histogram: prometheus::Histogram, 37 | pub block_cache_fill_latency_histogram: prometheus::Histogram, 38 | } 39 | 40 | pub type LsmTreeMetricsRef = Arc; 41 | 42 | impl LsmTreeMetrics { 43 | pub fn new(node: u64) -> Self { 44 | Self { 45 | rotate_memtable_counter: INTERNAL_OPS_COUNTER_VEC 46 | .get_metric_with_label_values(&["rotate_memtable", &node.to_string()]) 47 | .unwrap(), 48 | 49 | flush_memtable_counter: INTERNAL_OPS_COUNTER_VEC 50 | .get_metric_with_label_values(&["flush_memtable", &node.to_string()]) 51 | .unwrap(), 52 | 53 | active_memtable_size_gauge: INTERNAL_GAUGE_VEC 54 | .get_metric_with_label_values(&["active_memtable_size", &node.to_string()]) 55 | .unwrap(), 56 | 57 | block_cache_get_latency_histogram: BLOCK_CACHE_LATENCY_HISTOGRAM_VEC 58 | .get_metric_with_label_values(&["block_cache_get", &node.to_string()]) 59 | .unwrap(), 60 | block_cache_insert_latency_histogram: BLOCK_CACHE_LATENCY_HISTOGRAM_VEC 61 | .get_metric_with_label_values(&["block_cache_insert", &node.to_string()]) 62 | .unwrap(), 63 | block_cache_fill_latency_histogram: BLOCK_CACHE_LATENCY_HISTOGRAM_VEC 64 | .get_metric_with_label_values(&["block_cache_fill", &node.to_string()]) 65 | .unwrap(), 66 | } 67 | } 68 | } 69 | -------------------------------------------------------------------------------- /storage/src/lsm_tree/components/mod.rs: -------------------------------------------------------------------------------- 1 | mod block; 2 | pub use block::*; 3 | mod block_cache; 4 | pub use block_cache::*; 5 | mod memtable; 6 | pub use memtable::*; 7 | mod sstable; 8 | pub use sstable::*; 9 | mod sstable_store; 10 | pub use sstable_store::*; 11 | mod skiplist; 12 | pub use skiplist::*; 13 | mod metrics; 14 | pub use metrics::*; 15 | -------------------------------------------------------------------------------- /storage/src/lsm_tree/components/skiplist/arena.rs: -------------------------------------------------------------------------------- 1 | // Ported from [AgateDB](https://github.com/tikv/agatedb) with [license](https://github.com/tikv/agatedb/blob/master/LICENSE). 
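// Worked example of the alignment math in `alloc()` below (illustrative
// numbers): with `align = 8` the mask is `0b111`; requesting `size = 13` at
// `offset = 21` reserves `13 + 7 = 20` bytes and returns
// `(21 + 7) & !7 = 24`, the first 8-byte-aligned offset at or after 21.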
2 | 3 | use std::sync::atomic::{AtomicU32, Ordering}; 4 | use std::sync::Arc; 5 | use std::{mem, ptr}; 6 | 7 | struct ArenaCore { 8 | len: AtomicU32, 9 | cap: usize, 10 | ptr: *mut u8, 11 | } 12 | 13 | impl Drop for ArenaCore { 14 | fn drop(&mut self) { 15 | unsafe { 16 | let ptr = self.ptr as *mut u64; 17 | let cap = self.cap / 8; 18 | Vec::from_raw_parts(ptr, 0, cap); 19 | } 20 | } 21 | } 22 | 23 | pub struct Arena { 24 | core: Arc, 25 | } 26 | 27 | impl Arena { 28 | pub fn with_capacity(cap: u32) -> Arena { 29 | let mut buf: Vec = Vec::with_capacity(cap as usize / 8); 30 | let ptr = buf.as_mut_ptr() as *mut u8; 31 | let cap = buf.capacity() * 8; 32 | mem::forget(buf); 33 | Arena { 34 | core: Arc::new(ArenaCore { 35 | len: AtomicU32::new(1), 36 | cap, 37 | ptr, 38 | }), 39 | } 40 | } 41 | 42 | pub fn len(&self) -> u32 { 43 | self.core.len.load(Ordering::SeqCst) 44 | } 45 | 46 | pub fn alloc(&self, align: usize, size: usize) -> u32 { 47 | let align_mask = align - 1; 48 | // Leave enough padding for align. 49 | let size = size + align_mask; 50 | let offset = self.core.len.fetch_add(size as u32, Ordering::SeqCst); 51 | // Calculate the correct align point, it equals to 52 | // (offset + align_mask) / align * align. 53 | let ptr_offset = (offset as usize + align_mask) & !align_mask; 54 | assert!(offset as usize + size <= self.core.cap); 55 | ptr_offset as u32 56 | } 57 | 58 | pub unsafe fn get_mut(&self, offset: u32) -> *mut N { 59 | if offset == 0 { 60 | return ptr::null_mut(); 61 | } 62 | self.core.ptr.add(offset as usize) as _ 63 | } 64 | 65 | pub fn offset(&self, ptr: *const N) -> u32 { 66 | let ptr_addr = ptr as usize; 67 | let self_addr = self.core.ptr as usize; 68 | if ptr_addr > self_addr && ptr_addr < self_addr + self.core.cap { 69 | (ptr_addr - self_addr) as u32 70 | } else { 71 | 0 72 | } 73 | } 74 | } 75 | -------------------------------------------------------------------------------- /storage/src/lsm_tree/components/skiplist/key.rs: -------------------------------------------------------------------------------- 1 | // Ported from [AgateDB](https://github.com/tikv/agatedb) with [license](https://github.com/tikv/agatedb/blob/master/LICENSE). 
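// Hedged example of the comparator defined below: with
// `FixedLengthSuffixComparator::new(8)`, full keys are ordered by their
// user-key prefix first and by the trailing 8-byte suffix (e.g. a timestamp)
// only when the prefixes are equal, while `same_key` ignores the suffix:
//
// let c = FixedLengthSuffixComparator::new(8);
// assert!(c.same_key(b"k1\x00\x00\x00\x00\x00\x00\x00\x01",
//                    b"k1\x00\x00\x00\x00\x00\x00\x00\x02"));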
2 | 3 | use std::cmp::Ordering; 4 | 5 | use bytes::Bytes; 6 | 7 | pub trait KeyComparator: Clone { 8 | fn compare_key(&self, lhs: &[u8], rhs: &[u8]) -> Ordering; 9 | fn same_key(&self, lhs: &[u8], rhs: &[u8]) -> bool; 10 | } 11 | 12 | #[derive(Default, Debug, Clone, Copy)] 13 | pub struct FixedLengthSuffixComparator { 14 | len: usize, 15 | } 16 | 17 | impl FixedLengthSuffixComparator { 18 | pub const fn new(len: usize) -> FixedLengthSuffixComparator { 19 | FixedLengthSuffixComparator { len } 20 | } 21 | } 22 | 23 | impl KeyComparator for FixedLengthSuffixComparator { 24 | #[inline] 25 | fn compare_key(&self, lhs: &[u8], rhs: &[u8]) -> Ordering { 26 | if lhs.len() < self.len { 27 | panic!( 28 | "cannot compare with suffix {}: {:?}", 29 | self.len, 30 | Bytes::copy_from_slice(lhs) 31 | ); 32 | } 33 | if rhs.len() < self.len { 34 | panic!( 35 | "cannot compare with suffix {}: {:?}", 36 | self.len, 37 | Bytes::copy_from_slice(rhs) 38 | ); 39 | } 40 | let (l_p, l_s) = lhs.split_at(lhs.len() - self.len); 41 | let (r_p, r_s) = rhs.split_at(rhs.len() - self.len); 42 | let res = l_p.cmp(r_p); 43 | match res { 44 | Ordering::Greater | Ordering::Less => res, 45 | Ordering::Equal => l_s.cmp(r_s), 46 | } 47 | } 48 | 49 | #[inline] 50 | fn same_key(&self, lhs: &[u8], rhs: &[u8]) -> bool { 51 | let (l_p, _) = lhs.split_at(lhs.len() - self.len); 52 | let (r_p, _) = rhs.split_at(rhs.len() - self.len); 53 | l_p == r_p 54 | } 55 | } 56 | -------------------------------------------------------------------------------- /storage/src/lsm_tree/components/skiplist/mod.rs: -------------------------------------------------------------------------------- 1 | // Ported from [AgateDB](https://github.com/tikv/agatedb) with [license](https://github.com/tikv/agatedb/blob/master/LICENSE). 2 | 3 | mod arena; 4 | mod key; 5 | mod list; 6 | 7 | pub const SKIPLIST_NODE_TOWER_MAX_HEIGHT: usize = 20; 8 | 9 | pub use key::{FixedLengthSuffixComparator, KeyComparator}; 10 | pub use list::{IterRef, Skiplist}; 11 | -------------------------------------------------------------------------------- /storage/src/lsm_tree/iterator/mod.rs: -------------------------------------------------------------------------------- 1 | mod block_iterator; 2 | mod concat_iterator; 3 | mod memtable_iterator; 4 | mod merge_iterator; 5 | mod sstable_iterator; 6 | mod user_key_iterator; 7 | 8 | use async_trait::async_trait; 9 | pub use block_iterator::*; 10 | pub use concat_iterator::*; 11 | pub use memtable_iterator::*; 12 | pub use merge_iterator::*; 13 | pub use sstable_iterator::*; 14 | pub use user_key_iterator::*; 15 | 16 | use crate::utils::compare_full_key; 17 | use crate::Result; 18 | 19 | pub enum Seek<'s> { 20 | /// Seek to the first valid position in order if exists. 21 | First, 22 | /// Seek to the last valid position in order if exists. 23 | Last, 24 | /// Seek forward for the first key euqals the given key or the frist key bigger than it. 25 | RandomForward(&'s [u8]), 26 | /// Seek backward for the first key equals the given key or the first key smaller than it. 27 | RandomBackward(&'s [u8]), 28 | } 29 | 30 | /// [`Iterator`] defines shared behaviours for all iterators. 31 | /// 32 | /// NOTE: 33 | /// 34 | /// [`Iterator`] must be initialized with `seek` before use. 35 | #[async_trait] 36 | pub trait Iterator: Send + Sync { 37 | /// Move a valid iterator to the next key. 38 | /// 39 | /// Note: 40 | /// 41 | /// - Before calling this function, make sure the iterator `is_valid`. 
42 | /// - After calling this function, you may first check whether the iterator `is_valid` again, 43 | /// then get the new data by calling `key` and `value`. 44 | /// - If the position after calling this is invalid, this function WON'T return an `Err`. You 45 | /// should check `is_valid` before continuing the iteration. 46 | /// 47 | /// # Panics 48 | /// 49 | /// This function will panic if the iterator is invalid. 50 | async fn next(&mut self) -> Result<()>; 51 | 52 | /// Move a valid iterator to the next key. 53 | /// 54 | /// Note: 55 | /// 56 | /// - Before calling this function, make sure the iterator `is_valid`. 57 | /// - After calling this function, you may first check whether the iterator `is_valid` again, 58 | /// then get the new data by calling `key` and `value`. 59 | /// - If the position after calling this is invalid, this function WON'T return an `Err`. You 60 | /// should check `is_valid` before continuing the iteration. 61 | /// 62 | /// # Panics 63 | /// 64 | /// This function will panic if the iterator is invalid. 65 | async fn prev(&mut self) -> Result<()>; 66 | 67 | /// Retrieve the current key. 68 | /// 69 | /// Note: 70 | /// 71 | /// - Before calling this function, make sure the iterator `is_valid`. 72 | /// - This function should be straightforward and return immediately. 73 | /// 74 | /// # Panics 75 | /// 76 | /// This function will panic if the iterator is invalid. 77 | fn key(&self) -> &[u8]; 78 | 79 | /// Retrieve the current value. 80 | /// 81 | /// Note: 82 | /// 83 | /// - Before calling this function, make sure the iterator `is_valid`. 84 | /// - This function should be straightforward and return immediately. 85 | /// 86 | /// # Panics 87 | /// 88 | /// This function will panic if the iterator is invalid. 89 | fn value(&self) -> &[u8]; 90 | 91 | /// Indicate whether the iterator can be used. 92 | /// 93 | /// Note: 94 | /// 95 | /// - ONLY call `key`, `value`, and `next` if `is_valid` returns `true`. 96 | /// - This function should be straightforward and return immediately. 97 | fn is_valid(&self) -> bool; 98 | 99 | /// Initialize or reset iterator with the given seek mode. For more details, refer to [`Seek`]. 100 | /// 101 | /// `seek` returns a bool which means a visible version of the given seek condition is found in 102 | /// this iterator (but can be existing or be deleted). 103 | /// 104 | /// Note: 105 | /// 106 | /// - Do not decide whether the position is valid or not by checking the returned error of this 107 | /// function. This function WON'T return an `Err` if invalid. You should check `is_valid` 108 | /// before starting iteration. 
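///
/// A hedged usage sketch (the concrete iterator and key below are illustrative):
///
/// ```ignore
/// let _found = iter.seek(Seek::RandomForward(b"some-user-key")).await?;
/// while iter.is_valid() {
///     // consume iter.key() / iter.value() here ...
///     iter.next().await?;
/// }
/// ```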
109 | async fn seek<'s>(&mut self, seek: Seek<'s>) -> Result; 110 | } 111 | 112 | pub type BoxedIterator = Box; 113 | 114 | impl PartialEq for BoxedIterator { 115 | fn eq(&self, other: &Self) -> bool { 116 | self.key() == other.key() 117 | } 118 | } 119 | 120 | impl Eq for BoxedIterator {} 121 | 122 | impl PartialOrd for BoxedIterator { 123 | fn partial_cmp(&self, other: &Self) -> Option { 124 | Some(self.cmp(other)) 125 | } 126 | } 127 | 128 | impl Ord for BoxedIterator { 129 | fn cmp(&self, other: &Self) -> std::cmp::Ordering { 130 | // Should not be used on `UserKeyIterator` 131 | compare_full_key(self.key(), other.key()) 132 | } 133 | } 134 | -------------------------------------------------------------------------------- /storage/src/lsm_tree/manifest/error.rs: -------------------------------------------------------------------------------- 1 | #[derive(thiserror::Error, Debug)] 2 | pub enum ManifestError { 3 | #[error("version diff id does not match: [current: {0}] [new: {1}]")] 4 | VersionDiffIdNotMatch(u64, u64), 5 | #[error("invalid version diff: {0}")] 6 | InvalidVersionDiff(String), 7 | #[error("verion diff expired: [id: {0}]")] 8 | VersionDiffExpired(u64), 9 | #[error("level not exists: [idx: {0}] [total: {1}]")] 10 | LevelNotExists(u64, u64), 11 | #[error("invalid watermark: [current: {0}] [given: {1}]")] 12 | InvalidWatermark(u64, u64), 13 | #[error("other: {0}")] 14 | Other(String), 15 | } 16 | -------------------------------------------------------------------------------- /storage/src/lsm_tree/manifest/mod.rs: -------------------------------------------------------------------------------- 1 | mod error; 2 | mod version; 3 | 4 | pub use error::*; 5 | pub use version::*; 6 | -------------------------------------------------------------------------------- /storage/src/lsm_tree/mod.rs: -------------------------------------------------------------------------------- 1 | pub mod components; 2 | pub mod iterator; 3 | pub mod manifest; 4 | 5 | pub const DEFAULT_SSTABLE_SIZE: usize = 4 * 1024 * 1024; // 4 MiB 6 | pub const DEFAULT_BLOCK_SIZE: usize = 64 * 1024; // 64 KiB 7 | pub const DEFAULT_RESTART_INTERVAL: usize = 16; 8 | pub const TEST_DEFAULT_RESTART_INTERVAL: usize = 2; 9 | pub const DEFAULT_ENTRY_SIZE: usize = 1024; // 1 KiB 10 | pub const DEFAULT_BLOOM_FALSE_POSITIVE: f64 = 0.1; 11 | pub const DEFAULT_SSTABLE_META_SIZE: usize = 4 * 1024; // 4 KiB 12 | pub const DEFAULT_MEMTABLE_SIZE: usize = 4 * 1024 * 1024; // 4 MiB 13 | -------------------------------------------------------------------------------- /storage/src/object_store/mem.rs: -------------------------------------------------------------------------------- 1 | use std::collections::BTreeMap; 2 | use std::ops::Range; 3 | 4 | use async_trait::async_trait; 5 | use parking_lot::RwLock; 6 | 7 | use super::ObjectStore; 8 | use crate::{ObjectStoreError, Result}; 9 | 10 | #[derive(Default)] 11 | pub struct MemObjectStore { 12 | objects: RwLock>>, 13 | } 14 | 15 | #[async_trait] 16 | impl ObjectStore for MemObjectStore { 17 | async fn put(&self, path: &str, obj: Vec) -> Result<()> { 18 | let mut objects = self.objects.write(); 19 | objects.insert(path.to_string(), obj); 20 | Ok(()) 21 | } 22 | 23 | async fn get(&self, path: &str) -> Result>> { 24 | let objects = self.objects.read(); 25 | let obj = objects.get(path).cloned(); 26 | Ok(obj) 27 | } 28 | 29 | async fn get_range(&self, path: &str, range: Range) -> Result>> { 30 | let objects = self.objects.read(); 31 | let obj = objects.get(path).map(|obj| obj[range].to_vec()); 
32 | Ok(obj) 33 | } 34 | 35 | async fn remove(&self, path: &str) -> Result<()> { 36 | let mut objects = self.objects.write(); 37 | objects 38 | .remove(path) 39 | .ok_or_else(|| ObjectStoreError::ObjectNotFound(path.to_string()))?; 40 | Ok(()) 41 | } 42 | } 43 | -------------------------------------------------------------------------------- /storage/src/object_store/mod.rs: -------------------------------------------------------------------------------- 1 | mod mem; 2 | pub use mem::*; 3 | mod s3; 4 | use std::ops::Range; 5 | use std::sync::Arc; 6 | 7 | use async_trait::async_trait; 8 | pub use s3::*; 9 | 10 | use crate::Result; 11 | 12 | #[derive(thiserror::Error, Debug)] 13 | pub enum ObjectStoreError { 14 | #[error("object not found: {0}")] 15 | ObjectNotFound(String), 16 | #[error("invalid range: {0}")] 17 | InvalidRange(String), 18 | #[error("S3 error: {0}")] 19 | S3(String), 20 | #[error("other: {0}")] 21 | Other(String), 22 | } 23 | 24 | #[async_trait] 25 | pub trait ObjectStore: Send + Sync { 26 | async fn put(&self, path: &str, obj: Vec) -> Result<()>; 27 | 28 | async fn get(&self, path: &str) -> Result>>; 29 | 30 | async fn get_range(&self, path: &str, range: Range) -> Result>>; 31 | 32 | async fn remove(&self, path: &str) -> Result<()>; 33 | } 34 | 35 | pub type ObjectStoreRef = Arc; 36 | -------------------------------------------------------------------------------- /storage/src/object_store/s3.rs: -------------------------------------------------------------------------------- 1 | use std::ops::Range; 2 | 3 | use async_trait::async_trait; 4 | use aws_sdk_s3::error::{GetObjectError, GetObjectErrorKind}; 5 | use aws_sdk_s3::types::SdkError; 6 | use aws_sdk_s3::{Client, Endpoint, Region}; 7 | use aws_smithy_http::body::SdkBody; 8 | 9 | use super::ObjectStore; 10 | use crate::{ObjectStoreError, Result}; 11 | 12 | pub struct S3ObjectStore { 13 | client: Client, 14 | bucket: String, 15 | } 16 | 17 | impl S3ObjectStore { 18 | pub async fn new(bucket: String) -> Self { 19 | let config = aws_config::load_from_env().await; 20 | let client = Client::new(&config); 21 | Self { client, bucket } 22 | } 23 | 24 | /// Create a minio client. The server should be like `minio://key:secret@address:port/bucket`. 
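/// For example (illustrative values only): `minio://minioadmin:minioadmin@127.0.0.1:9000/runkv`.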
25 | pub async fn new_with_minio(server: &str) -> Self { 26 | let server = server.strip_prefix("minio://").unwrap(); 27 | let (access_key_id, rest) = server.split_once(':').unwrap(); 28 | let (secret_access_key, rest) = rest.split_once('@').unwrap(); 29 | let (address, bucket) = rest.split_once('/').unwrap(); 30 | 31 | let loader = aws_config::ConfigLoader::default(); 32 | let builder = aws_sdk_s3::config::Builder::from(&loader.load().await); 33 | let builder = builder.region(Region::new("custom")); 34 | let builder = builder.endpoint_resolver(Endpoint::immutable( 35 | format!("http://{}", address).try_into().unwrap(), 36 | )); 37 | let builder = builder.credentials_provider(aws_sdk_s3::Credentials::from_keys( 38 | access_key_id, 39 | secret_access_key, 40 | None, 41 | )); 42 | let config = builder.build(); 43 | let client = Client::from_conf(config); 44 | Self { 45 | client, 46 | bucket: bucket.to_string(), 47 | } 48 | } 49 | } 50 | 51 | fn err(err: impl Into>) -> ObjectStoreError { 52 | ObjectStoreError::S3(err.into().to_string()) 53 | } 54 | 55 | #[async_trait] 56 | impl ObjectStore for S3ObjectStore { 57 | async fn put(&self, path: &str, obj: Vec) -> Result<()> { 58 | self.client 59 | .put_object() 60 | .bucket(&self.bucket) 61 | .body(SdkBody::from(obj).into()) 62 | .key(path) 63 | .send() 64 | .await 65 | .map_err(err)?; 66 | Ok(()) 67 | } 68 | 69 | async fn get(&self, path: &str) -> Result>> { 70 | let req = self.client.get_object().bucket(&self.bucket).key(path); 71 | let rsp = match req.send().await { 72 | Ok(rsp) => rsp, 73 | Err(SdkError::ServiceError { 74 | err: 75 | GetObjectError { 76 | kind: GetObjectErrorKind::NoSuchKey(..), 77 | .. 78 | }, 79 | .. 80 | }) => return Ok(None), 81 | Err(e) => return Err(err(e).into()), 82 | }; 83 | let data = rsp.body.collect().await.map_err(err)?.into_bytes().to_vec(); 84 | Ok(Some(data)) 85 | } 86 | 87 | async fn get_range(&self, path: &str, range: Range) -> Result>> { 88 | let req = self 89 | .client 90 | .get_object() 91 | .bucket(&self.bucket) 92 | .key(path) 93 | .range(format!("bytes={}-{}", range.start, range.end - 1)); 94 | let rsp = match req.send().await { 95 | Ok(rsp) => rsp, 96 | Err(SdkError::ServiceError { 97 | err: 98 | GetObjectError { 99 | kind: GetObjectErrorKind::NoSuchKey(..), 100 | .. 101 | }, 102 | .. 
103 | }) => return Ok(None), 104 | Err(e) => return Err(err(e).into()), 105 | }; 106 | let data = rsp.body.collect().await.map_err(err)?.into_bytes().to_vec(); 107 | Ok(Some(data)) 108 | } 109 | 110 | async fn remove(&self, path: &str) -> Result<()> { 111 | self.client 112 | .delete_object() 113 | .bucket(&self.bucket) 114 | .key(path) 115 | .send() 116 | .await 117 | .map_err(err)?; 118 | Ok(()) 119 | } 120 | } 121 | -------------------------------------------------------------------------------- /storage/src/raft_log_store/block_cache.rs: -------------------------------------------------------------------------------- 1 | use std::sync::Arc; 2 | use std::time::Instant; 3 | 4 | use futures::Future; 5 | use moka::future::Cache; 6 | 7 | use super::error::RaftLogStoreError; 8 | use super::metrics::RaftLogStoreMetricsRef; 9 | use super::DEFAULT_LOG_BATCH_SIZE; 10 | use crate::error::Result; 11 | 12 | #[derive(PartialEq, Eq, Hash, Clone, Copy, Debug)] 13 | struct BlockIndex { 14 | file_id: u64, 15 | offset: usize, 16 | } 17 | 18 | pub struct BlockCache { 19 | inner: Cache>>, 20 | metrics: RaftLogStoreMetricsRef, 21 | } 22 | 23 | impl BlockCache { 24 | pub fn new(capacity: usize, metrics: RaftLogStoreMetricsRef) -> Self { 25 | let cache: Cache>> = Cache::builder() 26 | .weigher(|_k, v: &Arc>| v.len() as u32) 27 | .initial_capacity(capacity / DEFAULT_LOG_BATCH_SIZE) 28 | .max_capacity(capacity as u64) 29 | .build(); 30 | Self { 31 | inner: cache, 32 | metrics, 33 | } 34 | } 35 | 36 | #[tracing::instrument(level = "trace", skip(self))] 37 | pub fn get(&self, file_id: u64, offset: usize) -> Option>> { 38 | let start = Instant::now(); 39 | 40 | let result = self.inner.get(&BlockIndex { file_id, offset }); 41 | 42 | self.metrics 43 | .block_cache_get_latency_histogram 44 | .observe(start.elapsed().as_secs_f64()); 45 | 46 | result 47 | } 48 | 49 | #[tracing::instrument(level = "trace", skip(self, block))] 50 | pub async fn insert(&self, file_id: u64, offset: usize, block: Arc>) { 51 | let start = Instant::now(); 52 | 53 | self.inner 54 | .insert(BlockIndex { file_id, offset }, block) 55 | .await; 56 | 57 | self.metrics 58 | .block_cache_insert_latency_histogram 59 | .observe(start.elapsed().as_secs_f64()); 60 | } 61 | 62 | #[tracing::instrument(level = "trace", skip(self, f))] 63 | pub async fn get_or_insert_with( 64 | &self, 65 | file_id: u64, 66 | offset: usize, 67 | f: F, 68 | ) -> Result>> 69 | where 70 | F: Future>>>, 71 | { 72 | let future = async move { 73 | let start_fill = Instant::now(); 74 | 75 | let r = f.await; 76 | 77 | self.metrics 78 | .block_cache_fill_latency_histogram 79 | .observe(start_fill.elapsed().as_secs_f64()); 80 | 81 | r 82 | }; 83 | 84 | let start = Instant::now(); 85 | 86 | let result = match self 87 | .inner 88 | .get_or_try_insert_with(BlockIndex { file_id, offset }, future) 89 | .await 90 | { 91 | Ok(block) => block, 92 | Err(arc_error) => return Err(RaftLogStoreError::Other(arc_error.to_string()).into()), 93 | }; 94 | 95 | self.metrics 96 | .block_cache_get_latency_histogram 97 | .observe(start.elapsed().as_secs_f64()); 98 | 99 | Ok(result) 100 | } 101 | } 102 | -------------------------------------------------------------------------------- /storage/src/raft_log_store/error.rs: -------------------------------------------------------------------------------- 1 | #[derive(thiserror::Error, Debug)] 2 | pub enum RaftLogStoreError { 3 | #[error("group {0} not exists")] 4 | GroupNotExists(u64), 5 | #[error("group {0} already exists")] 6 | GroupAlreadyExists(u64), 7 | 
#[error("encode error: {0}")] 8 | EncodeError(String), 9 | #[error("decode error: {0}")] 10 | DecodeError(String), 11 | #[error("checksum mismatch: [expected: {expected}] [get: {get}]")] 12 | ChecksumMismatch { expected: u32, get: u32 }, 13 | #[error("io error: {0}")] 14 | IoError(#[from] std::io::Error), 15 | #[error("raft log gap exists: [{start}, {end})")] 16 | RaftLogGap { start: u64, end: u64 }, 17 | #[error("raft log file gap: [{start}, {end})")] 18 | RaftLogFileGap { start: u64, end: u64 }, 19 | #[error("raft log file not found: {0}")] 20 | RaftLogFileNotFound(u64), 21 | #[error("other: {0}")] 22 | Other(String), 23 | } 24 | 25 | impl RaftLogStoreError { 26 | pub fn encode_error(e: impl Into>) -> Self { 27 | Self::EncodeError(e.into().to_string()) 28 | } 29 | 30 | pub fn decode_error(e: impl Into>) -> Self { 31 | Self::DecodeError(e.into().to_string()) 32 | } 33 | } 34 | -------------------------------------------------------------------------------- /storage/src/raft_log_store/metrics.rs: -------------------------------------------------------------------------------- 1 | use std::sync::Arc; 2 | 3 | use lazy_static::lazy_static; 4 | 5 | lazy_static! { 6 | static ref RAFT_LOG_STORE_LATENCY_HISTOGRAM_VEC: prometheus::HistogramVec = 7 | prometheus::register_histogram_vec!( 8 | "raft_log_store_latency_histogram_vec", 9 | "raft log store latency histogram vec", 10 | &["op", "node"], 11 | vec![0.0001, 0.001, 0.005, 0.01, 0.02, 0.05, 0.1, 0.2, 0.5] 12 | ) 13 | .unwrap(); 14 | static ref RAFT_LOG_STORE_BLOCK_CACHE_LATENCY_HISTOGRAM_VEC: prometheus::HistogramVec = 15 | prometheus::register_histogram_vec!( 16 | "raft_log_store_block_cache_latency_histogram_vec", 17 | "raft log store block cache latency histogram vec", 18 | &["op", "node"], 19 | vec![0.00001, 0.0001, 0.0002, 0.0005, 0.001, 0.002, 0.005, 0.01, 0.02, 0.05, 0.1] 20 | ) 21 | .unwrap(); 22 | static ref RAFT_LOG_STORE_THROUGHPUT_GAUGE_VEC: prometheus::GaugeVec = 23 | prometheus::register_gauge_vec!( 24 | "raft_log_store_throughput_gauge_vec", 25 | "raft log store throughput guage vec", 26 | &["op", "node"] 27 | ) 28 | .unwrap(); 29 | static ref RAFT_LOG_STORE_OP_COUNTER_VEC: prometheus::CounterVec = 30 | prometheus::register_counter_vec!( 31 | "raft_log_store_op_counter_vec", 32 | "raft log store op counter vec", 33 | &["op", "node"] 34 | ) 35 | .unwrap(); 36 | static ref RAFT_LOG_STORE_BATCH_WRITERS_HISTOGRAM_VEC: prometheus::HistogramVec = 37 | prometheus::register_histogram_vec!( 38 | "raft_log_store_batch_writers_histogram_vec", 39 | "raft log store batch writers histogram vec", 40 | &["node"], 41 | vec![1.0, 5.0, 10.0, 20.0, 50.0, 100.0, 200.0] 42 | ) 43 | .unwrap(); 44 | static ref RAFT_LOG_STORE_SYNC_SIZE_HISTOGRAM_VEC: prometheus::HistogramVec = 45 | prometheus::register_histogram_vec!( 46 | "raft_log_store_sync_size_histogram_vec", 47 | "raft log store sync size histogram vec", 48 | &["node"], 49 | vec![256.0, 1024.0, 4096.0, 8192.0, 16384.0, 65536.0], 50 | ) 51 | .unwrap(); 52 | } 53 | 54 | pub struct RaftLogStoreMetrics { 55 | pub sync_latency_histogram: prometheus::Histogram, 56 | pub sync_size_histogram: prometheus::Histogram, 57 | 58 | pub append_latency_histogram: prometheus::Histogram, 59 | 60 | pub append_log_latency_histogram: prometheus::Histogram, 61 | pub append_log_throughput_guage: prometheus::Gauge, 62 | 63 | pub batch_writers_histogram: prometheus::Histogram, 64 | 65 | pub block_cache_get_latency_histogram: prometheus::Histogram, 66 | pub block_cache_insert_latency_histogram: prometheus::Histogram, 67 | pub 
block_cache_fill_latency_histogram: prometheus::Histogram, 68 | } 69 | 70 | pub type RaftLogStoreMetricsRef = Arc; 71 | 72 | impl RaftLogStoreMetrics { 73 | pub fn new(node: u64) -> Self { 74 | Self { 75 | sync_latency_histogram: RAFT_LOG_STORE_LATENCY_HISTOGRAM_VEC 76 | .get_metric_with_label_values(&["sync", &node.to_string()]) 77 | .unwrap(), 78 | sync_size_histogram: RAFT_LOG_STORE_SYNC_SIZE_HISTOGRAM_VEC 79 | .get_metric_with_label_values(&[&node.to_string()]) 80 | .unwrap(), 81 | 82 | append_latency_histogram: RAFT_LOG_STORE_LATENCY_HISTOGRAM_VEC 83 | .get_metric_with_label_values(&["append", &node.to_string()]) 84 | .unwrap(), 85 | 86 | append_log_latency_histogram: RAFT_LOG_STORE_LATENCY_HISTOGRAM_VEC 87 | .get_metric_with_label_values(&["append_log", &node.to_string()]) 88 | .unwrap(), 89 | append_log_throughput_guage: RAFT_LOG_STORE_THROUGHPUT_GAUGE_VEC 90 | .get_metric_with_label_values(&["append_log", &node.to_string()]) 91 | .unwrap(), 92 | 93 | block_cache_get_latency_histogram: RAFT_LOG_STORE_BLOCK_CACHE_LATENCY_HISTOGRAM_VEC 94 | .get_metric_with_label_values(&["block_cache_get", &node.to_string()]) 95 | .unwrap(), 96 | block_cache_insert_latency_histogram: RAFT_LOG_STORE_BLOCK_CACHE_LATENCY_HISTOGRAM_VEC 97 | .get_metric_with_label_values(&["block_cache_insert", &node.to_string()]) 98 | .unwrap(), 99 | block_cache_fill_latency_histogram: RAFT_LOG_STORE_BLOCK_CACHE_LATENCY_HISTOGRAM_VEC 100 | .get_metric_with_label_values(&["block_cache_fill", &node.to_string()]) 101 | .unwrap(), 102 | 103 | batch_writers_histogram: RAFT_LOG_STORE_BATCH_WRITERS_HISTOGRAM_VEC 104 | .get_metric_with_label_values(&[&node.to_string()]) 105 | .unwrap(), 106 | } 107 | } 108 | } 109 | -------------------------------------------------------------------------------- /storage/src/raft_log_store/mod.rs: -------------------------------------------------------------------------------- 1 | pub mod block_cache; 2 | pub mod entry; 3 | pub mod error; 4 | pub mod file; 5 | pub mod log; 6 | pub mod mem; 7 | pub mod metrics; 8 | pub mod queue; 9 | pub mod store; 10 | 11 | const DEFAULT_LOG_BATCH_SIZE: usize = 8 << 10; 12 | 13 | pub use store::RaftLogStore; 14 | -------------------------------------------------------------------------------- /storage/src/raft_log_store/queue.rs: -------------------------------------------------------------------------------- 1 | use std::collections::VecDeque; 2 | use std::sync::Arc; 3 | 4 | use parking_lot::RwLock; 5 | 6 | use super::file::{ActiveFile, FrozenFile}; 7 | use crate::error::Result; 8 | use crate::raft_log_store::error::RaftLogStoreError; 9 | 10 | #[derive(Debug)] 11 | pub enum LogFile { 12 | Active(ActiveFile), 13 | Frozen(FrozenFile), 14 | } 15 | 16 | struct LogQueueCore { 17 | active: ActiveFile, 18 | frozens: VecDeque, 19 | } 20 | 21 | #[derive(Clone)] 22 | pub struct LogQueue { 23 | node: u64, 24 | core: Arc>, 25 | } 26 | 27 | impl std::fmt::Debug for LogQueue { 28 | fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { 29 | f.debug_struct("LogQueue") 30 | .field("node", &self.node) 31 | .finish() 32 | } 33 | } 34 | 35 | impl LogQueue { 36 | #[tracing::instrument(level = "trace", err)] 37 | pub fn init(node: u64, active: ActiveFile, mut frozens: Vec) -> Result { 38 | frozens.sort_by_key(|frozen| frozen.id()); 39 | if !frozens.is_empty() { 40 | let mut id = frozens.first().unwrap().id(); 41 | for frozen in frozens.iter() { 42 | if frozen.id() != id { 43 | return Err(RaftLogStoreError::RaftLogFileGap { 44 | start: id, 45 | end: frozen.id(), 46 | } 
47 | .into()); 48 | } 49 | id += 1; 50 | } 51 | if active.id() != id { 52 | return Err(RaftLogStoreError::RaftLogFileGap { 53 | start: id, 54 | end: active.id(), 55 | } 56 | .into()); 57 | } 58 | } 59 | Ok(Self { 60 | node, 61 | core: Arc::new(RwLock::new(LogQueueCore { 62 | active, 63 | frozens: VecDeque::from_iter(frozens), 64 | })), 65 | }) 66 | } 67 | 68 | #[tracing::instrument(level = "trace")] 69 | pub async fn rotate(&self, active: ActiveFile, frozen: FrozenFile) { 70 | let mut core = self.core.write(); 71 | 72 | core.active = active; 73 | core.frozens.push_back(frozen); 74 | } 75 | 76 | #[tracing::instrument(level = "trace", ret)] 77 | pub fn active(&self) -> ActiveFile { 78 | self.core.read().active.clone() 79 | } 80 | 81 | #[tracing::instrument(level = "trace", ret, err)] 82 | pub fn file(&self, id: u64) -> Result { 83 | let core = self.core.read(); 84 | if id == core.active.id() { 85 | return Ok(LogFile::Active(core.active.clone())); 86 | } 87 | if core.frozens.is_empty() { 88 | return Err(RaftLogStoreError::RaftLogFileNotFound(id).into()); 89 | } 90 | let first = core.frozens[0].id(); 91 | if id < first || (id - first) as usize >= core.frozens.len() { 92 | return Err(RaftLogStoreError::RaftLogFileNotFound(id).into()); 93 | } 94 | Ok(LogFile::Frozen(core.frozens[(id - first) as usize].clone())) 95 | } 96 | 97 | #[tracing::instrument(level = "trace", ret)] 98 | pub fn frozen_file_count(&self) -> usize { 99 | self.core.read().frozens.len() 100 | } 101 | 102 | #[tracing::instrument(level = "trace", ret)] 103 | pub fn frozens(&self) -> Vec { 104 | let frozens = self.core.read().frozens.clone(); 105 | Vec::from_iter(frozens) 106 | } 107 | } 108 | 109 | #[cfg(test)] 110 | mod tests { 111 | use super::*; 112 | 113 | fn is_send_sync_clone() {} 114 | 115 | #[test] 116 | fn ensure_send_sync_clone() { 117 | is_send_sync_clone::(); 118 | } 119 | } 120 | -------------------------------------------------------------------------------- /storage/src/tiered_cache/file_cache/alloc.rs: -------------------------------------------------------------------------------- 1 | // Copyright 2022 Singularity Data 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
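// A brief, hedged note on why this allocator exists: buffers handed to direct
// I/O (`O_DIRECT`) must be aligned to the device's logical block size or the
// reads and writes fail with `EINVAL`; the allocator below over-aligns every
// allocation to `ALIGN` so that `DioBuffer`s satisfy that requirement.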
14 | 15 | pub struct AlignedAllocator; 16 | 17 | use std::alloc::{Allocator, Global}; 18 | 19 | use super::utils; 20 | 21 | unsafe impl Allocator for AlignedAllocator { 22 | fn allocate( 23 | &self, 24 | layout: std::alloc::Layout, 25 | ) -> Result, std::alloc::AllocError> { 26 | let layout = std::alloc::Layout::from_size_align( 27 | layout.size(), 28 | utils::align_up(ALIGN, layout.align()), 29 | ) 30 | .unwrap(); 31 | Global.allocate(layout) 32 | } 33 | 34 | unsafe fn deallocate(&self, ptr: std::ptr::NonNull, layout: std::alloc::Layout) { 35 | let layout = std::alloc::Layout::from_size_align( 36 | layout.size(), 37 | utils::align_up(ALIGN, layout.align()), 38 | ) 39 | .unwrap(); 40 | Global.deallocate(ptr, layout) 41 | } 42 | } 43 | 44 | #[cfg(test)] 45 | mod tests { 46 | use super::*; 47 | 48 | #[test] 49 | fn test_aligned_buffer() { 50 | const ALIGN: usize = 512; 51 | let allocator = AlignedAllocator::; 52 | 53 | let mut buf: Vec = Vec::with_capacity_in(ALIGN * 8, &allocator); 54 | utils::assert_aligned(ALIGN, buf.as_ptr().addr()); 55 | 56 | buf.extend_from_slice(&[b'x'; ALIGN * 8]); 57 | utils::assert_aligned(ALIGN, buf.as_ptr().addr()); 58 | assert_eq!(buf, [b'x'; ALIGN * 8]); 59 | 60 | buf.extend_from_slice(&[b'x'; ALIGN * 8]); 61 | utils::assert_aligned(ALIGN, buf.as_ptr().addr()); 62 | assert_eq!(buf, [b'x'; ALIGN * 16]) 63 | } 64 | } 65 | -------------------------------------------------------------------------------- /storage/src/tiered_cache/file_cache/buffer.rs: -------------------------------------------------------------------------------- 1 | // Copyright 2022 Singularity Data 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | use std::sync::Arc; 16 | 17 | use parking_lot::RwLock; 18 | 19 | use super::LRU_SHARD_BITS; 20 | use crate::tiered_cache::{TieredCacheEntryHolder, TieredCacheKey, TieredCacheValue}; 21 | use crate::utils::lru_cache::LruCache; 22 | 23 | pub type Buffer = Arc>; 24 | 25 | struct TwoLevelBufferCore 26 | where 27 | K: TieredCacheKey, 28 | V: TieredCacheValue, 29 | { 30 | active_buffer: Buffer, 31 | frozen_buffer: Buffer, 32 | } 33 | 34 | impl TwoLevelBufferCore 35 | where 36 | K: TieredCacheKey, 37 | V: TieredCacheValue, 38 | { 39 | fn swap(&mut self) { 40 | // Swap fields of `&mut self` to avoid the borrow checker complaining. 
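// Note: this only exchanges the two `Arc` handles in place; the underlying
// LRU shards are untouched, so the swap itself is O(1) under the outer
// `RwLock` write guard held by the caller.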
41 | std::mem::swap(&mut self.active_buffer, &mut self.frozen_buffer); 42 | } 43 | } 44 | 45 | pub struct TwoLevelBuffer 46 | where 47 | K: TieredCacheKey, 48 | V: TieredCacheValue, 49 | { 50 | capacity: usize, 51 | core: Arc>>, 52 | } 53 | 54 | impl Clone for TwoLevelBuffer 55 | where 56 | K: TieredCacheKey, 57 | V: TieredCacheValue, 58 | { 59 | fn clone(&self) -> Self { 60 | Self { 61 | capacity: self.capacity, 62 | core: Arc::clone(&self.core), 63 | } 64 | } 65 | } 66 | 67 | impl TwoLevelBuffer 68 | where 69 | K: TieredCacheKey, 70 | V: TieredCacheValue, 71 | { 72 | pub fn new(capacity: usize) -> Self { 73 | Self { 74 | capacity, 75 | core: Arc::new(RwLock::new(TwoLevelBufferCore { 76 | active_buffer: Arc::new(LruCache::new(LRU_SHARD_BITS, capacity)), 77 | frozen_buffer: Arc::new(LruCache::new(LRU_SHARD_BITS, capacity)), 78 | })), 79 | } 80 | } 81 | 82 | pub fn insert(&self, hash: u64, key: K, charge: usize, value: V) { 83 | let core = self.core.read(); 84 | core.active_buffer.insert(key, hash, charge, value); 85 | } 86 | 87 | pub fn get(&self, hash: u64, key: &K) -> Option> { 88 | let core = self.core.read(); 89 | if let Some(entry) = core.active_buffer.lookup(hash, key) { 90 | return Some(TieredCacheEntryHolder::from_cached_value(entry)); 91 | } 92 | if let Some(entry) = core.frozen_buffer.lookup(hash, key) { 93 | return Some(TieredCacheEntryHolder::from_cached_value(entry)); 94 | } 95 | None 96 | } 97 | 98 | pub fn erase(&self, hash: u64, key: &K) { 99 | let core = self.core.read(); 100 | core.active_buffer.erase(hash, key); 101 | core.frozen_buffer.erase(hash, key); 102 | } 103 | 104 | pub fn active(&self) -> Buffer { 105 | self.core.read().active_buffer.clone() 106 | } 107 | 108 | pub fn frozen(&self) -> Buffer { 109 | self.core.read().frozen_buffer.clone() 110 | } 111 | 112 | pub fn swap(&self) { 113 | self.core.write().swap(); 114 | } 115 | 116 | pub fn rotate(&self) -> Buffer { 117 | let mut buffer = Arc::new(LruCache::new(LRU_SHARD_BITS, self.capacity)); 118 | let mut core = self.core.write(); 119 | std::mem::swap(&mut buffer, &mut core.active_buffer); 120 | std::mem::swap(&mut buffer, &mut core.frozen_buffer); 121 | buffer 122 | } 123 | } 124 | -------------------------------------------------------------------------------- /storage/src/tiered_cache/file_cache/error.rs: -------------------------------------------------------------------------------- 1 | // Copyright 2022 Singularity Data 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | 15 | #[derive(thiserror::Error, Debug)] 16 | pub enum Error { 17 | #[error("io error: {0}")] 18 | Io(#[from] std::io::Error), 19 | #[error("nix error: {0}")] 20 | Nix(#[from] nix::errno::Errno), 21 | #[error("unsupported file system, super block magic: {0}")] 22 | UnsupportedFilesystem(i64), 23 | #[error("invalid slot: {0}")] 24 | InvalidSlot(usize), 25 | #[error("other error: {0}")] 26 | Other(String), 27 | } 28 | 29 | pub type Result<T> = core::result::Result<T, Error>; 30 | -------------------------------------------------------------------------------- /storage/src/tiered_cache/file_cache/mod.rs: -------------------------------------------------------------------------------- 1 | // Copyright 2022 Singularity Data 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | pub mod alloc; 16 | pub mod buffer; 17 | pub mod cache; 18 | pub mod error; 19 | pub mod file; 20 | pub mod meta; 21 | pub mod metrics; 22 | pub mod store; 23 | pub mod utils; 24 | 25 | #[cfg(test)] 26 | pub mod test_utils; 27 | 28 | async fn asyncify<F, T>(f: F) -> error::Result<T> 29 | where 30 | F: FnOnce() -> error::Result<T> + Send + 'static, 31 | T: Send + 'static, 32 | { 33 | match tokio::task::spawn_blocking(f).await { 34 | Ok(res) => res, 35 | Err(_) => Err(error::Error::Other("background task failed".to_string())), 36 | } 37 | } 38 | 39 | /// The logical block size of the underlying storage (typically 512 bytes). 40 | /// 41 | /// Can be determined using the `ioctl(2)` `BLKSSZGET` operation or from the shell using the command: 42 | /// 43 | /// ```bash 44 | /// blockdev --getss 45 | /// ``` 46 | /// 47 | /// For more details, see the man open(2) NOTES section. 48 | const LOGICAL_BLOCK_SIZE: usize = 512; 49 | /// Size of `st_blocks` with `fstat(2)`. 50 | const ST_BLOCK_SIZE: usize = 512; 51 | 52 | const LRU_SHARD_BITS: usize = 5; 53 | 54 | type DioBuffer = Vec<u8, &'static alloc::AlignedAllocator<LOGICAL_BLOCK_SIZE>>; 55 | 56 | static DIO_BUFFER_ALLOCATOR: alloc::AlignedAllocator<LOGICAL_BLOCK_SIZE> = 57 | alloc::AlignedAllocator::<LOGICAL_BLOCK_SIZE>; 58 | -------------------------------------------------------------------------------- /storage/src/tiered_cache/file_cache/test_utils.rs: -------------------------------------------------------------------------------- 1 | // Copyright 2022 Singularity Data 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License.
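// Test-only helpers for the file cache: a fixed-size test key/value pair, a channel-driven flush hook, and a `datasize` helper that measures the real on-disk size of a sparse file.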
14 | 15 | use std::path::Path; 16 | use std::sync::Arc; 17 | 18 | use async_trait::async_trait; 19 | use bytes::{Buf, BufMut}; 20 | use tokio::sync::{mpsc, Mutex}; 21 | 22 | use super::cache::FlushBufferHook; 23 | use super::error::Result; 24 | use crate::tiered_cache::{TieredCacheKey, TieredCacheValue}; 25 | 26 | #[derive(Clone, Hash, Debug, PartialEq, Eq)] 27 | pub struct TestCacheKey(pub u64); 28 | 29 | impl TieredCacheKey for TestCacheKey { 30 | fn encoded_len() -> usize { 31 | 8 32 | } 33 | 34 | fn encode(&self, mut buf: &mut [u8]) { 35 | buf.put_u64(self.0); 36 | } 37 | 38 | fn decode(mut buf: &[u8]) -> Self { 39 | Self(buf.get_u64()) 40 | } 41 | } 42 | 43 | pub type TestCacheValue = Vec<u8>; 44 | 45 | impl TieredCacheValue for Vec<u8> { 46 | fn len(&self) -> usize { 47 | Vec::len(self) 48 | } 49 | 50 | fn encoded_len(&self) -> usize { 51 | self.len() 52 | } 53 | 54 | fn encode(&self, mut buf: &mut [u8]) { 55 | buf.put_slice(self) 56 | } 57 | 58 | fn decode(buf: Vec<u8>) -> Self { 59 | buf.to_vec() 60 | } 61 | } 62 | 63 | pub fn key(v: u64) -> TestCacheKey { 64 | TestCacheKey(v) 65 | } 66 | 67 | #[derive(Clone)] 68 | pub struct FlushHolder { 69 | pre_sender: mpsc::UnboundedSender<()>, 70 | pre_receiver: Arc<Mutex<mpsc::UnboundedReceiver<()>>>, 71 | 72 | post_sender: mpsc::UnboundedSender<()>, 73 | post_receiver: Arc<Mutex<mpsc::UnboundedReceiver<()>>>, 74 | } 75 | 76 | impl Default for FlushHolder { 77 | fn default() -> Self { 78 | let (tx0, rx0) = mpsc::unbounded_channel(); 79 | let (tx1, rx1) = mpsc::unbounded_channel(); 80 | Self { 81 | pre_sender: tx0, 82 | pre_receiver: Arc::new(Mutex::new(rx0)), 83 | 84 | post_sender: tx1, 85 | post_receiver: Arc::new(Mutex::new(rx1)), 86 | } 87 | } 88 | } 89 | 90 | impl FlushHolder { 91 | pub fn trigger(&self) { 92 | self.pre_sender.send(()).unwrap(); 93 | } 94 | 95 | pub async fn wait(&self) { 96 | self.post_receiver.lock().await.recv().await.unwrap(); 97 | } 98 | } 99 | 100 | #[async_trait] 101 | impl FlushBufferHook for FlushHolder { 102 | async fn pre_flush(&self) -> Result<()> { 103 | self.pre_receiver.lock().await.recv().await.unwrap(); 104 | Ok(()) 105 | } 106 | 107 | async fn post_flush(&self, _bytes: usize) -> Result<()> { 108 | self.post_sender.send(()).unwrap(); 109 | Ok(()) 110 | } 111 | } 112 | 113 | /// `datasize()` returns the actual data size of a file. 114 | /// 115 | /// File systems like ext4 take metadata blocks into account in `stat.st_blocks` of `fstat(2)`. 116 | /// So it's not accurate if you really want to know the data size of a sparse file with `fstat`. 117 | /// 118 | /// `datasize` is implemented by iterating the `fiemap` of the file. 119 | pub fn datasize(path: impl AsRef<Path>) -> Result<usize> { 120 | let mut size = 0; 121 | 122 | let fm = fiemap::fiemap(path)?; 123 | for fe in fm { 124 | let fe = fe.unwrap(); 125 | size += fe.fe_length as usize; 126 | } 127 | 128 | Ok(size) 129 | } 130 | -------------------------------------------------------------------------------- /storage/src/utils/bloom.rs: -------------------------------------------------------------------------------- 1 | // Ported from [AgateDB](https://github.com/tikv/agatedb) with [license](https://github.com/tikv/agatedb/blob/master/LICENSE). 2 | 3 | // TODO: Refactor this in rusty style.
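// A classic Bloom filter: each key contributes k probe positions derived from a single 32-bit hash via double hashing (a rotated delta added repeatedly), so lookups may return false positives but never false negatives.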
4 | 5 | use bytes::BufMut; 6 | 7 | pub trait BitSlice { 8 | fn get_bit(&self, idx: usize) -> bool; 9 | fn bit_len(&self) -> usize; 10 | } 11 | 12 | pub trait BitSliceMut { 13 | fn set_bit(&mut self, idx: usize, val: bool); 14 | } 15 | 16 | impl<T: AsRef<[u8]>> BitSlice for T { 17 | fn get_bit(&self, idx: usize) -> bool { 18 | let pos = idx / 8; 19 | let offset = idx % 8; 20 | (self.as_ref()[pos] & (1 << offset)) != 0 21 | } 22 | 23 | fn bit_len(&self) -> usize { 24 | self.as_ref().len() * 8 25 | } 26 | } 27 | 28 | impl<T: AsMut<[u8]>> BitSliceMut for T { 29 | fn set_bit(&mut self, idx: usize, val: bool) { 30 | let pos = idx / 8; 31 | let offset = idx % 8; 32 | if val { 33 | self.as_mut()[pos] |= 1 << offset; 34 | } else { 35 | self.as_mut()[pos] &= !(1 << offset); 36 | } 37 | } 38 | } 39 | 40 | /// Bloom implements bloom filter functionalities over 41 | /// a bit-slice of data. 42 | pub struct Bloom<'a> { 43 | /// data of filter in bits 44 | filter: &'a [u8], 45 | /// number of hash functions 46 | k: u8, 47 | } 48 | 49 | impl<'a> Bloom<'a> { 50 | /// Create a bloom filter from a byte slice 51 | pub fn new(buf: &'a [u8]) -> Self { 52 | let filter = &buf[..buf.len() - 1]; 53 | let k = buf[buf.len() - 1]; 54 | Self { filter, k } 55 | } 56 | 57 | /// Get bloom filter bits per key from entries count and FPR 58 | pub fn bloom_bits_per_key(entries: usize, false_positive_rate: f64) -> usize { 59 | let size = 60 | -1.0 * (entries as f64) * false_positive_rate.ln() / std::f64::consts::LN_2.powi(2); 61 | let locs = (std::f64::consts::LN_2 * size / (entries as f64)).ceil(); 62 | locs as usize 63 | } 64 | 65 | /// Build bloom filter from key hashes 66 | pub fn build_from_key_hashes(keys: &[u32], bits_per_key: usize) -> Vec<u8> { 67 | // 0.69 is approximately ln(2) 68 | let k = ((bits_per_key as f64) * 0.69) as u32; 69 | // limit k in [1, 30] 70 | let k = k.clamp(1, 30); 71 | // For small len(keys), we set a minimum bloom filter length to avoid high FPR 72 | let nbits = (keys.len() * bits_per_key).max(64); 73 | let nbytes = (nbits + 7) / 8; 74 | // nbits is always a multiple of 8 75 | let nbits = nbytes * 8; 76 | let mut filter = Vec::with_capacity(nbytes + 1); 77 | filter.resize(nbytes, 0); 78 | for h in keys { 79 | let mut h = *h; 80 | let delta = (h >> 17) | (h << 15); 81 | for _ in 0..k { 82 | let bit_pos = (h as usize) % nbits; 83 | filter.set_bit(bit_pos, true); 84 | h = h.wrapping_add(delta); 85 | } 86 | } 87 | filter.put_u8(k as u8); 88 | filter 89 | } 90 | 91 | /// Check if a bloom filter may contain some data 92 | pub fn may_contain(&self, mut h: u32) -> bool { 93 | if self.k > 30 { 94 | // potential new encoding for short bloom filters 95 | true 96 | } else { 97 | let nbits = self.filter.bit_len(); 98 | let delta = (h >> 17) | (h << 15); 99 | for _ in 0..self.k { 100 | let bit_pos = h % (nbits as u32); 101 | if !self.filter.get_bit(bit_pos as usize) { 102 | return false; 103 | } 104 | h = h.wrapping_add(delta); 105 | } 106 | true 107 | } 108 | } 109 | } 110 | 111 | #[cfg(test)] 112 | mod tests { 113 | use test_log::test; 114 | 115 | use super::*; 116 | 117 | #[test] 118 | fn test_small_bloom_filter() { 119 | let hash: Vec<u32> = vec![b"hello".to_vec(), b"world".to_vec()] 120 | .into_iter() 121 | .map(|x| farmhash::fingerprint32(&x)) 122 | .collect(); 123 | let buf = Bloom::build_from_key_hashes(&hash, 10); 124 | 125 | let check_hash: Vec<u32> = vec![ 126 | b"hello".to_vec(), 127 | b"world".to_vec(), 128 | b"x".to_vec(), 129 | b"fool".to_vec(), 130 | ] 131 | .into_iter() 132 | .map(|x| farmhash::fingerprint32(&x)) 133 | .collect();
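// The two inserted keys must be reported as (possibly) present; the two keys that were never added should be rejected, which is expected with high probability at 10 bits per key.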
134 | 135 | let f = Bloom::new(&buf); 136 | assert_eq!(f.k, 6); 137 | 138 | assert!(f.may_contain(check_hash[0])); 139 | assert!(f.may_contain(check_hash[1])); 140 | assert!(!f.may_contain(check_hash[2])); 141 | assert!(!f.may_contain(check_hash[3])); 142 | } 143 | } 144 | -------------------------------------------------------------------------------- /storage/src/utils/mod.rs: -------------------------------------------------------------------------------- 1 | mod coding; 2 | pub use coding::*; 3 | mod bloom; 4 | pub use bloom::*; 5 | pub mod lru_cache; 6 | -------------------------------------------------------------------------------- /tests/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "runkv-tests" 3 | version = "0.1.0" 4 | edition = "2021" 5 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html 6 | 7 | [[test]] 8 | name = "integrations" 9 | path = "integrations/lib.rs" 10 | 11 | [dependencies] 12 | anyhow = "1.0" 13 | bytes = "1" 14 | bytesize = "1.1.0" 15 | clap = { version = "3.1.6", features = ["derive"] } 16 | env_logger = "*" 17 | futures = "0.3" 18 | itertools = "0.10.3" 19 | lazy_static = "1.4.0" 20 | parking_lot = "0.12" 21 | rand = "0.8.5" 22 | runkv-client = { path = "../client" } 23 | runkv-common = { path = "../common" } 24 | runkv-exhauster = { path = "../exhauster" } 25 | runkv-proto = { path = "../proto" } 26 | runkv-rudder = { path = "../rudder" } 27 | runkv-storage = { path = "../storage" } 28 | runkv-wheel = { path = "../wheel" } 29 | tempfile = "3" 30 | test-log = "0.2.10" 31 | tokio = { version = "1", features = [ 32 | "rt-multi-thread", 33 | "sync", 34 | "macros", 35 | "time", 36 | ] } 37 | toml = "0.4.2" 38 | tonic = "0.6.2" 39 | tracing = "0.1" 40 | 41 | [features] 42 | tracing = ["runkv-wheel/tracing"] 43 | deadlock = ["parking_lot/deadlock_detection"] 44 | verbose-release-log = ["tracing/release_max_level_trace"] 45 | -------------------------------------------------------------------------------- /tests/etc/exhauster.toml: -------------------------------------------------------------------------------- 1 | id = 201 2 | host = "127.0.0.1" 3 | port = 0 4 | data_path = "data" 5 | meta_path = "meta" 6 | heartbeat_interval = "1 s" 7 | 8 | [rudder] 9 | id = 1 10 | host = "127.0.0.1" 11 | port = 0 12 | 13 | [minio] 14 | url = 'minio://minioadmin:minioadmin@127.0.0.1:9000/runkv' 15 | 16 | [s3] 17 | bucket = "runkv" 18 | 19 | [buffer] 20 | write_buffer_capacity = "64 MiB" 21 | 22 | [cache] 23 | block_cache_capacity = "512 MiB" 24 | meta_cache_capacity = "256 MiB" 25 | -------------------------------------------------------------------------------- /tests/etc/lsm_tree.toml: -------------------------------------------------------------------------------- 1 | [lsm_tree] 2 | l1_capacity = "20 MiB" 3 | level_multiplier = 2 4 | 5 | trigger_l0_compaction_ssts = 4 6 | trigger_l0_compaction_interval = "500 ms" 7 | trigger_lmax_compaction_interval = "5 s" 8 | trigger_compaction_interval = "2 s" 9 | 10 | sstable_capacity = "4 MiB" 11 | block_capacity = "1 MiB" 12 | restart_interval = 2 13 | bloom_false_positive = 0.1 14 | 15 | compaction_pin_ttl = "15 s" 16 | 17 | [[lsm_tree.levels_options]] 18 | compaction_strategy = "Overlap" 19 | compression_algorithm = "None" 20 | 21 | [[lsm_tree.levels_options]] 22 | compaction_strategy = "NonOverlap" 23 | compression_algorithm = "None" 24 | 25 | [[lsm_tree.levels_options]] 26 | compaction_strategy = "NonOverlap" 27 | 
compression_algorithm = "None" 28 | 29 | [[lsm_tree.levels_options]] 30 | compaction_strategy = "NonOverlap" 31 | compression_algorithm = "Lz4" 32 | -------------------------------------------------------------------------------- /tests/etc/port.toml: -------------------------------------------------------------------------------- 1 | test_concurrent_put_get = 12300 2 | test_multi_raft_group_concurrent_put_get = 12310 -------------------------------------------------------------------------------- /tests/etc/rudder.toml: -------------------------------------------------------------------------------- 1 | id = 1 2 | host = "127.0.0.1" 3 | port = 0 4 | data_path = "data" 5 | meta_path = "meta" 6 | health_timeout = "10 s" 7 | 8 | # [minio] 9 | # url = 'minio://minioadmin:minioadmin@127.0.0.1:9000/runkv' 10 | 11 | # [s3] 12 | # bucket = "runkv" 13 | 14 | [cache] 15 | block_cache_capacity = "0 B" 16 | meta_cache_capacity = "64 kiB" 17 | -------------------------------------------------------------------------------- /tests/etc/wheel.toml: -------------------------------------------------------------------------------- 1 | id = 101 2 | host = "127.0.0.1" 3 | port = 0 4 | log = ".run/log/" 5 | data_path = "data" 6 | meta_path = "meta" 7 | poll_interval = "100ms" 8 | heartbeat_interval = "100ms" 9 | 10 | [rudder] 11 | id = 1 12 | host = "127.0.0.1" 13 | port = 0 14 | 15 | # [minio] 16 | # url = 'minio://minioadmin:minioadmin@127.0.0.1:9000/runkv' 17 | 18 | # [s3] 19 | # bucket = "runkv" 20 | 21 | [buffer] 22 | write_buffer_capacity = "1 MiB" 23 | 24 | [cache] 25 | block_cache_capacity = "16 MiB" 26 | meta_cache_capacity = "4 MiB" 27 | 28 | [raft_log_store] 29 | log_dir_path = "/path/to/log/dir" 30 | log_file_capacity = "64 MiB" 31 | block_cache_capacity = "256 MiB" 32 | persist = "sync" 33 | 34 | [prometheus] 35 | host = "127.0.0.1" 36 | port = 9898 37 | 38 | [tiered_cache] 39 | type = "FileCache" 40 | [tiered_cache.args] 41 | dir = "/path/to/file/cache/dir" 42 | capacity = "256 MiB" 43 | total_buffer_capacity = "64 MiB" 44 | cache_file_fallocate_unit = "64 MiB" 45 | cache_meta_fallocate_unit = "16 MiB" 46 | cache_file_max_write_size = "2 MiB" -------------------------------------------------------------------------------- /tests/integrations/lib.rs: -------------------------------------------------------------------------------- 1 | use std::fs::read_to_string; 2 | 3 | mod test_concurrent_put_get; 4 | mod test_multi_raft_group_concurrent_put_get; 5 | 6 | const PORT_CONFIG_PATH: &str = "etc/port.toml"; 7 | 8 | const RUDDER_CONFIG_PATH: &str = "etc/rudder.toml"; 9 | const WHEEL_CONFIG_PATH: &str = "etc/wheel.toml"; 10 | const EXHAUSTER_CONFIG_PATH: &str = "etc/exhauster.toml"; 11 | const LSM_TREE_CONFIG_PATH: &str = "etc/lsm_tree.toml"; 12 | 13 | fn port(name: &str) -> u16 { 14 | let table = read_to_string(PORT_CONFIG_PATH) 15 | .unwrap() 16 | .parse::() 17 | .unwrap(); 18 | let value = match table { 19 | toml::Value::Table(ports) => ports[name].clone(), 20 | _ => unreachable!(), 21 | }; 22 | match value { 23 | toml::Value::Integer(port) => port as u16, 24 | _ => unreachable!(), 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /tests/integrations/test_concurrent_put_get.rs: -------------------------------------------------------------------------------- 1 | use std::path::Path; 2 | 3 | use runkv_tests::{run, Args, Options}; 4 | use test_log::test; 5 | 6 | use crate::*; 7 | 8 | #[test(tokio::test)] 9 | async fn test_concurrent_put_get() { 10 | let 
port = crate::port("test_concurrent_put_get"); 11 | 12 | let options = Options { 13 | log: false, 14 | rudder_config_path: RUDDER_CONFIG_PATH.to_string(), 15 | wheel_config_path: WHEEL_CONFIG_PATH.to_string(), 16 | exhauster_config_path: EXHAUSTER_CONFIG_PATH.to_string(), 17 | lsm_tree_config_path: LSM_TREE_CONFIG_PATH.to_string(), 18 | rudder_node_id: 10000, 19 | wheel_node_id_base: 0, 20 | exhauster_node_id_base: 100, 21 | rudder_port: port, 22 | exhauster_port_base: port, 23 | wheel_port_base: port + 1, 24 | wheel_prometheus_port_base: 0, 25 | }; 26 | 27 | let tempdir = tempfile::tempdir().unwrap(); 28 | let raft_log_store_data_dir = Path::new(tempdir.path()) 29 | .join("raft") 30 | .to_str() 31 | .unwrap() 32 | .to_string(); 33 | let log_dir = Path::new(tempdir.path()) 34 | .join("log") 35 | .to_str() 36 | .unwrap() 37 | .to_string(); 38 | let file_cache_dir = Path::new(tempdir.path()) 39 | .join("filecache") 40 | .to_str() 41 | .unwrap() 42 | .to_string(); 43 | 44 | let args = Args { 45 | wheels: 1, 46 | exhausters: 1, 47 | groups: 1, 48 | key_size: 64, 49 | value_size: 64, 50 | concurrency: 1000, 51 | r#loop: 3, 52 | raft_log_store_data_dir, 53 | persist: "none".to_string(), 54 | log_dir, 55 | file_cache_dir, 56 | s3_uri: "memory://".to_string(), 57 | }; 58 | 59 | run(args, options).await; 60 | } 61 | -------------------------------------------------------------------------------- /tests/integrations/test_multi_raft_group_concurrent_put_get.rs: -------------------------------------------------------------------------------- 1 | use std::path::Path; 2 | 3 | use runkv_tests::{run, Args, Options}; 4 | use test_log::test; 5 | 6 | use crate::*; 7 | 8 | #[test(tokio::test)] 9 | async fn test_multi_raft_group_concurrent_put_get() { 10 | let port = crate::port("test_multi_raft_group_concurrent_put_get"); 11 | 12 | let options = Options { 13 | log: false, 14 | rudder_config_path: RUDDER_CONFIG_PATH.to_string(), 15 | wheel_config_path: WHEEL_CONFIG_PATH.to_string(), 16 | exhauster_config_path: EXHAUSTER_CONFIG_PATH.to_string(), 17 | lsm_tree_config_path: LSM_TREE_CONFIG_PATH.to_string(), 18 | rudder_node_id: 10000, 19 | wheel_node_id_base: 0, 20 | exhauster_node_id_base: 100, 21 | rudder_port: port, 22 | exhauster_port_base: port, 23 | wheel_port_base: port + 1, 24 | wheel_prometheus_port_base: 0, 25 | }; 26 | 27 | let tempdir = tempfile::tempdir().unwrap(); 28 | let raft_log_store_data_dir = Path::new(tempdir.path()) 29 | .join("raft") 30 | .to_str() 31 | .unwrap() 32 | .to_string(); 33 | let log_dir = Path::new(tempdir.path()) 34 | .join("log") 35 | .to_str() 36 | .unwrap() 37 | .to_string(); 38 | let file_cache_dir = Path::new(tempdir.path()) 39 | .join("filecache") 40 | .to_str() 41 | .unwrap() 42 | .to_string(); 43 | 44 | let args = Args { 45 | wheels: 1, 46 | exhausters: 1, 47 | groups: 10, 48 | key_size: 64, 49 | value_size: 64, 50 | concurrency: 100, 51 | r#loop: 3, 52 | raft_log_store_data_dir, 53 | persist: "none".to_string(), 54 | log_dir, 55 | file_cache_dir, 56 | s3_uri: "memory://".to_string(), 57 | }; 58 | 59 | run(args, options).await; 60 | } 61 | -------------------------------------------------------------------------------- /wheel/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "runkv-wheel" 3 | version = "0.1.0" 4 | edition = "2021" 5 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html 6 | 7 | [dependencies] 8 | anyhow = "1.0" 9 | async-trait = "0.1" 10 | 
bincode = "1.3.3" 11 | bytes = "1" 12 | bytesize = { version = "1.1.0", features = ["serde"] } 13 | chrono = { version = "0.4", features = ["serde"] } 14 | clap = { version = "3.1.6", features = ["derive"] } 15 | futures = "0.3" 16 | futures-util = "0.3" 17 | http = "0.2.6" 18 | humantime = "2.1.0" 19 | humantime-serde = "1.1.1" 20 | hyper = { version = "^0.14", features = ["server", "http1", "tcp"] } 21 | itertools = "0.10.3" 22 | lazy_static = "1.4.0" 23 | moka = { version = "0.7", features = ["future"] } 24 | parking_lot = "0.12" 25 | prometheus = "0.13.0" 26 | prost = "0.9" 27 | raft = { git = "https://github.com/mrcroxx/raft-rs", rev = "710b3a9cf2342cdcc1d7b43e945490945024ecd2" } 28 | # Uncomment this line if you want to debug raft-rs locally. 29 | # raft = { path = "../../raft-rs" } 30 | rand = "0.8.5" 31 | runkv-common = { path = "../common" } 32 | runkv-proto = { path = "../proto" } 33 | runkv-storage = { path = "../storage" } 34 | serde = "1.0" 35 | serde_derive = "1.0" 36 | slog = "2.7" # Required by "raft". 37 | tempfile = "3" 38 | thiserror = "1.0" 39 | tokio = { version = "1", features = [ 40 | "rt-multi-thread", 41 | "sync", 42 | "macros", 43 | "time", 44 | ] } 45 | toml = "0.4.2" 46 | tonic = "0.6.2" 47 | tracing = "0.1" 48 | tracing-subscriber = "0.3" 49 | 50 | [target.'cfg(not(target_env = "msvc"))'.dependencies] 51 | tikv-jemallocator = "0.4.3" 52 | 53 | [dev-dependencies] 54 | assert_matches = "1.5.0" 55 | env_logger = "*" 56 | test-log = "0.2.10" 57 | 58 | [features] 59 | tracing = ["runkv-common/tracing"] 60 | deadlock = [] 61 | verbose-release-log = ["tracing/release_max_level_trace"] 62 | -------------------------------------------------------------------------------- /wheel/src/components/command.rs: -------------------------------------------------------------------------------- 1 | use std::collections::hash_map::DefaultHasher; 2 | use std::hash::{Hash, Hasher}; 3 | 4 | use runkv_common::coding::BytesSerde; 5 | use runkv_proto::kv::KvRequest; 6 | use serde::{Deserialize, Serialize}; 7 | 8 | #[derive(Serialize, Deserialize, Clone, Debug)] 9 | pub enum Command { 10 | KvRequest { 11 | request_id: u64, 12 | sequence: u64, 13 | request: KvRequest, 14 | }, 15 | CompactRaftLog { 16 | index: u64, 17 | sequence: u64, 18 | }, 19 | } 20 | 21 | impl<'de> BytesSerde<'de> for Command {} 22 | 23 | impl Command { 24 | pub fn id(&self) -> u64 { 25 | match self { 26 | Self::KvRequest { request_id, .. } => *request_id, 27 | Self::CompactRaftLog { index, sequence } => { 28 | let mut hasher = DefaultHasher::default(); 29 | index.hash(&mut hasher); 30 | sequence.hash(&mut hasher); 31 | hasher.finish() 32 | } 33 | } 34 | } 35 | 36 | pub fn is_read_only(&self) -> bool { 37 | match self { 38 | Self::KvRequest { request, .. } => request.is_read_only(), 39 | Self::CompactRaftLog { .. 
} => false, 40 | } 41 | } 42 | } 43 | -------------------------------------------------------------------------------- /wheel/src/components/mod.rs: -------------------------------------------------------------------------------- 1 | pub mod command; 2 | pub mod fsm; 3 | pub mod lsm_tree; 4 | pub mod raft_log_store; 5 | pub mod raft_manager; 6 | pub mod raft_network; 7 | pub mod read_only_cmd_pool; 8 | -------------------------------------------------------------------------------- /wheel/src/components/read_only_cmd_pool.rs: -------------------------------------------------------------------------------- 1 | use std::collections::HashMap; 2 | use std::sync::Arc; 3 | 4 | use itertools::Itertools; 5 | use parking_lot::Mutex; 6 | use tracing::trace; 7 | 8 | use super::command::Command; 9 | 10 | struct ReadyItem { 11 | index: u64, 12 | cmds: Vec<Command>, 13 | } 14 | 15 | struct ReadOnlyCmdPoolCore { 16 | /// { id -> [cmd] } 17 | pending: Mutex<HashMap<u64, Vec<Command>>>, 18 | ready: Mutex<Vec<ReadyItem>>, 19 | } 20 | 21 | #[derive(Clone)] 22 | pub struct ReadOnlyCmdPool { 23 | core: Arc<ReadOnlyCmdPoolCore>, 24 | } 25 | 26 | impl Default for ReadOnlyCmdPool { 27 | fn default() -> Self { 28 | Self { 29 | core: Arc::new(ReadOnlyCmdPoolCore { 30 | pending: Mutex::new(HashMap::new()), 31 | ready: Mutex::new(vec![]), 32 | }), 33 | } 34 | } 35 | } 36 | 37 | impl ReadOnlyCmdPool { 38 | pub fn append(&self, id: u64, cmds: Vec<Command>) { 39 | assert!(self.core.pending.lock().insert(id, cmds).is_none()); 40 | } 41 | 42 | pub fn ready(&self, id: u64, index: u64) { 43 | let cmds = match self.core.pending.lock().remove(&id) { 44 | None => { 45 | trace!("no read-only cmds found at: {}", index); 46 | return; 47 | } 48 | Some(cmds) => cmds, 49 | }; 50 | let item = ReadyItem { index, cmds }; 51 | let mut ready = self.core.ready.lock(); 52 | if let Some(last) = ready.last() { 53 | assert!(last.index <= index); 54 | } 55 | ready.push(item); 56 | } 57 | 58 | pub fn split(&self, index: u64) -> Vec<Command> { 59 | let mut ready = self.core.ready.lock(); 60 | let p = ready.partition_point(|item| item.index <= index); 61 | let cmds = ready.drain(..p).flat_map(|item| item.cmds).collect_vec(); 62 | cmds 63 | } 64 | } 65 | 66 | #[cfg(test)] 67 | mod tests { 68 | use super::*; 69 | 70 | fn is_send_sync_clone<T: Send + Sync + Clone>() {} 71 | 72 | #[test] 73 | fn ensure_send_sync_clone() { 74 | is_send_sync_clone::<ReadOnlyCmdPool>(); 75 | } 76 | } 77 | -------------------------------------------------------------------------------- /wheel/src/config.rs: -------------------------------------------------------------------------------- 1 | use runkv_common::config::{ 2 | CacheConfig, LsmTreeConfig, MinioConfig, Node, PrometheusConfig, S3Config, 3 | }; 4 | use serde::Deserialize; 5 | 6 | #[derive(Deserialize, Clone, Debug)] 7 | pub struct WheelConfig { 8 | pub id: u64, 9 | pub host: String, 10 | pub port: u16, 11 | pub log: String, 12 | pub data_path: String, 13 | pub meta_path: String, 14 | pub poll_interval: String, 15 | pub heartbeat_interval: String, 16 | pub rudder: Node, 17 | pub s3: Option<S3Config>, 18 | pub minio: Option<MinioConfig>, 19 | pub buffer: BufferConfig, 20 | pub cache: CacheConfig, 21 | pub lsm_tree: LsmTreeConfig, 22 | pub raft_log_store: RaftLogStoreConfig, 23 | pub tiered_cache: TieredCacheConfig, 24 | pub prometheus: PrometheusConfig, 25 | } 26 | 27 | #[derive(Deserialize, Clone, Debug)] 28 | pub struct BufferConfig { 29 | pub write_buffer_capacity: String, 30 | } 31 | 32 | #[derive(Deserialize, Clone, Debug)] 33 | pub struct RaftLogStoreConfig { 34 | pub log_dir_path: String, 35 | pub log_file_capacity: String, 36 | pub block_cache_capacity:
String, 37 | pub persist: String, 38 | } 39 | 40 | #[derive(Deserialize, Clone, Debug, PartialEq, Eq)] 41 | #[serde(tag = "type", content = "args")] 42 | pub enum TieredCacheConfig { 43 | None, 44 | FileCache(FileCacheConfig), 45 | } 46 | 47 | #[derive(Deserialize, Clone, Debug, PartialEq, Eq)] 48 | pub struct FileCacheConfig { 49 | pub dir: String, 50 | pub capacity: String, 51 | pub total_buffer_capacity: String, 52 | pub cache_file_fallocate_unit: String, 53 | pub cache_meta_fallocate_unit: String, 54 | pub cache_file_max_write_size: String, 55 | } 56 | 57 | #[cfg(test)] 58 | mod tests { 59 | use super::{FileCacheConfig, TieredCacheConfig}; 60 | 61 | #[test] 62 | fn test_tiered_cache_config_parse() { 63 | let text = r#"type = "None""#; 64 | let config: TieredCacheConfig = toml::from_str(text).unwrap(); 65 | assert_eq!(TieredCacheConfig::None, config); 66 | 67 | let text = r#" 68 | type = "FileCache" 69 | [args] 70 | dir = "" 71 | capacity = "" 72 | total_buffer_capacity = "" 73 | cache_file_fallocate_unit = "" 74 | cache_meta_fallocate_unit = "" 75 | cache_file_max_write_size = "" 76 | "#; 77 | let config: TieredCacheConfig = toml::from_str(text).unwrap(); 78 | assert_eq!( 79 | TieredCacheConfig::FileCache(FileCacheConfig { 80 | dir: "".to_string(), 81 | capacity: "".to_string(), 82 | total_buffer_capacity: "".to_string(), 83 | cache_file_fallocate_unit: "".to_string(), 84 | cache_meta_fallocate_unit: "".to_string(), 85 | cache_file_max_write_size: "".to_string(), 86 | }), 87 | config 88 | ); 89 | } 90 | } 91 | -------------------------------------------------------------------------------- /wheel/src/error.rs: -------------------------------------------------------------------------------- 1 | use runkv_proto::meta::KeyRange; 2 | use tonic::Status; 3 | 4 | #[derive(thiserror::Error, Debug)] 5 | pub enum Error { 6 | #[error("config error: {0}")] 7 | ConfigError(String), 8 | #[error("storage error: {0}")] 9 | StorageError(#[from] runkv_storage::Error), 10 | #[error("transport error: {0}")] 11 | TransportError(#[from] tonic::transport::Error), 12 | #[error("rpc status error: {0}")] 13 | RpcStatus(#[from] Status), 14 | #[error("serde error: {0}")] 15 | SerdeError(String), 16 | #[error("raft error: {0}")] 17 | RaftError(#[from] raft::Error), 18 | #[error("raft manage error: {0}")] 19 | RaftManagerError(#[from] RaftManageError), 20 | #[error("meta error: {0}")] 21 | MetaError(#[from] MetaError), 22 | #[error("kv error: {0}")] 23 | KvError(#[from] KvError), 24 | #[error("other: {0}")] 25 | Other(String), 26 | } 27 | 28 | impl Error { 29 | pub fn err(e: impl Into>) -> Self { 30 | Self::Other(e.into().to_string()) 31 | } 32 | 33 | pub fn config_err(e: impl Into>) -> Self { 34 | Self::ConfigError(e.into().to_string()) 35 | } 36 | 37 | pub fn serde_err(e: impl Into>) -> Self { 38 | Self::SerdeError(e.into().to_string()) 39 | } 40 | } 41 | 42 | pub type Result = std::result::Result; 43 | 44 | #[derive(thiserror::Error, Debug)] 45 | pub enum RaftManageError { 46 | #[error("raft group already exists")] 47 | RaftGroupAlreadyExists(u64), 48 | #[error("raft group not exists")] 49 | RaftGroupNotExists(u64), 50 | #[error("raft node not exists: [raft node: {raft_node}] [node: {node}]")] 51 | RaftNodeNotExists { raft_node: u64, node: u64 }, 52 | #[error("raft node already exists: [group: {group}] [raft node: {raft_node}] [node: {node}]")] 53 | RaftNodeAlreadyExists { 54 | group: u64, 55 | raft_node: u64, 56 | node: u64, 57 | }, 58 | #[error("other: {0}")] 59 | Other(String), 60 | } 61 | 62 | impl 
RaftManageError { 63 | pub fn err(e: impl Into>) -> Self { 64 | Self::Other(e.into().to_string()) 65 | } 66 | } 67 | 68 | #[derive(thiserror::Error, Debug)] 69 | pub enum MetaError { 70 | #[error("key range overlaps: {r1:?} {r2:?}")] 71 | KeyRangeOverlaps { r1: KeyRange, r2: KeyRange }, 72 | } 73 | 74 | #[derive(thiserror::Error, Debug)] 75 | pub enum KvError { 76 | #[error("ops include invalid shard or ops cross multiple shards: {0}")] 77 | InvalidShard(String), 78 | #[error("no valid leader in raft group {0}")] 79 | NoValidLeader(u64), 80 | } 81 | -------------------------------------------------------------------------------- /wheel/src/main.rs: -------------------------------------------------------------------------------- 1 | #[cfg(not(target_env = "msvc"))] 2 | use tikv_jemallocator::Jemalloc; 3 | 4 | #[cfg(not(target_env = "msvc"))] 5 | #[global_allocator] 6 | static GLOBAL: Jemalloc = Jemalloc; 7 | 8 | use std::fs::read_to_string; 9 | 10 | use clap::Parser; 11 | use runkv_wheel::config::WheelConfig; 12 | use runkv_wheel::error::{Error, Result}; 13 | use runkv_wheel::{bootstrap_wheel, build_wheel}; 14 | use tracing::info; 15 | use tracing_subscriber::FmtSubscriber; 16 | 17 | #[derive(Parser, Debug)] 18 | struct Args { 19 | #[clap(short, long, default_value = "etc/wheel.toml")] 20 | config_file_path: String, 21 | } 22 | 23 | #[tokio::main] 24 | async fn main() -> Result<()> { 25 | let subscriber = FmtSubscriber::new(); 26 | tracing::subscriber::set_global_default(subscriber).map_err(Error::err)?; 27 | 28 | let args = Args::parse(); 29 | info!("args: {:?}", args); 30 | 31 | let config: WheelConfig = 32 | toml::from_str(&read_to_string(&args.config_file_path).map_err(Error::err)?) 33 | .map_err(Error::config_err)?; 34 | info!("config: {:?}", config); 35 | 36 | let (wheel, workers) = build_wheel(&config).await?; 37 | bootstrap_wheel(&config, wheel, workers).await 38 | } 39 | -------------------------------------------------------------------------------- /wheel/src/meta/mem.rs: -------------------------------------------------------------------------------- 1 | use std::collections::{BTreeMap, HashMap}; 2 | use std::sync::Arc; 3 | 4 | use async_trait::async_trait; 5 | use itertools::Itertools; 6 | use parking_lot::RwLock; 7 | use runkv_proto::meta::KeyRange; 8 | 9 | use super::{in_range, is_overlap, MetaStore}; 10 | use crate::error::{MetaError, Result}; 11 | 12 | type RaftStates = Arc>>>; 13 | 14 | #[derive(Default)] 15 | struct MemoryMetaStoreCore { 16 | /// `{ [start key .. 
end key) -> (group, [raft node 1, raft node 2, ..]) }` 17 | key_ranges: BTreeMap<KeyRange, (u64, Vec<u64>)>, 18 | } 19 | 20 | #[derive(Default)] 21 | pub struct MemoryMetaStore { 22 | inner: RwLock<MemoryMetaStoreCore>, 23 | 24 | raft_states: RaftStates, 25 | } 26 | 27 | impl MemoryMetaStore {} 28 | 29 | #[async_trait] 30 | impl MetaStore for MemoryMetaStore { 31 | async fn add_key_range( 32 | &self, 33 | key_range: KeyRange, 34 | group: u64, 35 | raft_nodes: &[u64], 36 | ) -> Result<()> { 37 | let mut guard = self.inner.write(); 38 | for r in guard.key_ranges.keys() { 39 | if is_overlap(r, &key_range) { 40 | return Err(MetaError::KeyRangeOverlaps { 41 | r1: r.to_owned(), 42 | r2: key_range, 43 | } 44 | .into()); 45 | } 46 | } 47 | guard 48 | .key_ranges 49 | .insert(key_range, (group, raft_nodes.to_vec())); 50 | Ok(()) 51 | } 52 | 53 | async fn key_ranges(&self) -> Result<Vec<KeyRange>> { 54 | let guard = self.inner.read(); 55 | Ok(guard.key_ranges.keys().cloned().collect_vec()) 56 | } 57 | 58 | async fn in_range(&self, key: &[u8]) -> Result<Option<(KeyRange, u64, Vec<u64>)>> { 59 | let guard = self.inner.read(); 60 | for (r, (group, raft_nodes)) in guard.key_ranges.iter() { 61 | if in_range(key, r) { 62 | return Ok(Some((r.to_owned(), *group, raft_nodes.to_owned()))); 63 | } 64 | } 65 | Ok(None) 66 | } 67 | 68 | async fn all_in_range(&self, keys: &[&[u8]]) -> Result<Option<(KeyRange, u64, Vec<u64>)>> { 69 | if keys.is_empty() { 70 | return Ok(None); 71 | } 72 | let guard = self.inner.read(); 73 | let mut result = None; 74 | for (r, (group, raft_nodes)) in guard.key_ranges.iter() { 75 | if in_range(keys[0], r) { 76 | result = Some((r.to_owned(), *group, raft_nodes.to_owned())); 77 | break; 78 | } 79 | } 80 | if result.is_none() { 81 | return Ok(None); 82 | } 83 | let (range, group, raft_nodes) = result.unwrap(); 84 | for key in &keys[1..] { 85 | if !in_range(key, &range) { 86 | return Ok(None); 87 | } 88 | } 89 | Ok(Some((range, group, raft_nodes))) 90 | } 91 | 92 | async fn update_raft_state( 93 | &self, 94 | raft_node: u64, 95 | raft_state: Option<raft::SoftState>, 96 | ) -> Result<()> { 97 | let mut raft_states = self.raft_states.write(); 98 | raft_states.insert(raft_node, raft_state); 99 | Ok(()) 100 | } 101 | 102 | async fn all_raft_states(&self) -> Result<HashMap<u64, Option<raft::SoftState>>> { 103 | Ok(self.raft_states.read().clone()) 104 | } 105 | 106 | async fn is_raft_leader(&self, raft_node: u64) -> Result<bool> { 107 | let raft_states = self.raft_states.read(); 108 | let is_leader = match raft_states.get(&raft_node) { 109 | None | Some(None) => false, 110 | Some(Some(ss)) => ss.raft_state == raft::StateRole::Leader, 111 | }; 112 | Ok(is_leader) 113 | } 114 | } 115 | -------------------------------------------------------------------------------- /wheel/src/meta/mod.rs: -------------------------------------------------------------------------------- 1 | use std::collections::HashMap; 2 | use std::sync::Arc; 3 | 4 | use async_trait::async_trait; 5 | use runkv_proto::meta::KeyRange; 6 | 7 | use crate::error::Result; 8 | 9 | pub mod mem; 10 | #[allow(dead_code)] 11 | pub mod object; 12 | 13 | #[async_trait] 14 | pub trait MetaStore: Send + Sync + 'static { 15 | async fn add_key_range( 16 | &self, 17 | key_range: KeyRange, 18 | group: u64, 19 | raft_nodes: &[u64], 20 | ) -> Result<()>; 21 | 22 | async fn key_ranges(&self) -> Result<Vec<KeyRange>>; 23 | 24 | async fn in_range(&self, key: &[u8]) -> Result<Option<(KeyRange, u64, Vec<u64>)>>; 25 | 26 | async fn all_in_range(&self, keys: &[&[u8]]) -> Result<Option<(KeyRange, u64, Vec<u64>)>>; 27 | 28 | async fn update_raft_state( 29 | &self, 30 | raft_node: u64, 31 | raft_state: Option<raft::SoftState>, 32 | ) -> Result<()>; 33 | 34 | async fn all_raft_states(&self) -> Result<HashMap<u64, Option<raft::SoftState>>>; 35 | 36 | async fn
is_raft_leader(&self, raft_node: u64) -> Result<bool>; 37 | } 38 | 39 | pub type MetaStoreRef = Arc<dyn MetaStore>; 40 | 41 | fn is_overlap(r1: &KeyRange, r2: &KeyRange) -> bool { 42 | !(r1.start_key > r2.end_key || r1.end_key < r2.start_key) 43 | } 44 | 45 | fn in_range(key: &[u8], range: &KeyRange) -> bool { 46 | key >= &range.start_key[..] && key < &range.end_key[..] 47 | } 48 | -------------------------------------------------------------------------------- /wheel/src/meta/object.rs: -------------------------------------------------------------------------------- 1 | // TODO: Impl me. 2 | // TODO: Necessary? 3 | -------------------------------------------------------------------------------- /wheel/src/trace.rs: -------------------------------------------------------------------------------- 1 | use lazy_static::lazy_static; 2 | use runkv_common::sharded_hash_map::ShardedHashMap; 3 | 4 | lazy_static! { 5 | pub static ref TRACE_CTX: TraceContext = TraceContext::default(); 6 | pub static ref TRACE_RAFT_LATENCY_HISTOGRAM_VEC: prometheus::HistogramVec = 7 | prometheus::register_histogram_vec!( 8 | "trace_raft_latency_histogram_vec", 9 | "trace raft latency histogram vec", 10 | &["op", "node", "group", "raft_node"], 11 | vec![0.001, 0.005, 0.01, 0.02, 0.05, 0.1, 0.2, 0.5, 1.0] 12 | ) 13 | .unwrap(); 14 | } 15 | 16 | #[derive(Default)] 17 | pub struct TraceContext { 18 | pub propose_ts: ShardedHashMap, 19 | } 20 | -------------------------------------------------------------------------------- /wheel/src/worker/heartbeater.rs: -------------------------------------------------------------------------------- 1 | use std::time::Duration; 2 | 3 | use async_trait::async_trait; 4 | use runkv_common::channel_pool::ChannelPool; 5 | use runkv_common::Worker; 6 | use runkv_proto::common::Endpoint; 7 | use runkv_proto::rudder::rudder_service_client::RudderServiceClient; 8 | use runkv_proto::rudder::{ 9 | heartbeat_request, heartbeat_response, HeartbeatRequest, RaftState, WheelHeartbeatRequest, 10 | }; 11 | use runkv_storage::manifest::{ManifestError, VersionManager}; 12 | use tonic::Request; 13 | use tracing::warn; 14 | 15 | use crate::error::{Error, Result}; 16 | use crate::meta::MetaStoreRef; 17 | 18 | pub struct HeartbeaterOptions { 19 | pub node: u64, 20 | pub rudder_node: u64, 21 | 22 | pub meta_store: MetaStoreRef, 23 | pub version_manager: VersionManager, 24 | pub channel_pool: ChannelPool, 25 | pub heartbeat_interval: Duration, 26 | pub endpoint: Endpoint, 27 | } 28 | 29 | /// [`Heartbeater`] is responsible for syncing the local version manager. 30 | pub struct Heartbeater { 31 | node: u64, 32 | rudder_node: u64, 33 | 34 | endpoint: Endpoint, 35 | heartbeat_interval: Duration, 36 | 37 | meta_store: MetaStoreRef, 38 | version_manager: VersionManager, 39 | channel_pool: ChannelPool, 40 | } 41 | 42 | #[async_trait] 43 | impl Worker for Heartbeater { 44 | async fn run(&mut self) -> anyhow::Result<()> { 45 | // TODO: Gracefully kill.
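// Each iteration of the loop below reports one heartbeat to the rudder node, applies any version diffs returned, then sleeps for `heartbeat_interval`; errors are logged rather than propagated so the worker keeps running.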
46 | loop { 47 | match self.run_inner().await { 48 | Ok(_) => {} 49 | Err(e) => warn!("error occur when heartbeater running: {}", e), 50 | } 51 | } 52 | } 53 | } 54 | 55 | impl Heartbeater { 56 | pub fn new(options: HeartbeaterOptions) -> Self { 57 | Self { 58 | node: options.node, 59 | rudder_node: options.rudder_node, 60 | 61 | endpoint: options.endpoint, 62 | heartbeat_interval: options.heartbeat_interval, 63 | 64 | version_manager: options.version_manager, 65 | meta_store: options.meta_store, 66 | channel_pool: options.channel_pool, 67 | } 68 | } 69 | 70 | async fn run_inner(&mut self) -> Result<()> { 71 | let raft_states = self.meta_store.all_raft_states().await?; 72 | let raft_states = raft_states 73 | .into_iter() 74 | .map(|(raft_node, ss)| { 75 | ( 76 | raft_node, 77 | RaftState { 78 | is_leader: match ss { 79 | Some(ss) => ss.raft_state == raft::StateRole::Leader, 80 | None => false, 81 | }, 82 | }, 83 | ) 84 | }) 85 | .collect(); 86 | 87 | let request = Request::new(HeartbeatRequest { 88 | node_id: self.node, 89 | endpoint: Some(self.endpoint.clone()), 90 | heartbeat_message: Some(heartbeat_request::HeartbeatMessage::WheelHeartbeat( 91 | WheelHeartbeatRequest { 92 | watermark: self.version_manager.watermark().await, 93 | next_version_id: self.version_manager.latest_version_id().await + 1, 94 | raft_states, 95 | }, 96 | )), 97 | }); 98 | 99 | let mut client = RudderServiceClient::new( 100 | self.channel_pool 101 | .get(self.rudder_node) 102 | .await 103 | .map_err(Error::err)?, 104 | ); 105 | let rsp = client.heartbeat(request).await?.into_inner(); 106 | 107 | let hb = match rsp.heartbeat_message.unwrap() { 108 | heartbeat_response::HeartbeatMessage::WheelHeartbeat(hb) => hb, 109 | _ => unreachable!(), 110 | }; 111 | for version_diff in hb.version_diffs { 112 | if let Err(runkv_storage::Error::ManifestError(ManifestError::VersionDiffIdNotMatch( 113 | old, 114 | new, 115 | ))) = self.version_manager.update(version_diff, true).await 116 | { 117 | warn!( 118 | "version diff id not match, skip: [old: {}] [new: {}]", 119 | old, new 120 | ); 121 | } 122 | } 123 | tokio::time::sleep(self.heartbeat_interval).await; 124 | Ok(()) 125 | } 126 | } 127 | -------------------------------------------------------------------------------- /wheel/src/worker/mod.rs: -------------------------------------------------------------------------------- 1 | pub mod heartbeater; 2 | pub mod raft; 3 | pub mod sstable_uploader; 4 | --------------------------------------------------------------------------------