├── .cargo └── config.toml ├── .github ├── pull_request_template.md ├── semantic.yml ├── template │ ├── generate.sh │ ├── main-override.yml │ ├── pr-override.yml │ └── template.yml └── workflows │ ├── main.yml │ └── pull-request.yml ├── .gitignore ├── Cargo.lock ├── Cargo.toml ├── LICENSE ├── Makefile ├── Makefile.toml ├── README.md ├── bench ├── Cargo.toml ├── bench_kv │ └── main.rs └── etc │ ├── exhauster.toml │ ├── lsm_tree.toml │ ├── rudder.toml │ └── wheel.toml ├── client ├── Cargo.toml └── src │ ├── client.rs │ ├── error.rs │ ├── lib.rs │ ├── router.rs │ └── worker │ ├── heartbeater.rs │ └── mod.rs ├── codecov.yml ├── common ├── Cargo.toml ├── benches │ └── bench_sharded_hash_map.rs └── src │ ├── atomic.rs │ ├── channel_pool.rs │ ├── coding.rs │ ├── config.rs │ ├── context.rs │ ├── lib.rs │ ├── log.rs │ ├── notify_pool.rs │ ├── packer.rs │ ├── prometheus.rs │ ├── sharded_hash_map.rs │ ├── sync.rs │ ├── time.rs │ └── tracing_slog_drain.rs ├── etc ├── exhauster.toml ├── grafana-dashboards │ └── runkv-overview.json ├── grafana-provisioning │ ├── dashboards │ │ └── runkv-dashboards.yml │ └── datasources │ │ └── runkv-prometheus.yml ├── grafana.ini ├── lsm_tree.toml ├── prometheus.yml ├── rudder.toml └── wheel.toml ├── exhauster ├── Cargo.toml └── src │ ├── compaction_filter.rs │ ├── config.rs │ ├── error.rs │ ├── lib.rs │ ├── main.rs │ ├── partitioner.rs │ ├── service.rs │ └── worker │ ├── heartbeater.rs │ └── mod.rs ├── make ├── common.toml ├── grafana.toml ├── jaeger.toml ├── minio.toml └── prometheus.toml ├── proto ├── Cargo.toml ├── build.rs └── src │ ├── lib.rs │ └── proto │ ├── buf.yaml │ ├── common.proto │ ├── exhauster.proto │ ├── kv.proto │ ├── manifest.proto │ ├── meta.proto │ ├── prototool.yaml │ ├── rudder.proto │ └── wheel.proto ├── rudder ├── Cargo.toml └── src │ ├── config.rs │ ├── error.rs │ ├── lib.rs │ ├── main.rs │ ├── meta │ ├── mem.rs │ ├── mod.rs │ └── object.rs │ ├── service.rs │ └── worker │ ├── compaction_detector.rs │ └── mod.rs ├── run ├── rust-toolchain ├── rustfmt.toml ├── storage ├── Cargo.toml ├── bench │ ├── bench_raft_log_store │ │ └── main.rs │ └── file_cache_bench │ │ ├── README.md │ │ ├── analyze.rs │ │ ├── bench.rs │ │ ├── bpf.rs │ │ ├── main.rs │ │ ├── rate.rs │ │ └── utils.rs ├── benches │ ├── bench_block_iter.rs │ └── bench_compression.rs └── src │ ├── error.rs │ ├── lib.rs │ ├── lsm_tree │ ├── components │ │ ├── block.rs │ │ ├── block_cache.rs │ │ ├── memtable.rs │ │ ├── metrics.rs │ │ ├── mod.rs │ │ ├── skiplist │ │ │ ├── arena.rs │ │ │ ├── key.rs │ │ │ ├── list.rs │ │ │ └── mod.rs │ │ ├── sstable.rs │ │ └── sstable_store.rs │ ├── iterator │ │ ├── block_iterator.rs │ │ ├── concat_iterator.rs │ │ ├── memtable_iterator.rs │ │ ├── merge_iterator.rs │ │ ├── mod.rs │ │ ├── sstable_iterator.rs │ │ └── user_key_iterator.rs │ ├── manifest │ │ ├── error.rs │ │ ├── mod.rs │ │ └── version.rs │ └── mod.rs │ ├── object_store │ ├── mem.rs │ ├── mod.rs │ └── s3.rs │ ├── raft_log_store │ ├── block_cache.rs │ ├── entry.rs │ ├── error.rs │ ├── file.rs │ ├── log.rs │ ├── mem.rs │ ├── metrics.rs │ ├── mod.rs │ ├── queue.rs │ └── store.rs │ ├── tiered_cache │ ├── file_cache │ │ ├── alloc.rs │ │ ├── buffer.rs │ │ ├── cache.rs │ │ ├── error.rs │ │ ├── file.rs │ │ ├── meta.rs │ │ ├── metrics.rs │ │ ├── mod.rs │ │ ├── store.rs │ │ ├── test_utils.rs │ │ └── utils.rs │ └── mod.rs │ └── utils │ ├── bloom.rs │ ├── coding.rs │ ├── lru_cache.rs │ └── mod.rs ├── tests ├── Cargo.toml ├── etc │ ├── exhauster.toml │ ├── lsm_tree.toml │ ├── port.toml │ ├── rudder.toml │ └── 
wheel.toml ├── integrations │ ├── lib.rs │ ├── test_concurrent_put_get.rs │ └── test_multi_raft_group_concurrent_put_get.rs └── src │ └── lib.rs └── wheel ├── Cargo.toml └── src ├── components ├── command.rs ├── fsm.rs ├── lsm_tree.rs ├── mod.rs ├── raft_log_store.rs ├── raft_manager.rs ├── raft_network.rs └── read_only_cmd_pool.rs ├── config.rs ├── error.rs ├── lib.rs ├── main.rs ├── meta ├── mem.rs ├── mod.rs └── object.rs ├── service.rs ├── trace.rs └── worker ├── heartbeater.rs ├── mod.rs ├── raft.rs └── sstable_uploader.rs /.cargo/config.toml: -------------------------------------------------------------------------------- 1 | # Flags for all targets. 2 | [target.'cfg(all())'] 3 | rustflags = ["--cfg", "tokio_unstable"] 4 | -------------------------------------------------------------------------------- /.github/pull_request_template.md: -------------------------------------------------------------------------------- 1 | ## What changes were proposed in this pull request? 2 | 3 | (Please fill in changes proposed in this fix) 4 | 5 | ## Which issues is this PR related to? 6 | 7 | (Please list the issues related. Hint: use markdown list for better looking) 8 | 9 | ## How was this patch tested? 10 | 11 | (Please explain how this patch was tested. E.g. unit tests, integration tests) 12 | (If this patch involves UI changes, please attach a screen-shot; otherwise, remove this) 13 | 14 | ## Will this help MrCroxx run or graduate? 15 | 16 | (Run is OK, but better say graduate) 17 | -------------------------------------------------------------------------------- /.github/semantic.yml: -------------------------------------------------------------------------------- 1 | # Ref: https://github.com/zeke/semantic-pull-requests#configuration . 2 | titleAndCommits: true 3 | anyCommit: true 4 | types: 5 | - feat 6 | - fix 7 | - docs 8 | - style 9 | - refactor 10 | - perf 11 | - test 12 | - build 13 | - ci 14 | - chore 15 | - revert 16 | allowMergeCommits: true 17 | allowMergeCommits: true -------------------------------------------------------------------------------- /.github/template/generate.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e 4 | 5 | DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" 6 | cd "$DIR" 7 | 8 | # You will need to install yq >= 4.16 to use this tool. 9 | # brew install yq 10 | 11 | HEADER=""" 12 | # ================= THIS FILE IS AUTOMATICALLY GENERATED ================= 13 | # 14 | # Please run generate.sh and commit after editing the workflow templates. 15 | # 16 | # ======================================================================== 17 | """ 18 | 19 | # Generate workflow for main branch 20 | echo "$HEADER" > ../workflows/main.yml 21 | # shellcheck disable=SC2016 22 | yq ea '. as $item ireduce ({}; . * $item )' template.yml main-override.yml | yq eval '... comments=""' - >> ../workflows/main.yml 23 | echo "$HEADER" >> ../workflows/main.yml 24 | 25 | # Generate workflow for pull requests 26 | echo "$HEADER" > ../workflows/pull-request.yml 27 | # shellcheck disable=SC2016 28 | yq ea '. as $item ireduce ({}; . * $item )' template.yml pr-override.yml | yq eval '... comments=""' - >> ../workflows/pull-request.yml 29 | echo "$HEADER" >> ../workflows/pull-request.yml 30 | 31 | if [ "$1" == "--check" ] ; then 32 | if ! git diff --exit-code; then 33 | echo "Please run generate.sh and commit after editing the workflow templates." 
34 | exit 1 35 | fi 36 | fi 37 | -------------------------------------------------------------------------------- /.github/template/main-override.yml: -------------------------------------------------------------------------------- 1 | name: CI (main) 2 | 3 | on: 4 | push: 5 | branches: [main] 6 | workflow_dispatch: 7 | -------------------------------------------------------------------------------- /.github/template/pr-override.yml: -------------------------------------------------------------------------------- 1 | name: CI 2 | 3 | on: 4 | pull_request: 5 | branches: [main] 6 | 7 | concurrency: 8 | group: environment-${{ github.ref }} 9 | cancel-in-progress: true 10 | -------------------------------------------------------------------------------- /.github/template/template.yml: -------------------------------------------------------------------------------- 1 | name: 2 | 3 | on: 4 | 5 | env: 6 | RUST_TOOLCHAIN: nightly-2022-10-16 7 | CARGO_TERM_COLOR: always 8 | CACHE_KEY_SUFFIX: 20221107 9 | RUNKV_CI: true 10 | 11 | jobs: 12 | misc-check: 13 | name: misc check 14 | runs-on: ubuntu-latest 15 | steps: 16 | - name: Checkout 17 | uses: actions/checkout@v3 18 | - name: Install tools 19 | run: | 20 | wget https://github.com/mikefarah/yq/releases/download/${YQ_VERSION}/${BINARY}.tar.gz -O - | tar xz && sudo mv ${BINARY} /usr/bin/yq 21 | sudo apt install -y protobuf-compiler -o Acquire::Retries=3 22 | curl -sSL \ 23 | https://github.com/uber/prototool/releases/download/v1.8.0/prototool-$(uname -s)-$(uname -m).tar.gz | \ 24 | sudo tar -C /usr/local --strip-components 1 -xz 25 | curl -sSL \ 26 | "https://github.com/bufbuild/buf/releases/download/v${BUF_VERSION}/buf-$(uname -s)-$(uname -m).tar.gz" | \ 27 | sudo tar -xvzf - -C /usr/local --strip-components 1 28 | env: 29 | YQ_VERSION: v4.16.1 30 | BINARY: yq_linux_amd64 31 | BUF_VERSION: 1.0.0-rc6 32 | - name: Check if CI workflows are up-to-date 33 | run: | 34 | ./.github/template/generate.sh --check 35 | - name: Run ShellCheck 36 | uses: ludeeus/action-shellcheck@master 37 | - name: Check protobuf style 38 | run: | 39 | cd proto/src/proto && prototool format -d && buf lint 40 | rust-test: 41 | name: rust test with codecov 42 | runs-on: ubuntu-latest 43 | steps: 44 | - name: Checkout 45 | uses: actions/checkout@v3 46 | - name: Install rust toolchain@v1 47 | uses: actions-rs/toolchain@v1 48 | with: 49 | toolchain: ${{ env.RUST_TOOLCHAIN }} 50 | components: rustfmt, clippy, llvm-tools-preview 51 | - name: Cache Cargo home 52 | uses: actions/cache@v2 53 | id: cache 54 | with: 55 | path: | 56 | ~/.cargo/bin/ 57 | ~/.cargo/registry/index/ 58 | ~/.cargo/registry/cache/ 59 | ~/.cargo/git/db/ 60 | key: ${{ runner.os }}-cargo-${{ hashFiles('**/Cargo.lock') }}-${{ env.CACHE_KEY_SUFFIX }} 61 | - name: Install cargo-sort 62 | if: steps.cache.outputs.cache-hit != 'true' 63 | run: | 64 | cargo install cargo-sort 65 | - name: Run rust cargo-sort check 66 | run: | 67 | cargo sort -w -c 68 | - name: Run rust format check 69 | run: | 70 | cargo fmt --all -- --check 71 | - name: Run rust clippy check 72 | run: | 73 | # If new CI checks are added, the one with `--locked` must be run first. 
74 | cargo clippy --all-targets --locked -- -D warnings 75 | - if: steps.cache.outputs.cache-hit != 'true' 76 | uses: taiki-e/install-action@cargo-llvm-cov 77 | - if: steps.cache.outputs.cache-hit != 'true' 78 | uses: taiki-e/install-action@nextest 79 | - name: Run rust test with coverage 80 | run: | 81 | cargo llvm-cov nextest --lcov --output-path lcov.info 82 | - uses: codecov/codecov-action@v2 83 | -------------------------------------------------------------------------------- /.github/workflows/main.yml: -------------------------------------------------------------------------------- 1 | 2 | # ================= THIS FILE IS AUTOMATICALLY GENERATED ================= 3 | # 4 | # Please run generate.sh and commit after editing the workflow templates. 5 | # 6 | # ======================================================================== 7 | 8 | name: CI (main) 9 | on: 10 | push: 11 | branches: [main] 12 | workflow_dispatch: 13 | env: 14 | RUST_TOOLCHAIN: nightly-2022-10-16 15 | CARGO_TERM_COLOR: always 16 | CACHE_KEY_SUFFIX: 20221107 17 | RUNKV_CI: true 18 | jobs: 19 | misc-check: 20 | name: misc check 21 | runs-on: ubuntu-latest 22 | steps: 23 | - name: Checkout 24 | uses: actions/checkout@v3 25 | - name: Install tools 26 | run: | 27 | wget https://github.com/mikefarah/yq/releases/download/${YQ_VERSION}/${BINARY}.tar.gz -O - | tar xz && sudo mv ${BINARY} /usr/bin/yq 28 | sudo apt install -y protobuf-compiler -o Acquire::Retries=3 29 | curl -sSL \ 30 | https://github.com/uber/prototool/releases/download/v1.8.0/prototool-$(uname -s)-$(uname -m).tar.gz | \ 31 | sudo tar -C /usr/local --strip-components 1 -xz 32 | curl -sSL \ 33 | "https://github.com/bufbuild/buf/releases/download/v${BUF_VERSION}/buf-$(uname -s)-$(uname -m).tar.gz" | \ 34 | sudo tar -xvzf - -C /usr/local --strip-components 1 35 | env: 36 | YQ_VERSION: v4.16.1 37 | BINARY: yq_linux_amd64 38 | BUF_VERSION: 1.0.0-rc6 39 | - name: Check if CI workflows are up-to-date 40 | run: | 41 | ./.github/template/generate.sh --check 42 | - name: Run ShellCheck 43 | uses: ludeeus/action-shellcheck@master 44 | - name: Check protobuf style 45 | run: "cd proto/src/proto && prototool format -d && buf lint \n" 46 | rust-test: 47 | name: rust test with codecov 48 | runs-on: ubuntu-latest 49 | steps: 50 | - name: Checkout 51 | uses: actions/checkout@v3 52 | - name: Install rust toolchain@v1 53 | uses: actions-rs/toolchain@v1 54 | with: 55 | toolchain: ${{ env.RUST_TOOLCHAIN }} 56 | components: rustfmt, clippy, llvm-tools-preview 57 | - name: Cache Cargo home 58 | uses: actions/cache@v2 59 | id: cache 60 | with: 61 | path: | 62 | ~/.cargo/bin/ 63 | ~/.cargo/registry/index/ 64 | ~/.cargo/registry/cache/ 65 | ~/.cargo/git/db/ 66 | key: ${{ runner.os }}-cargo-${{ hashFiles('**/Cargo.lock') }}-${{ env.CACHE_KEY_SUFFIX }} 67 | - name: Install cargo-sort 68 | if: steps.cache.outputs.cache-hit != 'true' 69 | run: | 70 | cargo install cargo-sort 71 | - name: Run rust cargo-sort check 72 | run: | 73 | cargo sort -w -c 74 | - name: Run rust format check 75 | run: | 76 | cargo fmt --all -- --check 77 | - name: Run rust clippy check 78 | run: | 79 | # If new CI checks are added, the one with `--locked` must be run first. 
80 | cargo clippy --all-targets --locked -- -D warnings 81 | - if: steps.cache.outputs.cache-hit != 'true' 82 | uses: taiki-e/install-action@cargo-llvm-cov 83 | - if: steps.cache.outputs.cache-hit != 'true' 84 | uses: taiki-e/install-action@nextest 85 | - name: Run rust test with coverage 86 | run: | 87 | cargo llvm-cov nextest --lcov --output-path lcov.info 88 | - uses: codecov/codecov-action@v2 89 | 90 | # ================= THIS FILE IS AUTOMATICALLY GENERATED ================= 91 | # 92 | # Please run generate.sh and commit after editing the workflow templates. 93 | # 94 | # ======================================================================== 95 | 96 | -------------------------------------------------------------------------------- /.github/workflows/pull-request.yml: -------------------------------------------------------------------------------- 1 | 2 | # ================= THIS FILE IS AUTOMATICALLY GENERATED ================= 3 | # 4 | # Please run generate.sh and commit after editing the workflow templates. 5 | # 6 | # ======================================================================== 7 | 8 | name: CI 9 | on: 10 | pull_request: 11 | branches: [main] 12 | env: 13 | RUST_TOOLCHAIN: nightly-2022-10-16 14 | CARGO_TERM_COLOR: always 15 | CACHE_KEY_SUFFIX: 20221107 16 | RUNKV_CI: true 17 | jobs: 18 | misc-check: 19 | name: misc check 20 | runs-on: ubuntu-latest 21 | steps: 22 | - name: Checkout 23 | uses: actions/checkout@v3 24 | - name: Install tools 25 | run: | 26 | wget https://github.com/mikefarah/yq/releases/download/${YQ_VERSION}/${BINARY}.tar.gz -O - | tar xz && sudo mv ${BINARY} /usr/bin/yq 27 | sudo apt install -y protobuf-compiler -o Acquire::Retries=3 28 | curl -sSL \ 29 | https://github.com/uber/prototool/releases/download/v1.8.0/prototool-$(uname -s)-$(uname -m).tar.gz | \ 30 | sudo tar -C /usr/local --strip-components 1 -xz 31 | curl -sSL \ 32 | "https://github.com/bufbuild/buf/releases/download/v${BUF_VERSION}/buf-$(uname -s)-$(uname -m).tar.gz" | \ 33 | sudo tar -xvzf - -C /usr/local --strip-components 1 34 | env: 35 | YQ_VERSION: v4.16.1 36 | BINARY: yq_linux_amd64 37 | BUF_VERSION: 1.0.0-rc6 38 | - name: Check if CI workflows are up-to-date 39 | run: | 40 | ./.github/template/generate.sh --check 41 | - name: Run ShellCheck 42 | uses: ludeeus/action-shellcheck@master 43 | - name: Check protobuf style 44 | run: "cd proto/src/proto && prototool format -d && buf lint \n" 45 | rust-test: 46 | name: rust test with codecov 47 | runs-on: ubuntu-latest 48 | steps: 49 | - name: Checkout 50 | uses: actions/checkout@v3 51 | - name: Install rust toolchain@v1 52 | uses: actions-rs/toolchain@v1 53 | with: 54 | toolchain: ${{ env.RUST_TOOLCHAIN }} 55 | components: rustfmt, clippy, llvm-tools-preview 56 | - name: Cache Cargo home 57 | uses: actions/cache@v2 58 | id: cache 59 | with: 60 | path: | 61 | ~/.cargo/bin/ 62 | ~/.cargo/registry/index/ 63 | ~/.cargo/registry/cache/ 64 | ~/.cargo/git/db/ 65 | key: ${{ runner.os }}-cargo-${{ hashFiles('**/Cargo.lock') }}-${{ env.CACHE_KEY_SUFFIX }} 66 | - name: Install cargo-sort 67 | if: steps.cache.outputs.cache-hit != 'true' 68 | run: | 69 | cargo install cargo-sort 70 | - name: Run rust cargo-sort check 71 | run: | 72 | cargo sort -w -c 73 | - name: Run rust format check 74 | run: | 75 | cargo fmt --all -- --check 76 | - name: Run rust clippy check 77 | run: | 78 | # If new CI checks are added, the one with `--locked` must be run first. 
79 | cargo clippy --all-targets --locked -- -D warnings 80 | - if: steps.cache.outputs.cache-hit != 'true' 81 | uses: taiki-e/install-action@cargo-llvm-cov 82 | - if: steps.cache.outputs.cache-hit != 'true' 83 | uses: taiki-e/install-action@nextest 84 | - name: Run rust test with coverage 85 | run: | 86 | cargo llvm-cov nextest --lcov --output-path lcov.info 87 | - uses: codecov/codecov-action@v2 88 | concurrency: 89 | group: environment-${{ github.ref }} 90 | cancel-in-progress: true 91 | 92 | # ================= THIS FILE IS AUTOMATICALLY GENERATED ================= 93 | # 94 | # Please run generate.sh and commit after editing the workflow templates. 95 | # 96 | # ======================================================================== 97 | 98 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | /target 2 | 3 | .vscode 4 | .idea 5 | 6 | /.run 7 | /.bin 8 | 9 | .DS_Store 10 | 11 | perf.data* 12 | flamegraph.svg 13 | 14 | *.log 15 | -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [workspace] 2 | members = [ 3 | "bench", 4 | "client", 5 | "common", 6 | "exhauster", 7 | "proto", 8 | "rudder", 9 | "storage", 10 | "tests", 11 | "wheel", 12 | ] 13 | 14 | [profile.bench] 15 | debug = true 16 | 17 | [profile.release] 18 | debug = true 19 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright 2022 MrCroxx 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 4 | 5 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 6 | 7 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
1 | SHELL := /bin/bash
2 | .PHONY: proto
3 |
4 | fmt:
5 | cargo sort -w && cargo fmt --all && cargo clippy --all-targets --all-features && cargo clippy --all-targets
6 |
7 | fmt_check:
8 | cargo sort -c -w && cargo fmt --all -- --check && cargo clippy --all-targets --all-features --locked -- -D warnings && cargo clippy --all-targets --locked -- -D warnings
9 |
10 | clean:
11 | cargo clean
12 |
13 | check:
14 | cargo check --tests
15 |
16 | test:
17 | cargo nextest run --features deadlock
18 |
19 | proto:
20 | cd proto/src/proto && prototool format -w && buf lint
21 |
22 | proto_check:
23 | cd proto/src/proto && prototool format -d && buf lint
24 |
25 | update_ci:
26 | cd .github/template && ./generate.sh
27 |
28 | bench_kv:
29 | RUNKV_METRICS=true RUST_BACKTRACE=1 cargo run --release --package runkv-bench --bin bench_kv
--------------------------------------------------------------------------------
/Makefile.toml:
--------------------------------------------------------------------------------
1 | extend = [
2 | { path = "make/common.toml" },
3 | { path = "make/minio.toml" },
4 | { path = "make/jaeger.toml" },
5 | { path = "make/prometheus.toml" },
6 | { path = "make/grafana.toml" },
7 | ]
8 |
9 | [env]
10 | RUST_BACKTRACE = 1
11 | CARGO_MAKE_EXTEND_WORKSPACE_MAKEFILE = true
12 |
13 | [config]
14 | skip_core_tasks = true
15 |
16 | [tasks.clean-all]
17 | category = "Misc"
18 | description = "Clean all downloaded binaries by deleting the .run folder."
19 | dependencies = ["prepare"]
20 | script = '''
21 | #!@duckscript
22 | rm -rf "${PREFIX}"
23 | '''
24 |
25 | [tasks.clean-data]
26 | category = "Misc"
27 | description = "Clean data by deleting files in the .run/data and .run/log folders."
28 | dependencies = ["prepare"]
29 | script = '''
30 | #!/bin/bash
31 | set -e
32 | rm -rf "${PREFIX_DATA}"
33 | rm -rf "${PREFIX_LOG}"
34 | '''
35 |
36 | [tasks.d]
37 | alias = "dev"
38 |
39 | [tasks.dev]
40 | category = "Develop"
41 | description = "Start development environment."
42 | dependencies = ["pre-dev", "setup-minio"]
43 | script = '''
44 | #!@duckscript
45 | echo success
46 | '''
47 |
48 | [tasks.pre-dev]
49 | category = "Develop"
50 | description = "Download necessary tools and build required components."
51 | dependencies = [
52 | "clean-data",
53 | "download-minio",
54 | "download-mcli",
55 | "download-jaeger",
56 | "download-prometheus",
57 | "download-grafana",
58 | ]
59 | script = '''
60 | #!/bin/bash
61 | # run minio
62 | tmux new -d -s runkv-minio ${PREFIX_BIN}/minio server --address 0.0.0.0:9000 --console-address 0.0.0.0:9090 ${PREFIX_DATA}
63 | # run jaeger
64 | tmux new -d -s runkv-jaeger ${PREFIX_BIN}/jaeger
65 | # run prometheus
66 | tmux new -d -s runkv-prometheus ${PREFIX_BIN}/prometheus/prometheus --config.file=etc/prometheus.yml --web.listen-address=0.0.0.0:9091 --storage.tsdb.path=${PREFIX_DATA}/prometheus-data
67 | # run grafana
68 | tmux new -d -s runkv-grafana ${PREFIX_BIN}/grafana/bin/grafana-server -homepath .run/bin/grafana -config etc/grafana.ini
69 | sleep 2
70 | '''
71 |
72 | [tasks.setup-minio]
73 | category = "Misc"
74 | description = "Set up a local minio alias and test bucket with mcli."
75 | dependencies = ["prepare"]
76 | script = '''
77 | #!/bin/bash
78 | set -e
79 | ${PREFIX_BIN}/mcli alias set local http://127.0.0.1:9000 minioadmin minioadmin
80 | ${PREFIX_BIN}/mcli mb --quiet local/test
81 | '''
82 |
83 | [tasks.k]
84 | alias = "kill"
85 |
86 | [tasks.kill]
87 | category = "Develop"
88 | description = "Kill development environment."
89 | script = '''
90 | #!/bin/bash
91 | tmux send-keys -t runkv-minio C-c
92 | tmux send-keys -t runkv-jaeger C-c
93 | tmux send-keys -t runkv-prometheus C-c
94 | tmux send-keys -t runkv-grafana C-c
95 | sleep 2
96 | '''
97 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # RunKV
2 |
3 | ![main](https://github.com/MrCroxx/RunKV/actions/workflows/main.yml/badge.svg)
4 | [![codecov](https://codecov.io/gh/MrCroxx/RunKV/branch/main/graph/badge.svg?token=LKT7JWROUW)](https://codecov.io/gh/MrCroxx/RunKV)
5 |
6 | **Still a work in progress.**
7 |
8 | **TOY ONLY!**
9 |
10 | RunKV is an experimental key-value storage engine for OLTP workloads based on S3 and EBS. The goal is to reduce storage costs while keeping the performance penalty tolerable.
11 |
12 | *This is my master's graduation project. Better gonna run, or I must run.*
13 |
--------------------------------------------------------------------------------
/bench/Cargo.toml:
--------------------------------------------------------------------------------
1 | [package]
2 | name = "runkv-bench"
3 | version = "0.1.0"
4 | edition = "2021"
5 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
6 |
7 | [dependencies]
8 | anyhow = "1.0"
9 | bytes = "1"
10 | bytesize = "1.1.0"
11 | clap = { version = "3.1.6", features = ["derive"] }
12 | env_logger = "*"
13 | futures = "0.3"
14 | itertools = "0.10.3"
15 | lazy_static = "1.4.0"
16 | rand = "0.8.5"
17 | runkv-client = { path = "../client" }
18 | runkv-common = { path = "../common" }
19 | runkv-exhauster = { path = "../exhauster" }
20 | runkv-proto = { path = "../proto" }
21 | runkv-rudder = { path = "../rudder" }
22 | runkv-storage = { path = "../storage" }
23 | runkv-tests = { path = "../tests" }
24 | runkv-wheel = { path = "../wheel" }
25 | tempfile = "3"
26 | test-log = "0.2.10"
27 | tokio = { version = "1", features = [
28 | "rt-multi-thread",
29 | "sync",
30 | "macros",
31 | "time",
32 | "tracing",
33 | ] }
34 | toml = "0.4.2"
35 | tonic = "0.6.2"
36 | tracing = "0.1"
37 |
38 | [target.'cfg(not(target_env = "msvc"))'.dependencies]
39 | tikv-jemallocator = "0.4.3"
40 |
41 | [features]
42 | tracing = ["runkv-wheel/tracing"]
43 | deadlock = [
44 | "runkv-tests/deadlock",
45 | "runkv-storage/deadlock",
46 | "runkv-wheel/deadlock",
47 | ]
48 | console = ["tokio/tracing", "runkv-common/console"]
49 | trace-notify-pool = ["runkv-common/trace-notify-pool"]
50 | verbose-release-log = [
51 | "tracing/release_max_level_trace",
52 | "runkv-common/verbose-release-log",
53 | "runkv-exhauster/verbose-release-log",
54 | "runkv-rudder/verbose-release-log",
55 | "runkv-storage/verbose-release-log",
56 | "runkv-tests/verbose-release-log",
57 | "runkv-wheel/verbose-release-log",
58 | ]
59 |
60 | [[bin]]
61 | name = "bench_kv"
62 | path = "bench_kv/main.rs"
63 |
--------------------------------------------------------------------------------
/bench/bench_kv/main.rs:
--------------------------------------------------------------------------------
1 | #[cfg(not(target_env = "msvc"))]
2 | use
tikv_jemallocator::Jemalloc; 3 | 4 | #[cfg(not(target_env = "msvc"))] 5 | #[global_allocator] 6 | static GLOBAL: Jemalloc = Jemalloc; 7 | 8 | use clap::Parser; 9 | use runkv_tests::{run, Args, Options}; 10 | 11 | const RUDDER_CONFIG_PATH: &str = "bench/etc/rudder.toml"; 12 | const WHEEL_CONFIG_PATH: &str = "bench/etc/wheel.toml"; 13 | const EXHAUSTER_CONFIG_PATH: &str = "bench/etc/exhauster.toml"; 14 | const LSM_TREE_CONFIG_PATH: &str = "bench/etc/lsm_tree.toml"; 15 | 16 | const RUDDER_NODE_ID: u64 = 10000; 17 | const WHEEL_NODE_ID_BASE: u64 = 0; 18 | const EXHAUSTER_NODE_ID_BASE: u64 = 100; 19 | 20 | const RUDDER_PORT: u16 = 12300; 21 | const WHEEL_PORT_BASE: u16 = 12300; 22 | const WHEEL_PROMETHEUS_PORT_BASE: u16 = 9890; 23 | const EXHAUSTER_PORT_BASE: u16 = 12400; 24 | 25 | #[tokio::main] 26 | async fn main() { 27 | let args = Args::parse(); 28 | println!("{:#?}", args); 29 | 30 | let options = Options { 31 | log: true, 32 | rudder_config_path: RUDDER_CONFIG_PATH.to_string(), 33 | wheel_config_path: WHEEL_CONFIG_PATH.to_string(), 34 | exhauster_config_path: EXHAUSTER_CONFIG_PATH.to_string(), 35 | lsm_tree_config_path: LSM_TREE_CONFIG_PATH.to_string(), 36 | rudder_node_id: RUDDER_NODE_ID, 37 | wheel_node_id_base: WHEEL_NODE_ID_BASE, 38 | exhauster_node_id_base: EXHAUSTER_NODE_ID_BASE, 39 | rudder_port: RUDDER_PORT, 40 | wheel_port_base: WHEEL_PORT_BASE, 41 | wheel_prometheus_port_base: WHEEL_PROMETHEUS_PORT_BASE, 42 | exhauster_port_base: EXHAUSTER_PORT_BASE, 43 | }; 44 | println!("{:#?}", options); 45 | 46 | run(args, options).await; 47 | } 48 | -------------------------------------------------------------------------------- /bench/etc/exhauster.toml: -------------------------------------------------------------------------------- 1 | id = 0 2 | host = "127.0.0.1" 3 | port = 0 4 | data_path = "data" 5 | meta_path = "meta" 6 | heartbeat_interval = "1 s" 7 | 8 | [rudder] 9 | id = 1 10 | host = "127.0.0.1" 11 | port = 0 12 | 13 | [minio] 14 | url = 'minio://minioadmin:minioadmin@127.0.0.1:9000/runkv' 15 | 16 | [s3] 17 | bucket = "runkv" 18 | 19 | [buffer] 20 | write_buffer_capacity = "64 MiB" 21 | 22 | [cache] 23 | block_cache_capacity = "512 MiB" 24 | meta_cache_capacity = "256 MiB" 25 | -------------------------------------------------------------------------------- /bench/etc/lsm_tree.toml: -------------------------------------------------------------------------------- 1 | [lsm_tree] 2 | l1_capacity = "32 MiB" 3 | level_multiplier = 2 4 | 5 | trigger_l0_compaction_ssts = 4 6 | trigger_l0_compaction_interval = "500 ms" 7 | trigger_lmax_compaction_interval = "5 s" 8 | trigger_compaction_interval = "2 s" 9 | 10 | sstable_capacity = "4 MiB" 11 | block_capacity = "1 MiB" 12 | restart_interval = 2 13 | bloom_false_positive = 0.1 14 | 15 | compaction_pin_ttl = "15 s" 16 | 17 | [[lsm_tree.levels_options]] 18 | compaction_strategy = "Overlap" 19 | compression_algorithm = "None" 20 | 21 | [[lsm_tree.levels_options]] 22 | compaction_strategy = "NonOverlap" 23 | compression_algorithm = "None" 24 | 25 | [[lsm_tree.levels_options]] 26 | compaction_strategy = "NonOverlap" 27 | compression_algorithm = "None" 28 | 29 | [[lsm_tree.levels_options]] 30 | compaction_strategy = "NonOverlap" 31 | compression_algorithm = "Lz4" 32 | -------------------------------------------------------------------------------- /bench/etc/rudder.toml: -------------------------------------------------------------------------------- 1 | id = 0 2 | host = "127.0.0.1" 3 | port = 0 4 | data_path = "data" 5 | meta_path = "meta" 6 | 
health_timeout = "10 s"
7 |
8 | # [minio]
9 | # url = 'minio://minioadmin:minioadmin@127.0.0.1:9000/runkv'
10 |
11 | # [s3]
12 | # bucket = "runkv"
13 |
14 | [cache]
15 | block_cache_capacity = "0 B"
16 | meta_cache_capacity = "64 kiB"
17 |
--------------------------------------------------------------------------------
/bench/etc/wheel.toml:
--------------------------------------------------------------------------------
1 | id = 0
2 | host = "127.0.0.1"
3 | port = 0
4 | log = ".run/log/"
5 | data_path = "data"
6 | meta_path = "meta"
7 | poll_interval = "100ms"
8 | heartbeat_interval = "100ms"
9 |
10 | [rudder]
11 | id = 1
12 | host = "127.0.0.1"
13 | port = 0
14 |
15 | # [minio]
16 | # url = 'minio://minioadmin:minioadmin@127.0.0.1:9000/runkv'
17 |
18 | # [s3]
19 | # bucket = "runkv"
20 |
21 | [buffer]
22 | write_buffer_capacity = "4 MiB"
23 |
24 | [cache]
25 | block_cache_capacity = "64 MiB"
26 | meta_cache_capacity = "64 MiB"
27 |
28 | [raft_log_store]
29 | log_dir_path = "{/path/to/log/dir}"
30 | log_file_capacity = "64 MiB"
31 | block_cache_capacity = "256 MiB"
32 | persist = "{persist}"
33 |
34 | [prometheus]
35 | host = "127.0.0.1"
36 | port = 0
37 |
38 | [tiered_cache]
39 | type = "FileCache"
40 | [tiered_cache.args]
41 | dir = "/path/to/file/cache/dir"
42 | capacity = "1 GiB"
43 | total_buffer_capacity = "1 GiB"
44 | cache_file_fallocate_unit = "256 MiB"
45 | cache_meta_fallocate_unit = "64 MiB"
46 | cache_file_max_write_size = "4 MiB"
--------------------------------------------------------------------------------
/client/Cargo.toml:
--------------------------------------------------------------------------------
1 | [package]
2 | name = "runkv-client"
3 | version = "0.1.0"
4 | edition = "2021"
5 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
6 |
7 | [dependencies]
8 | anyhow = "1.0"
9 | async-trait = "0.1"
10 | itertools = "0.10.3"
11 | parking_lot = "0.12"
12 | runkv-common = { path = "../common" }
13 | runkv-proto = { path = "../proto" }
14 | thiserror = "1.0"
15 | tokio = { version = "1", features = [
16 | "rt-multi-thread",
17 | "sync",
18 | "macros",
19 | "time",
20 | ] }
21 | tonic = "0.6.2"
22 | tracing = "0.1"
23 |
--------------------------------------------------------------------------------
/client/src/error.rs:
--------------------------------------------------------------------------------
1 | use tonic::Status;
2 |
3 | #[derive(thiserror::Error, Debug)]
4 | pub enum Error {
5 | #[error("rpc status error: {0}")]
6 | RpcStatus(#[from] Status),
7 | #[error("kv error: {0}")]
8 | KvError(#[from] KvError),
9 | #[error("config error: {0}")]
10 | ConfigError(String),
11 | #[error("other: {0}")]
12 | Other(String),
13 | }
14 |
15 | impl Error {
16 | pub fn err(e: impl Into<Box<dyn std::error::Error>>) -> Error {
17 | Error::Other(e.into().to_string())
18 | }
19 |
20 | pub fn config_err(e: impl Into<Box<dyn std::error::Error>>) -> Error {
21 | Error::ConfigError(e.into().to_string())
22 | }
23 |
24 | pub fn redirect(&self) -> bool {
25 | matches!(self, Self::KvError(KvError::Redirect))
26 | }
27 | }
28 |
29 | #[derive(thiserror::Error, Debug)]
30 | pub enum KvError {
31 | #[error("temporarily no leader for key: {0:?}")]
32 | TemporarilyNoLeader(Vec<u8>),
33 | #[error("valid leader changed, need redirect")]
34 | Redirect,
35 | }
36 |
37 | pub type Result<T> = std::result::Result<T, Error>;
38 |
--------------------------------------------------------------------------------
/client/src/lib.rs:
--------------------------------------------------------------------------------
1 | pub mod client;
2 | pub mod error;
3 | pub mod router;
4 | pub mod worker;
5 |
--------------------------------------------------------------------------------
/client/src/router.rs:
--------------------------------------------------------------------------------
1 | use std::collections::{BTreeMap, HashMap};
2 | use std::sync::Arc;
3 |
4 | use itertools::Itertools;
5 | use parking_lot::RwLock;
6 | use runkv_proto::meta::{KeyRange, KeyRangeInfo};
7 |
8 | fn _is_overlap(r1: &KeyRange, r2: &KeyRange) -> bool {
9 | !(r1.start_key > r2.end_key || r1.end_key < r2.start_key)
10 | }
11 |
12 | fn in_range(key: &[u8], range: &KeyRange) -> bool {
13 | key >= &range.start_key[..] && key < &range.end_key[..]
14 | }
15 |
16 | pub struct LeaderInfo {
17 | pub node: u64,
18 | pub group: u64,
19 | pub raft_node: u64,
20 | }
21 |
22 | struct RouterCore {
23 | /// { key range -> raft group id }
24 | key_range_groups: BTreeMap<KeyRange, u64>,
25 | /// { raft group id -> [raft node id] }
26 | group_raft_nodes: HashMap<u64, Vec<u64>>,
27 | /// { raft group id -> leader raft node id }
28 | group_leader: HashMap<u64, u64>,
29 | /// { raft node id -> node id }
30 | raft_nodes: HashMap<u64, u64>,
31 | }
32 |
33 | #[derive(Clone)]
34 | pub struct Router {
35 | core: Arc<RwLock<RouterCore>>,
36 | }
37 |
38 | impl Default for Router {
39 | fn default() -> Self {
40 | Self {
41 | core: Arc::new(RwLock::new(RouterCore {
42 | key_range_groups: BTreeMap::default(),
43 | group_raft_nodes: HashMap::default(),
44 | group_leader: HashMap::default(),
45 | raft_nodes: HashMap::default(),
46 | })),
47 | }
48 | }
49 | }
50 |
51 | impl Router {
52 | pub fn leader(&self, key: &[u8]) -> Option<LeaderInfo> {
53 | let core = self.core.read();
54 | for (key_range, &group) in core.key_range_groups.iter() {
55 | if in_range(key, key_range) {
56 | if let Some(&leader) = core.group_leader.get(&group) {
57 | let node = core.raft_nodes.get(&leader).copied().unwrap();
58 | return Some(LeaderInfo {
59 | node,
60 | group,
61 | raft_node: leader,
62 | });
63 | }
64 | return None;
65 | }
66 | }
67 | None
68 | }
69 |
70 | pub fn update_key_ranges(&self, key_range_infos: Vec<KeyRangeInfo>) {
71 | let mut updated = RouterCore {
72 | key_range_groups: BTreeMap::default(),
73 | group_raft_nodes: HashMap::default(),
74 | group_leader: HashMap::default(),
75 | raft_nodes: HashMap::default(),
76 | };
77 | for KeyRangeInfo {
78 | group,
79 | key_range,
80 | raft_nodes,
81 | leader,
82 | } in key_range_infos
83 | {
84 | let key_range = key_range.unwrap();
85 |
86 | updated.key_range_groups.insert(key_range, group);
87 | updated
88 | .group_raft_nodes
89 | .insert(group, raft_nodes.keys().copied().collect_vec());
90 | updated.group_leader.insert(group, leader);
91 | for (raft_node, node) in raft_nodes {
92 | updated.raft_nodes.insert(raft_node, node);
93 | }
94 | }
95 | let mut core = self.core.write();
96 | *core = updated;
97 | }
98 | }
99 |
100 | #[cfg(test)]
101 | mod tests {
102 | use super::*;
103 |
104 | fn is_send_sync_clone<T: Send + Sync + Clone>() {}
105 |
106 | #[test]
107 | fn ensure_send_sync_clone() {
108 | is_send_sync_clone::<Router>();
109 | }
110 | }
111 |
--------------------------------------------------------------------------------
/client/src/worker/heartbeater.rs:
--------------------------------------------------------------------------------
1 | use std::time::Duration;
2 |
3 | use async_trait::async_trait;
4 | use runkv_common::channel_pool::ChannelPool;
5 | use runkv_common::config::Node;
6 | use runkv_common::Worker;
7 | use runkv_proto::rudder::control_service_client::ControlServiceClient;
8 | use runkv_proto::rudder::RouterRequest;
9 | use tonic::Request;
10 | use
tracing::warn; 11 | 12 | use crate::error::{Error, Result}; 13 | use crate::router::Router; 14 | 15 | pub struct HeartbeaterOptions { 16 | pub rudder: u64, 17 | pub heartbeat_interval: Duration, 18 | 19 | pub router: Router, 20 | pub channel_pool: ChannelPool, 21 | } 22 | 23 | pub struct Heartbeater { 24 | rudder: u64, 25 | heartbeat_interval: Duration, 26 | 27 | router: Router, 28 | channel_pool: ChannelPool, 29 | } 30 | 31 | impl Heartbeater { 32 | pub fn new(options: HeartbeaterOptions) -> Self { 33 | Self { 34 | rudder: options.rudder, 35 | heartbeat_interval: options.heartbeat_interval, 36 | 37 | router: options.router, 38 | channel_pool: options.channel_pool, 39 | } 40 | } 41 | 42 | async fn run_inner(&mut self) -> Result<()> { 43 | loop { 44 | tokio::time::sleep(self.heartbeat_interval).await; 45 | let channel = self 46 | .channel_pool 47 | .get(self.rudder) 48 | .await 49 | .map_err(Error::err)?; 50 | let mut client = ControlServiceClient::new(channel); 51 | let rsp = client 52 | .router(Request::new(RouterRequest::default())) 53 | .await? 54 | .into_inner(); 55 | self.router.update_key_ranges(rsp.key_ranges); 56 | for (node, endpoint) in rsp.wheels { 57 | self.channel_pool 58 | .put_node(Node { 59 | id: node, 60 | host: endpoint.host, 61 | port: endpoint.port as u16, 62 | }) 63 | .await; 64 | } 65 | } 66 | } 67 | } 68 | 69 | #[async_trait] 70 | impl Worker for Heartbeater { 71 | async fn run(&mut self) -> anyhow::Result<()> { 72 | // TODO: Gracefully kill. 73 | loop { 74 | match self.run_inner().await { 75 | Ok(_) => {} 76 | Err(e) => warn!("error occur when heartbeater running: {}", e), 77 | } 78 | } 79 | } 80 | } 81 | -------------------------------------------------------------------------------- /client/src/worker/mod.rs: -------------------------------------------------------------------------------- 1 | pub mod heartbeater; 2 | -------------------------------------------------------------------------------- /codecov.yml: -------------------------------------------------------------------------------- 1 | coverage: 2 | status: 3 | project: 4 | default: 5 | threshold: 5% 6 | patch: off 7 | ignore: 8 | - "**/bench" 9 | - "**/benches" -------------------------------------------------------------------------------- /common/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "runkv-common" 3 | version = "0.1.0" 4 | edition = "2021" 5 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html 6 | 7 | [dependencies] 8 | anyhow = "1.0" 9 | async-trait = "0.1" 10 | bincode = "1.3.3" 11 | bytes = "1" 12 | bytesize = { version = "1.1.0", features = ["serde"] } 13 | chrono = "0.4" 14 | clap = { version = "3.1.6", features = ["derive"] } 15 | console-subscriber = { version = "0.1.6", optional = true } 16 | futures = "0.3" 17 | http = "0.2.6" 18 | humantime = "2.1.0" 19 | humantime-serde = "1.1.1" 20 | hyper = { version = "^0.14", features = ["server", "http1", "tcp"] } 21 | isahc = "1" # isahc is the http client used for tracing. Always set it as the same version as opentelemetry-jaeger's. 
22 | itertools = "0.10.3"
23 | lazy_static = "1.4.0"
24 | opentelemetry = { version = "0.17", features = ["rt-tokio", "trace"] }
25 | opentelemetry-jaeger = { version = "0.16", features = [
26 | "rt-tokio",
27 | "collector_client",
28 | "isahc",
29 | "isahc_collector_client",
30 | ] }
31 | ouroboros = "0.15.0"
32 | parking_lot = "0.12"
33 | prometheus = "0.13.0"
34 | rand = "0.8.5"
35 | serde = "1.0"
36 | serde_derive = "1.0"
37 | slog = "2.7"
38 | tokio = { version = "1", features = ["rt-multi-thread", "sync"] }
39 | toml = "0.4.2"
40 | tonic = "0.6.2"
41 | tracing = "0.1"
42 | tracing-appender = "0.2"
43 | tracing-opentelemetry = "0.17"
44 | tracing-subscriber = { version = "0.3", features = ["env-filter"] }
45 |
46 | [dev-dependencies]
47 | criterion = { version = "0.3", features = ["async", "async_tokio"] }
48 | env_logger = "*"
49 | test-log = "0.2.10"
50 |
51 | [features]
52 | console = ["console-subscriber"]
53 | trace-notify-pool = []
54 | tracing = []
55 | verbose-release-log = ["tracing/release_max_level_trace"]
56 |
57 | [[bench]]
58 | name = "bench_sharded_hash_map"
59 | harness = false
60 | # Uncomment this line if you are generating a flame graph.
61 | # debug = true
62 |
--------------------------------------------------------------------------------
/common/benches/bench_sharded_hash_map.rs:
--------------------------------------------------------------------------------
1 | use std::collections::HashMap;
2 | use std::sync::Arc;
3 | use std::time::Duration;
4 |
5 | use criterion::{criterion_group, criterion_main, Criterion};
6 | use itertools::Itertools;
7 | use parking_lot::RwLock;
8 | use runkv_common::sharded_hash_map::ShardedHashMap;
9 |
10 | const CONCURRENCY: u64 = 1024;
11 | const SLEEP: Duration = Duration::from_micros(200);
12 |
13 | fn sharded_hash_map_concurrent_put_get(shards: u16) {
14 | let map = ShardedHashMap::new(shards);
15 |
16 | let job = |map: ShardedHashMap<u64, u64>, i: u64, total: u64| {
17 | {
18 | assert_eq!(map.insert(i, i), None);
19 | }
20 | {
21 | let read = map.read(&i);
22 | assert_eq!(read.get(), Some(&i));
23 | std::thread::sleep(SLEEP);
24 | drop(read);
25 | }
26 | {
27 | let mut write = map.write(&i);
28 | assert_eq!(write.get(), Some(&i));
29 | *write.get_mut().unwrap() += total;
30 | assert_eq!(write.get(), Some(&(i + total)));
31 | std::thread::sleep(SLEEP);
32 | drop(write);
33 | }
34 | {
35 | let read = map.read(&i);
36 | assert_eq!(read.get(), Some(&(i + total)));
37 | std::thread::sleep(SLEEP);
38 | drop(read);
39 | }
40 | };
41 |
42 | let handles = (0..CONCURRENCY)
43 | .into_iter()
44 | .map(|i| {
45 | let map_clone = map.clone();
46 | std::thread::spawn(move || job(map_clone, i, CONCURRENCY))
47 | })
48 | .collect_vec();
49 |
50 | for handle in handles {
51 | handle.join().unwrap();
52 | }
53 | }
54 |
55 | fn hash_map_concurrent_put_get() {
56 | let map = Arc::new(RwLock::new(HashMap::default()));
57 |
58 | let job = |map: Arc<RwLock<HashMap<u64, u64>>>, i: u64, total: u64| {
59 | {
60 | assert_eq!(map.write().insert(i, i), None);
61 | }
62 | {
63 | let read = map.read();
64 | assert_eq!(read.get(&i), Some(&i));
65 | std::thread::sleep(SLEEP);
66 | drop(read);
67 | }
68 | {
69 | let mut write = map.write();
70 | assert_eq!(write.get(&i), Some(&i));
71 | *write.get_mut(&i).unwrap() += total;
72 | assert_eq!(write.get(&i), Some(&(i + total)));
73 | std::thread::sleep(SLEEP);
74 | drop(write);
75 | }
76 | {
77 | let read = map.read();
78 | assert_eq!(read.get(&i), Some(&(i + total)));
79 | std::thread::sleep(SLEEP);
80 | drop(read);
81 | }
82 | };
83 |
84 | let handles = (0..CONCURRENCY)
85 | .into_iter()
86 | .map(|i| {
87 | let map_clone = map.clone();
88 | std::thread::spawn(move || job(map_clone, i, CONCURRENCY))
89 | })
90 | .collect_vec();
91 |
92 | for handle in handles {
93 | handle.join().unwrap();
94 | }
95 | }
96 |
97 | fn bench_hash_map_concurrent_put_get(c: &mut Criterion) {
98 | let mut group = c.benchmark_group("10 samples");
99 | group.sample_size(10);
100 |
101 | group.bench_function("hash map concurrent put get", |b| {
102 | b.iter(hash_map_concurrent_put_get)
103 | });
104 |
105 | group.bench_function("sharded hash map concurrent put get - 1 shard(s)", |b| {
106 | b.iter(|| sharded_hash_map_concurrent_put_get(1))
107 | });
108 |
109 | group.bench_function("sharded hash map concurrent put get - 16 shard(s)", |b| {
110 | b.iter(|| sharded_hash_map_concurrent_put_get(16))
111 | });
112 |
113 | group.bench_function("sharded hash map concurrent put get - 64 shard(s)", |b| {
114 | b.iter(|| sharded_hash_map_concurrent_put_get(64))
115 | });
116 |
117 | group.bench_function("sharded hash map concurrent put get - 256 shard(s)", |b| {
118 | b.iter(|| sharded_hash_map_concurrent_put_get(256))
119 | });
120 |
121 | group.bench_function("sharded hash map concurrent put get - 1024 shard(s)", |b| {
122 | b.iter(|| sharded_hash_map_concurrent_put_get(1024))
123 | });
124 |
125 | group.finish();
126 | }
127 |
128 | criterion_group!(benches, bench_hash_map_concurrent_put_get);
129 | criterion_main!(benches);
130 |
--------------------------------------------------------------------------------
/common/src/atomic.rs:
--------------------------------------------------------------------------------
1 | #[macro_export]
2 | macro_rules! may_advance_atomic {
3 | ($atomic:expr, $val:expr) => {
4 | let mut old = $atomic.load(Ordering::Relaxed);
5 | while $val > old {
6 | match $atomic.compare_exchange_weak(old, $val, Ordering::SeqCst, Ordering::Relaxed) {
7 | Ok(_) => break,
8 | Err(v) => old = v,
9 | }
10 | }
11 | };
12 | }
13 |
--------------------------------------------------------------------------------
/common/src/channel_pool.rs:
--------------------------------------------------------------------------------
1 | use std::collections::BTreeMap;
2 | use std::sync::Arc;
3 |
4 | use tokio::sync::Mutex;
5 | use tonic::transport::{Channel, Endpoint};
6 |
7 | use crate::config::Node;
8 |
9 | struct ChannelPoolCore {
10 | endpoints: BTreeMap<u64, Endpoint>,
11 | channels: BTreeMap<u64, Channel>,
12 | }
13 |
14 | #[derive(Clone)]
15 | pub struct ChannelPool {
16 | core: Arc<Mutex<ChannelPoolCore>>,
17 | }
18 |
19 | fn endpoint(node: &Node) -> Endpoint {
20 | Endpoint::from_shared(format!("http://{}:{}", node.host, node.port)).unwrap()
21 | }
22 |
23 | impl Default for ChannelPool {
24 | fn default() -> Self {
25 | Self::with_nodes(vec![])
26 | }
27 | }
28 |
29 | impl ChannelPool {
30 | pub fn with_nodes(nodes: Vec<Node>) -> Self {
31 | Self {
32 | core: Arc::new(Mutex::new(ChannelPoolCore {
33 | endpoints: BTreeMap::from_iter(
34 | nodes.into_iter().map(|node| (node.id, endpoint(&node))),
35 | ),
36 | channels: BTreeMap::default(),
37 | })),
38 | }
39 | }
40 |
41 | pub async fn put_node(&self, node: Node) {
42 | let mut guard = self.core.lock().await;
43 | guard.endpoints.insert(node.id, endpoint(&node));
44 | }
45 |
46 | pub async fn get(&self, node: u64) -> anyhow::Result<Channel> {
47 | let mut guard = self.core.lock().await;
48 | if let Some(channel) = guard.channels.get(&node) {
49 | return Ok(channel.clone());
50 | }
51 | if let Some(endpoint) = guard.endpoints.get(&node) {
52 | let channel = endpoint.connect().await?;
53 | guard.channels.insert(node, channel.clone());
54 | return Ok(channel);
55 | }
56 | Err(anyhow::anyhow!("endpoint of node {} not found", node))
57 | }
58 |
59 | pub async fn release(&self, node: u64) -> anyhow::Result<()> {
60 | let mut guard = self.core.lock().await;
61 | match guard.channels.remove(&node) {
62 | Some(_) => Ok(()),
63 | None => Err(anyhow::anyhow!("channel to node {} not exists", node)),
64 | }
65 | }
66 | }
67 |
--------------------------------------------------------------------------------
/common/src/coding.rs:
--------------------------------------------------------------------------------
1 | use bytes::{Buf, BufMut};
2 | use serde::Deserialize;
3 |
4 | pub trait BytesSerde<'de>: serde::Serialize + serde::Deserialize<'de> + Sized {
5 | fn encode_to_vec(&self) -> anyhow::Result<Vec<u8>> {
6 | bincode::serialize(self).map_err(|e| anyhow::anyhow!("bincode serialize error: {}", e))
7 | }
8 |
9 | fn decode(slice: &'de [u8]) -> anyhow::Result<Self> {
10 | bincode::deserialize(slice).map_err(|e| anyhow::anyhow!("bincode deserialize error: {}", e))
11 | }
12 | }
13 |
14 | #[derive(Deserialize, Clone, Copy, Debug)]
15 | pub enum CompressionAlgorithm {
16 | None,
17 | Lz4,
18 | }
19 |
20 | impl CompressionAlgorithm {
21 | pub fn encode(&self, buf: &mut impl BufMut) {
22 | let v = match self {
23 | Self::None => 0,
24 | Self::Lz4 => 1,
25 | };
26 | buf.put_u8(v);
27 | }
28 |
29 | pub fn decode(buf: &mut impl Buf) -> Result<Self, anyhow::Error> {
30 | match buf.get_u8() {
31 | 0 => Ok(Self::None),
32 | 1 => Ok(Self::Lz4),
33 | _ => Err(anyhow::anyhow!("not valid compression algorithm")),
34 | }
35 | }
36 | }
37 |
38 | impl From<CompressionAlgorithm> for u8 {
39 | fn from(ca: CompressionAlgorithm) -> Self {
40 | match ca {
41 | CompressionAlgorithm::None => 0,
42 | CompressionAlgorithm::Lz4 => 1,
43 | }
44 | }
45 | }
46 |
47 | impl From<CompressionAlgorithm> for u64 {
48 | fn from(ca: CompressionAlgorithm) -> Self {
49 | match ca {
50 | CompressionAlgorithm::None => 0,
51 | CompressionAlgorithm::Lz4 => 1,
52 | }
53 | }
54 | }
55 |
56 | impl TryFrom<u8> for CompressionAlgorithm {
57 | type Error = anyhow::Error;
58 | fn try_from(v: u8) -> core::result::Result<Self, Self::Error> {
59 | match v {
60 | 0 => Ok(Self::None),
61 | 1 => Ok(Self::Lz4),
62 | _ => Err(anyhow::anyhow!("not valid compression algorithm")),
63 | }
64 | }
65 | }
66 |
--------------------------------------------------------------------------------
/common/src/config.rs:
--------------------------------------------------------------------------------
1 | use std::str::FromStr;
2 |
3 | use serde::Deserialize;
4 |
5 | use crate::coding::CompressionAlgorithm;
6 |
7 | #[derive(Deserialize, Clone, Copy, PartialEq, Eq, Debug)]
8 | pub enum LevelCompactionStrategy {
9 | Overlap,
10 | NonOverlap,
11 | }
12 |
13 | #[derive(Deserialize, Clone, Debug)]
14 | pub struct LevelOptions {
15 | pub compaction_strategy: LevelCompactionStrategy,
16 | pub compression_algorithm: CompressionAlgorithm,
17 | }
18 |
19 | #[derive(Deserialize, Clone, Default, Debug)]
20 | pub struct LsmTreeConfig {
21 | pub l1_capacity: String,
22 | pub level_multiplier: usize,
23 | pub trigger_l0_compaction_ssts: usize,
24 | pub trigger_l0_compaction_interval: String,
25 | pub trigger_lmax_compaction_interval: String,
26 | pub trigger_compaction_interval: String,
27 | pub sstable_capacity: String,
28 | pub block_capacity: String,
29 | pub restart_interval: usize,
30 | pub bloom_false_positive: f64,
31 | pub compaction_pin_ttl: String,
32 | pub levels_options: Vec<LevelOptions>,
33 | }
34 |
35 | impl FromStr for LsmTreeConfig {
36 | type Err = anyhow::Error;
37 |
38 | fn from_str(s: &str) -> Result<Self, Self::Err> {
39 | let c = toml::from_str(s)?;
40 | Ok(c)
41 | }
42 | }
43 |
44 | // TODO: Fill me.
45 | #[derive(Deserialize, Clone, Debug)]
46 | pub struct S3Config {
47 | pub bucket: String,
48 | }
49 |
50 | #[derive(Deserialize, Clone, Debug)]
51 | pub struct MinioConfig {
52 | pub url: String,
53 | }
54 |
55 | #[derive(Deserialize, Clone, Debug)]
56 | pub struct CacheConfig {
57 | pub block_cache_capacity: String,
58 | pub meta_cache_capacity: String,
59 | }
60 |
61 | #[derive(Deserialize, Clone, Debug)]
62 | pub struct Node {
63 | pub id: u64,
64 | pub host: String,
65 | pub port: u16,
66 | }
67 |
68 | #[derive(Deserialize, Clone, Debug)]
69 | pub struct PrometheusConfig {
70 | pub host: String,
71 | pub port: u16,
72 | }
73 |
74 | #[cfg(test)]
75 | mod tests {
76 |
77 | use test_log::test;
78 |
79 | use super::*;
80 |
81 | #[test]
82 | fn lsm_tree_config_serde() {
83 | let s = r#"
84 | l1_capacity = "1 MiB"
85 | level_multiplier = 10
86 |
87 | trigger_l0_compaction_ssts = 4
88 | trigger_l0_compaction_interval = "1 s"
89 | trigger_lmax_compaction_interval = "10 s"
90 | trigger_compaction_interval = "5 s"
91 |
92 | sstable_capacity = "64 KiB"
93 | block_capacity = "4 KiB"
94 | restart_interval = 2
95 | bloom_false_positive = 0.1
96 |
97 | compaction_pin_ttl = "15 s"
98 |
99 | [[levels_options]]
100 | compaction_strategy = "Overlap"
101 | compression_algorithm = "None"
102 |
103 | [[levels_options]]
104 | compaction_strategy = "NonOverlap"
105 | compression_algorithm = "None"
106 |
107 | [[levels_options]]
108 | compaction_strategy = "NonOverlap"
109 | compression_algorithm = "None"
110 |
111 | [[levels_options]]
112 | compaction_strategy = "NonOverlap"
113 | compression_algorithm = "None"
114 |
115 | [[levels_options]]
116 | compaction_strategy = "NonOverlap"
117 | compression_algorithm = "Lz4"
118 |
119 | [[levels_options]]
120 | compaction_strategy = "NonOverlap"
121 | compression_algorithm = "Lz4"
122 |
123 | [[levels_options]]
124 | compaction_strategy = "NonOverlap"
125 | compression_algorithm = "Lz4""#;
126 | LsmTreeConfig::from_str(s).unwrap();
127 | }
128 | }
129 |
--------------------------------------------------------------------------------
/common/src/context.rs:
--------------------------------------------------------------------------------
1 | use crate::coding::BytesSerde;
2 |
3 | #[derive(serde::Serialize, serde::Deserialize, Clone, Debug)]
4 | pub struct Context {
5 | pub span_id: u64,
6 | pub request_id: u64,
7 | }
8 |
9 | impl<'de> BytesSerde<'de> for Context {}
10 |
--------------------------------------------------------------------------------
/common/src/lib.rs:
--------------------------------------------------------------------------------
1 | pub mod atomic;
2 | pub mod channel_pool;
3 | pub mod coding;
4 | pub mod config;
5 | pub mod context;
6 | pub mod log;
7 | pub mod notify_pool;
8 | pub mod packer;
9 | pub mod prometheus;
10 | pub mod sharded_hash_map;
11 | pub mod sync;
12 | pub mod time;
13 | pub mod tracing_slog_drain;
14 |
15 | use async_trait::async_trait;
16 |
17 | #[async_trait]
18 | pub trait Worker: Sync + Send + 'static {
19 | async fn run(&mut self) -> anyhow::Result<()>;
20 | }
21 |
22 | pub type BoxedWorker = Box<dyn Worker>;
23 |
--------------------------------------------------------------------------------
/common/src/log.rs:
--------------------------------------------------------------------------------
1 | use isahc::config::Configurable;
2 | use tracing_subscriber::filter::Targets;
3 | use tracing_subscriber::fmt::format::FmtSpan;
4 | use tracing_subscriber::layer::{Layer, SubscriberExt};
5 | use tracing_subscriber::util::SubscriberInitExt;
6 |
7 | pub struct LogGuard {
8 | _file_appender_guard: Option<tracing_appender::non_blocking::WorkerGuard>,
9 | pub jaeger_enabled: bool,
10 | pub tokio_console_enabled: bool,
11 | }
12 |
13 | impl Drop for LogGuard {
14 | fn drop(&mut self) {
15 | if self.jaeger_enabled {
16 | opentelemetry::global::shutdown_tracer_provider();
17 | }
18 | }
19 | }
20 |
21 | pub fn init_runkv_logger(service: &str, id: u64, log_path: &str) -> LogGuard {
22 | let tokio_console_enabled = cfg!(feature = "console");
23 | let jaeger_enabled = cfg!(feature = "tracing");
24 |
25 | if tokio_console_enabled {
26 | #[cfg(feature = "console")]
27 | {
28 | console_subscriber::init();
29 | return LogGuard {
30 | _file_appender_guard: None,
31 | jaeger_enabled,
32 | tokio_console_enabled,
33 | };
34 | }
35 | }
36 |
37 | let (file_appender, file_appender_guard) = tracing_appender::non_blocking(
38 | tracing_appender::rolling::daily(log_path, format!("runkv-{}-{}.log", service, id)),
39 | );
40 |
41 | let guard = LogGuard {
42 | _file_appender_guard: Some(file_appender_guard),
43 | jaeger_enabled,
44 | tokio_console_enabled,
45 | };
46 |
47 | let fmt_layer = {
48 | let runkv_log_level = if cfg!(feature = "verbose-release-log") || cfg!(debug_assertions) {
49 | tracing::Level::DEBUG
50 | } else {
51 | tracing::Level::INFO
52 | };
53 |
54 | // Configure RunKV's own crates to log at `runkv_log_level`, and restrict third-party crates.
55 | let filter = Targets::new()
56 | // Enable trace for most modules.
57 | .with_target("runkv_common", runkv_log_level)
58 | .with_target("runkv_storage", runkv_log_level)
59 | .with_target("runkv_rudder", runkv_log_level)
60 | .with_target("runkv_wheel", runkv_log_level)
61 | .with_target("runkv_exhauster", runkv_log_level)
62 | .with_target("runkv_tests", runkv_log_level)
63 | .with_target("openraft::raft", tracing::Level::INFO)
64 | .with_target("raft", tracing::Level::INFO)
65 | .with_target("events", tracing::Level::WARN);
66 |
67 | tracing_subscriber::fmt::layer()
68 | .with_span_events(FmtSpan::ACTIVE)
69 | .with_target(true)
70 | .with_level(true)
71 | .with_writer(file_appender)
72 | .with_ansi(false)
73 | .with_filter(filter)
74 | };
75 |
76 | if jaeger_enabled {
77 | opentelemetry::global::set_text_map_propagator(opentelemetry_jaeger::Propagator::new());
78 |
79 | // Configure RunKV's own crates to log at TRACE level, and ignore all third-party crates.
80 | let filter = Targets::new()
81 | // Enable trace for most modules.
82 | .with_target("runkv_common", tracing::Level::TRACE) 83 | .with_target("runkv_storage", tracing::Level::TRACE) 84 | .with_target("runkv_rudder", tracing::Level::TRACE) 85 | .with_target("runkv_wheel", tracing::Level::TRACE) 86 | .with_target("runkv_exhauster", tracing::Level::TRACE) 87 | .with_target("runkv_tests", tracing::Level::TRACE) 88 | .with_target("openraft::raft", tracing::Level::TRACE) 89 | .with_target("raft", tracing::Level::TRACE) 90 | .with_target("events", tracing::Level::WARN); 91 | 92 | let tracer = opentelemetry_jaeger::new_pipeline() 93 | // TODO: use UDP tracing in production environment 94 | .with_collector_endpoint("http://127.0.0.1:14268/api/traces") 95 | // TODO: change service name to compute-{port} 96 | .with_service_name(service) 97 | // disable proxy 98 | .with_http_client(isahc::HttpClient::builder().proxy(None).build().unwrap()) 99 | .install_batch(opentelemetry::runtime::Tokio) 100 | .unwrap(); 101 | 102 | let opentelemetry_layer = tracing_opentelemetry::layer() 103 | .with_tracer(tracer) 104 | .with_filter(filter); 105 | 106 | tracing_subscriber::registry() 107 | .with(opentelemetry_layer) 108 | .init(); 109 | } else { 110 | tracing_subscriber::registry().with(fmt_layer).init(); 111 | } 112 | 113 | guard 114 | } 115 | -------------------------------------------------------------------------------- /common/src/packer.rs: -------------------------------------------------------------------------------- 1 | use std::sync::Arc; 2 | 3 | use parking_lot::Mutex; 4 | use tokio::sync::oneshot; 5 | 6 | const DEFAULT_QUEUE_CAPACITY: usize = 64; 7 | 8 | pub struct Item 9 | where 10 | T: 'static, 11 | R: 'static, 12 | { 13 | pub data: T, 14 | pub notifier: Option>, 15 | } 16 | 17 | struct PackerCore 18 | where 19 | T: 'static, 20 | R: 'static, 21 | { 22 | queue: Mutex>>, 23 | } 24 | 25 | pub struct Packer 26 | where 27 | T: 'static, 28 | R: 'static, 29 | { 30 | default_queue_capacity: usize, 31 | 32 | core: Arc>, 33 | } 34 | 35 | impl Clone for Packer 36 | where 37 | T: 'static, 38 | R: 'static, 39 | { 40 | fn clone(&self) -> Self { 41 | Self { 42 | default_queue_capacity: self.default_queue_capacity, 43 | core: self.core.clone(), 44 | } 45 | } 46 | } 47 | 48 | impl Default for Packer 49 | where 50 | T: 'static, 51 | R: 'static, 52 | { 53 | fn default() -> Self { 54 | Self::new(DEFAULT_QUEUE_CAPACITY) 55 | } 56 | } 57 | 58 | impl Packer 59 | where 60 | T: 'static, 61 | R: 'static, 62 | { 63 | pub fn new(default_queue_capacity: usize) -> Self { 64 | Self { 65 | default_queue_capacity, 66 | core: Arc::new(PackerCore { 67 | queue: Mutex::new(Vec::with_capacity(default_queue_capacity)), 68 | }), 69 | } 70 | } 71 | 72 | pub fn append(&self, data: T, notifier: Option>) -> bool { 73 | let mut queue = self.core.queue.lock(); 74 | let is_leader = queue.is_empty(); 75 | queue.push(Item { data, notifier }); 76 | is_leader 77 | } 78 | 79 | pub fn package(&self) -> Vec> { 80 | let mut queue = self.core.queue.lock(); 81 | let mut package = Vec::with_capacity(self.default_queue_capacity); 82 | std::mem::swap(&mut package, &mut (*queue)); 83 | package 84 | } 85 | } 86 | 87 | #[cfg(test)] 88 | mod tests { 89 | use super::*; 90 | 91 | fn is_send_sync_clone() {} 92 | 93 | #[test] 94 | fn ensure_send_sync_clone() { 95 | is_send_sync_clone::>(); 96 | } 97 | } 98 | -------------------------------------------------------------------------------- /common/src/prometheus.rs: -------------------------------------------------------------------------------- 1 | use std::net::SocketAddr; 2 | 3 | 
use http::header::CONTENT_TYPE; 4 | use http::{Request, Response}; 5 | use hyper::service::{make_service_fn, service_fn}; 6 | use hyper::{Body, Error, Server}; 7 | use prometheus::{Encoder, TextEncoder}; 8 | use tracing::{error, info}; 9 | 10 | pub struct DefaultPrometheusExporter; 11 | 12 | impl DefaultPrometheusExporter { 13 | pub fn init(addr: SocketAddr) { 14 | tokio::spawn(async move { 15 | info!("Prometheus service is set up on http://{}", addr); 16 | if let Err(e) = Server::bind(&addr) 17 | .serve(make_service_fn(|_| async move { 18 | Ok::<_, Error>(service_fn(Self::serve)) 19 | })) 20 | .await 21 | { 22 | error!("Prometheus service error: {}", e); 23 | } 24 | }); 25 | } 26 | 27 | async fn serve(_request: Request) -> anyhow::Result> { 28 | let encoder = TextEncoder::new(); 29 | let mut buffer = Vec::with_capacity(4096); 30 | let metrics = prometheus::gather(); 31 | encoder.encode(&metrics, &mut buffer).unwrap(); 32 | let response = Response::builder() 33 | .status(200) 34 | .header(CONTENT_TYPE, encoder.format_type()) 35 | .body(Body::from(buffer))?; 36 | Ok(response) 37 | } 38 | } 39 | -------------------------------------------------------------------------------- /common/src/sync.rs: -------------------------------------------------------------------------------- 1 | use std::sync::atomic::{AtomicUsize, Ordering}; 2 | use std::sync::Arc; 3 | 4 | struct TicketLockCore { 5 | head: AtomicUsize, 6 | tail: AtomicUsize, 7 | } 8 | 9 | #[derive(Clone)] 10 | pub struct TicketLock { 11 | core: Arc, 12 | } 13 | 14 | impl Default for TicketLock { 15 | fn default() -> Self { 16 | Self { 17 | core: Arc::new(TicketLockCore { 18 | head: AtomicUsize::new(0), 19 | tail: AtomicUsize::new(0), 20 | }), 21 | } 22 | } 23 | } 24 | 25 | impl TicketLock { 26 | pub fn acquire(&self) -> usize { 27 | let ticket = self.core.head.fetch_add(1, Ordering::SeqCst); 28 | while ticket != self.core.tail.load(Ordering::Acquire) {} 29 | ticket 30 | } 31 | 32 | pub async fn async_acquire(&self) -> usize { 33 | let ticket = self.core.head.fetch_add(1, Ordering::SeqCst); 34 | while ticket != self.core.tail.load(Ordering::Acquire) { 35 | tokio::task::yield_now().await; 36 | } 37 | ticket 38 | } 39 | 40 | pub fn release(&self) { 41 | self.core.tail.fetch_add(1, Ordering::Release); 42 | } 43 | } 44 | 45 | #[cfg(test)] 46 | mod tests { 47 | use std::time::Duration; 48 | 49 | use itertools::Itertools; 50 | use parking_lot::Mutex; 51 | use rand::Rng; 52 | use test_log::test; 53 | 54 | use super::*; 55 | 56 | #[test] 57 | fn test_ticket_lock() { 58 | let lock = TicketLock::default(); 59 | loop { 60 | let results = Arc::new(Mutex::new(vec![])); 61 | 62 | let handles = (0..100) 63 | .into_iter() 64 | .map(|_| { 65 | let lock_clone = lock.clone(); 66 | let results_clone = results.clone(); 67 | std::thread::spawn(move || { 68 | let mut rng = rand::thread_rng(); 69 | std::thread::sleep(Duration::from_millis(rng.gen_range(10..100))); 70 | let ticket = lock_clone.acquire(); 71 | results_clone.lock().push(ticket); 72 | lock_clone.release(); 73 | ticket 74 | }) 75 | }) 76 | .collect_vec(); 77 | 78 | let tickets = handles 79 | .into_iter() 80 | .map(|handle| handle.join().unwrap()) 81 | .collect_vec(); 82 | let mut ordered = true; 83 | for (i, t) in tickets.into_iter().enumerate() { 84 | if i != t { 85 | ordered = false; 86 | break; 87 | } 88 | } 89 | if ordered { 90 | continue; 91 | } 92 | let results = Arc::try_unwrap(results).unwrap().into_inner(); 93 | for (i, r) in results.into_iter().enumerate() { 94 | assert_eq!(i, r); 95 | } 96 | 
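// The push order matched the ticket numbers for a contended run, i.e. acquisition was FIFO-fair; stop retrying.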
break; 97 | } 98 | } 99 | 100 | #[test(tokio::test(flavor = "multi_thread", worker_threads = 10))] 101 | async fn test_ticket_lock_async() { 102 | let lock = TicketLock::default(); 103 | loop { 104 | let results = Arc::new(Mutex::new(vec![])); 105 | 106 | let futures = (0..100) 107 | .into_iter() 108 | .map(|_| { 109 | let lock_clone = lock.clone(); 110 | let results_clone = results.clone(); 111 | async move { 112 | let mut rng = rand::thread_rng(); 113 | tokio::time::sleep(Duration::from_millis(rng.gen_range(10..100))).await; 114 | let ticket = lock_clone.async_acquire().await; 115 | results_clone.lock().push(ticket); 116 | lock_clone.release(); 117 | ticket 118 | } 119 | }) 120 | .collect_vec(); 121 | let tickets = futures::future::join_all(futures).await; 122 | let mut ordered = true; 123 | for (i, t) in tickets.into_iter().enumerate() { 124 | if i != t { 125 | ordered = false; 126 | break; 127 | } 128 | } 129 | if ordered { 130 | continue; 131 | } 132 | let results = Arc::try_unwrap(results).unwrap().into_inner(); 133 | for (i, r) in results.into_iter().enumerate() { 134 | assert_eq!(i, r); 135 | } 136 | break; 137 | } 138 | } 139 | } 140 | -------------------------------------------------------------------------------- /common/src/time.rs: -------------------------------------------------------------------------------- 1 | use std::time::{Duration, SystemTime, UNIX_EPOCH}; 2 | 3 | lazy_static::lazy_static! { 4 | /// 2021-03-09T00:00:00Z. 5 | static ref RUNKV_UNIX_DATE_EPOCH: SystemTime = SystemTime::UNIX_EPOCH + Duration::from_secs(1_615_248_000); 6 | } 7 | 8 | pub fn timestamp() -> u64 { 9 | SystemTime::now() 10 | .duration_since(UNIX_EPOCH) 11 | .unwrap() 12 | .as_millis() as u64 13 | } 14 | 15 | pub fn rtimestamp() -> u64 { 16 | RUNKV_UNIX_DATE_EPOCH.elapsed().unwrap().as_millis() as u64 17 | } 18 | 19 | #[cfg(test)] 20 | mod tests { 21 | use chrono::{Local, TimeZone, Utc}; 22 | use test_log::test; 23 | 24 | use super::*; 25 | 26 | #[test] 27 | fn test_singularity_system_time() { 28 | let utc = Utc.ymd(2021, 3, 9).and_hms(0, 0, 0); 29 | let runkv_dt = Local.from_utc_datetime(&utc.naive_utc()); 30 | let runkv_st = SystemTime::from(runkv_dt); 31 | assert_eq!(runkv_st, *RUNKV_UNIX_DATE_EPOCH); 32 | } 33 | } 34 | -------------------------------------------------------------------------------- /common/src/tracing_slog_drain.rs: -------------------------------------------------------------------------------- 1 | pub struct TracingSlogDrain; 2 | 3 | macro_rules! tracing_event { 4 | ($level:expr, $msg:expr, $filepath:expr, $namespace:expr, $lineno:expr) => { 5 | match $level { 6 | tracing::Level::ERROR => tracing::error!( 7 | code.filepath = $filepath, 8 | code.namespace = $namespace, 9 | code.lineno = $lineno, 10 | "{}", 11 | $msg 12 | ), 13 | tracing::Level::WARN => tracing::warn!( 14 | code.filepath = $filepath, 15 | code.namespace = $namespace, 16 | code.lineno = $lineno, 17 | "{}", 18 | $msg 19 | ), 20 | tracing::Level::INFO => tracing::info!("{}", $msg), 21 | tracing::Level::DEBUG => tracing::debug!("{}", $msg), 22 | tracing::Level::TRACE => tracing::trace!( 23 | code.filepath = $filepath, 24 | code.namespace = $namespace, 25 | code.lineno = $lineno, 26 | "{}", 27 | $msg 28 | ), 29 | } 30 | }; 31 | } 32 | 33 | fn level(level: slog::Level) -> tracing::Level { 34 | match level { 35 | // There is no `Critical` level in `tracing`.
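// `Critical` is therefore mapped to `ERROR`, the closest level `tracing` offers.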
36 | slog::Level::Critical => tracing::Level::ERROR, 37 | slog::Level::Error => tracing::Level::ERROR, 38 | slog::Level::Warning => tracing::Level::WARN, 39 | slog::Level::Info => tracing::Level::INFO, 40 | slog::Level::Debug => tracing::Level::DEBUG, 41 | slog::Level::Trace => tracing::Level::TRACE, 42 | } 43 | } 44 | 45 | struct KvSerializer { 46 | writer: W, 47 | } 48 | 49 | impl KvSerializer { 50 | fn new(writer: W) -> Self { 51 | Self { writer } 52 | } 53 | 54 | fn into_inner(self) -> W { 55 | self.writer 56 | } 57 | 58 | fn write(&mut self, arg: &std::fmt::Arguments) -> slog::Result { 59 | write!(self.writer, "{}", arg)?; 60 | Ok(()) 61 | } 62 | } 63 | 64 | impl slog::Serializer for KvSerializer { 65 | fn emit_arguments(&mut self, key: slog::Key, val: &std::fmt::Arguments) -> slog::Result { 66 | write!(self.writer, " {}={}", key, val)?; 67 | Ok(()) 68 | } 69 | } 70 | 71 | impl slog::Drain for TracingSlogDrain { 72 | type Ok = (); 73 | 74 | type Err = slog::Never; 75 | 76 | fn log( 77 | &self, 78 | record: &slog::Record, 79 | values: &slog::OwnedKVList, 80 | ) -> std::result::Result { 81 | use slog::KV; 82 | 83 | let writer = std::io::Cursor::new(Vec::new()); 84 | let mut serializer = KvSerializer::new(writer); 85 | 86 | serializer.write(record.msg()).unwrap(); 87 | values.serialize(record, &mut serializer).unwrap(); 88 | 89 | let buf = serializer.into_inner().into_inner(); 90 | let s = String::from_utf8_lossy(&buf); 91 | 92 | let level = level(record.level()); 93 | 94 | let location = record.location(); 95 | 96 | tracing_event!( 97 | level, 98 | s.as_ref(), 99 | location.file, 100 | location.module, 101 | location.line 102 | ); 103 | Ok(()) 104 | } 105 | } 106 | -------------------------------------------------------------------------------- /etc/exhauster.toml: -------------------------------------------------------------------------------- 1 | id = 201 2 | host = "127.0.0.1" 3 | port = 12501 4 | data_path = "data" 5 | meta_path = "meta" 6 | heartbeat_interval = "1 s" 7 | 8 | [rudder] 9 | id = 1 10 | host = "127.0.0.1" 11 | port = 12300 12 | 13 | [minio] 14 | url = 'minio://minioadmin:minioadmin@127.0.0.1:9000/runkv' 15 | 16 | [s3] 17 | bucket = "runkv" 18 | 19 | [buffer] 20 | write_buffer_capacity = "64 MiB" 21 | 22 | [cache] 23 | block_cache_capacity = "512 MiB" 24 | meta_cache_capacity = "256 MiB" 25 | -------------------------------------------------------------------------------- /etc/grafana-provisioning/dashboards/runkv-dashboards.yml: -------------------------------------------------------------------------------- 1 | apiVersion: 1 2 | 3 | providers: 4 | - name: 'runkv-overview' 5 | orgId: 1 6 | folder: 'runkv' 7 | folderUid: '' 8 | type: file 9 | options: 10 | path: etc/grafana-dashboards/runkv-overview.json 11 | -------------------------------------------------------------------------------- /etc/grafana-provisioning/datasources/runkv-prometheus.yml: -------------------------------------------------------------------------------- 1 | apiVersion: 1 2 | deleteDatasources: 3 | - name: risedev-prometheus 4 | datasources: 5 | - name: risedev-prometheus 6 | type: prometheus 7 | access: proxy 8 | url: http://127.0.0.1:9091 9 | withCredentials: false 10 | isDefault: false 11 | tlsAuth: false 12 | tlsAuthWithCACert: false 13 | version: 1 14 | editable: true 15 | isDefault: true -------------------------------------------------------------------------------- /etc/lsm_tree.toml: -------------------------------------------------------------------------------- 1 | [lsm_tree] 2 | 
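# Capacities below are human-readable byte sizes (e.g. "64 KiB") and intervals are duration strings (e.g. "1 s").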
l1_capacity = "1 MiB" 3 | level_multiplier = 10 4 | 5 | trigger_l0_compaction_ssts = 4 6 | trigger_l0_compaction_interval = "1 s" 7 | trigger_lmax_compaction_interval = "10 s" 8 | trigger_compaction_interval = "5 s" 9 | 10 | sstable_capacity = "64 KiB" 11 | block_capacity = "4 KiB" 12 | restart_interval = 2 13 | bloom_false_positive = 0.1 14 | 15 | compaction_pin_ttl = "15 s" 16 | 17 | [[lsm_tree.levels_options]] 18 | compaction_strategy = "Overlap" 19 | compression_algorithm = "None" 20 | 21 | [[lsm_tree.levels_options]] 22 | compaction_strategy = "NonOverlap" 23 | compression_algorithm = "None" 24 | 25 | [[lsm_tree.levels_options]] 26 | compaction_strategy = "NonOverlap" 27 | compression_algorithm = "None" 28 | 29 | [[lsm_tree.levels_options]] 30 | compaction_strategy = "NonOverlap" 31 | compression_algorithm = "None" 32 | 33 | [[lsm_tree.levels_options]] 34 | compaction_strategy = "NonOverlap" 35 | compression_algorithm = "Lz4" 36 | 37 | [[lsm_tree.levels_options]] 38 | compaction_strategy = "NonOverlap" 39 | compression_algorithm = "Lz4" 40 | 41 | [[lsm_tree.levels_options]] 42 | compaction_strategy = "NonOverlap" 43 | compression_algorithm = "Lz4" -------------------------------------------------------------------------------- /etc/prometheus.yml: -------------------------------------------------------------------------------- 1 | global: 2 | scrape_interval: 15s 3 | evaluation_interval: 15s 4 | scrape_configs: 5 | - job_name: "prometheus-runkv" 6 | scrape_interval: 1s 7 | static_configs: 8 | - targets: 9 | [ 10 | "127.0.0.1:9890", 11 | "127.0.0.1:9891", 12 | "127.0.0.1:9892", 13 | "127.0.0.1:9893", 14 | "127.0.0.1:9894", 15 | "127.0.0.1:9895", 16 | "127.0.0.1:9896", 17 | "127.0.0.1:9897", 18 | "127.0.0.1:9898", 19 | "127.0.0.1:9899", 20 | ] 21 | -------------------------------------------------------------------------------- /etc/rudder.toml: -------------------------------------------------------------------------------- 1 | id = 1 2 | host = "127.0.0.1" 3 | port = 12301 4 | data_path = "data" 5 | meta_path = "meta" 6 | health_timeout = "10 s" 7 | 8 | [minio] 9 | url = 'minio://minioadmin:minioadmin@127.0.0.1:9000/runkv' 10 | 11 | [s3] 12 | bucket = "runkv" 13 | 14 | [cache] 15 | block_cache_capacity = "0 B" 16 | meta_cache_capacity = "256 MiB" 17 | 18 | [lsm_tree] 19 | trigger_l0_compaction_ssts = 4 20 | trigger_l0_compaction_interval = "1 s" 21 | trigger_compaction_interval = "5 s" 22 | 23 | sstable_capacity = "64 KiB" 24 | block_capacity = "4 KiB" 25 | restart_interval = 2 26 | bloom_false_positive = 0.1 27 | 28 | compaction_pin_ttl = "15 s" 29 | 30 | [[lsm_tree.levels_options]] 31 | compaction_strategy = "Overlap" 32 | compression_algorithm = "None" 33 | 34 | [[lsm_tree.levels_options]] 35 | compaction_strategy = "NonOverlap" 36 | compression_algorithm = "None" 37 | 38 | [[lsm_tree.levels_options]] 39 | compaction_strategy = "NonOverlap" 40 | compression_algorithm = "None" 41 | 42 | [[lsm_tree.levels_options]] 43 | compaction_strategy = "NonOverlap" 44 | compression_algorithm = "None" 45 | 46 | [[lsm_tree.levels_options]] 47 | compaction_strategy = "NonOverlap" 48 | compression_algorithm = "Lz4" 49 | 50 | [[lsm_tree.levels_options]] 51 | compaction_strategy = "NonOverlap" 52 | compression_algorithm = "Lz4" 53 | 54 | [[lsm_tree.levels_options]] 55 | compaction_strategy = "NonOverlap" 56 | compression_algorithm = "Lz4" 57 | -------------------------------------------------------------------------------- /etc/wheel.toml: 
-------------------------------------------------------------------------------- 1 | id = 101 2 | host = "127.0.0.1" 3 | port = 12401 4 | log = ".run/log/" 5 | data_path = "data" 6 | meta_path = "meta" 7 | poll_interval = "100ms" 8 | heartbeat_interval = "100ms" 9 | 10 | [rudder] 11 | id = 1 12 | host = "127.0.0.1" 13 | port = 12301 14 | 15 | [minio] 16 | url = 'minio://minioadmin:minioadmin@127.0.0.1:9000/runkv' 17 | 18 | [s3] 19 | bucket = "runkv" 20 | 21 | [buffer] 22 | write_buffer_capacity = "64 MiB" 23 | 24 | [cache] 25 | block_cache_capacity = "512 MiB" 26 | meta_cache_capacity = "256 MiB" 27 | 28 | [raft_log_store] 29 | log_dir_path = "/path/to/log/dir" 30 | log_file_capacity = "64 MiB" 31 | block_cache_capacity = "256 MiB" 32 | persist = "sync" 33 | 34 | [lsm_tree] 35 | l1_capacity = "1 MiB" 36 | level_multiplier = 10 37 | 38 | trigger_l0_compaction_ssts = 4 39 | trigger_l0_compaction_interval = "1 s" 40 | trigger_compaction_interval = "5 s" 41 | 42 | sstable_capacity = "64 KiB" 43 | block_capacity = "4 KiB" 44 | restart_interval = 2 45 | bloom_false_positive = 0.1 46 | 47 | compaction_pin_ttl = "15 s" 48 | 49 | [[lsm_tree.levels_options]] 50 | compaction_strategy = "Overlap" 51 | compression_algorithm = "None" 52 | 53 | [[lsm_tree.levels_options]] 54 | compaction_strategy = "NonOverlap" 55 | compression_algorithm = "None" 56 | 57 | [[lsm_tree.levels_options]] 58 | compaction_strategy = "NonOverlap" 59 | compression_algorithm = "None" 60 | 61 | [[lsm_tree.levels_options]] 62 | compaction_strategy = "NonOverlap" 63 | compression_algorithm = "None" 64 | 65 | [[lsm_tree.levels_options]] 66 | compaction_strategy = "NonOverlap" 67 | compression_algorithm = "Lz4" 68 | 69 | [[lsm_tree.levels_options]] 70 | compaction_strategy = "NonOverlap" 71 | compression_algorithm = "Lz4" 72 | 73 | [[lsm_tree.levels_options]] 74 | compaction_strategy = "NonOverlap" 75 | compression_algorithm = "Lz4" 76 | 77 | [tiered_cache] 78 | type = "FileCache" 79 | [tiered_cache.args] 80 | dir = "/path/to/file/cache/dir" 81 | capacity = "256 MiB" 82 | total_buffer_capacity = "64 MiB" 83 | cache_file_fallocate_unit = "64 MiB" 84 | cache_meta_fallocate_unit = "16 MiB" 85 | cache_file_max_write_size = "2 MiB" -------------------------------------------------------------------------------- /exhauster/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "runkv-exhauster" 3 | version = "0.1.0" 4 | edition = "2021" 5 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html 6 | 7 | [dependencies] 8 | anyhow = "1.0" 9 | async-trait = "0.1" 10 | bytes = "1" 11 | bytesize = { version = "1.1.0", features = ["serde"] } 12 | clap = { version = "3.1.6", features = ["derive"] } 13 | humantime = "2.1.0" 14 | humantime-serde = "1.1.1" 15 | itertools = "0.10.3" 16 | parking_lot = "0.12" 17 | prost = "0.9" 18 | runkv-common = { path = "../common" } 19 | runkv-proto = { path = "../proto" } 20 | runkv-storage = { path = "../storage" } 21 | serde = "1.0" 22 | serde_derive = "1.0" 23 | thiserror = "1.0" 24 | tokio = { version = "1", features = [ 25 | "rt-multi-thread", 26 | "sync", 27 | "macros", 28 | "time", 29 | ] } 30 | toml = "0.4.2" 31 | tonic = "0.6.2" 32 | tracing = "0.1" 33 | tracing-subscriber = "0.3" 34 | 35 | [target.'cfg(not(target_env = "msvc"))'.dependencies] 36 | tikv-jemallocator = "0.4.3" 37 | 38 | [dev-dependencies] 39 | env_logger = "*" 40 | test-log = "0.2.10" 41 | 42 | [features] 43 | verbose-release-log = 
["tracing/release_max_level_trace"] 44 | -------------------------------------------------------------------------------- /exhauster/src/compaction_filter.rs: -------------------------------------------------------------------------------- 1 | use bytes::Bytes; 2 | 3 | pub trait CompactionFilter { 4 | /// Keep the key value pair if `filter` returns true. 5 | fn filter(&mut self, key: &[u8], value: Option<&[u8]>, sequence: u64) -> bool; 6 | } 7 | 8 | pub struct DefaultCompactionFilter { 9 | last_key: Bytes, 10 | watermark: u64, 11 | _remove_tombstone: bool, 12 | } 13 | 14 | impl DefaultCompactionFilter { 15 | pub fn new(watermark: u64, remove_tombstone: bool) -> Self { 16 | Self { 17 | last_key: Bytes::default(), 18 | watermark, 19 | _remove_tombstone: remove_tombstone, 20 | } 21 | } 22 | } 23 | 24 | impl CompactionFilter for DefaultCompactionFilter { 25 | fn filter(&mut self, key: &[u8], _value: Option<&[u8]>, sequence: u64) -> bool { 26 | let mut retain = true; 27 | // TODO: Handle `remove_tombstone`. 28 | if key == self.last_key && sequence < self.watermark { 29 | retain = false; 30 | } 31 | self.last_key = Bytes::copy_from_slice(key); 32 | retain 33 | } 34 | } 35 | 36 | #[cfg(test)] 37 | mod tests { 38 | 39 | use test_log::test; 40 | 41 | use super::*; 42 | 43 | #[test] 44 | fn test_default_compaction_filter() { 45 | #[allow(clippy::type_complexity)] 46 | let dataset: Vec<(&[u8], Option<&[u8]>, u64, bool)> = vec![ 47 | (b"k1", Some(b"v1-20"), 20, true), 48 | (b"k1", Some(b"v1-10"), 10, true), 49 | (b"k1", Some(b"v1-1"), 1, false), 50 | (b"k2", None, 1, true), 51 | (b"k3", Some(b"v3-100"), 100, true), 52 | (b"k3", None, 15, true), 53 | (b"k3", None, 8, false), 54 | (b"k3", Some(b"v3-100"), 100, true), 55 | (b"k4", None, 100, true), 56 | (b"k4", Some(b"v4-20"), 20, true), 57 | (b"k4", Some(b"v4-8"), 8, false), 58 | (b"k4", None, 1, false), 59 | ]; 60 | let mut filter = DefaultCompactionFilter::new(10, false); 61 | for data in dataset { 62 | assert_eq!(filter.filter(data.0, data.1, data.2), data.3) 63 | } 64 | } 65 | } 66 | -------------------------------------------------------------------------------- /exhauster/src/config.rs: -------------------------------------------------------------------------------- 1 | use runkv_common::config::{CacheConfig, MinioConfig, Node, S3Config}; 2 | use serde::Deserialize; 3 | 4 | #[derive(Deserialize, Clone, Debug)] 5 | pub struct ExhausterConfig { 6 | pub id: u64, 7 | pub host: String, 8 | pub port: u16, 9 | pub data_path: String, 10 | pub meta_path: String, 11 | pub heartbeat_interval: String, 12 | pub rudder: Node, 13 | pub s3: Option, 14 | pub minio: Option, 15 | pub cache: CacheConfig, 16 | } 17 | -------------------------------------------------------------------------------- /exhauster/src/error.rs: -------------------------------------------------------------------------------- 1 | pub type Result = std::result::Result; 2 | 3 | pub fn err(e: impl Into>) -> anyhow::Error { 4 | anyhow::anyhow!("error: {}", e.into()) 5 | } 6 | 7 | pub fn config_err(e: impl Into>) -> anyhow::Error { 8 | anyhow::anyhow!("config error: {}", e.into()) 9 | } 10 | -------------------------------------------------------------------------------- /exhauster/src/lib.rs: -------------------------------------------------------------------------------- 1 | #![allow(clippy::box_default)] 2 | 3 | pub mod compaction_filter; 4 | pub mod config; 5 | pub mod error; 6 | pub mod partitioner; 7 | pub mod service; 8 | pub mod worker; 9 | 10 | use std::sync::Arc; 11 | 12 | use 
bytesize::ByteSize; 13 | use config::ExhausterConfig; 14 | use error::{config_err, err, Result}; 15 | use runkv_common::channel_pool::ChannelPool; 16 | use runkv_common::BoxedWorker; 17 | use runkv_proto::common::Endpoint as PbEndpoint; 18 | use runkv_proto::exhauster::exhauster_service_server::ExhausterServiceServer; 19 | use runkv_storage::components::{ 20 | BlockCache, LsmTreeMetrics, LsmTreeMetricsRef, SstableStore, SstableStoreOptions, 21 | SstableStoreRef, 22 | }; 23 | use runkv_storage::tiered_cache::TieredCache; 24 | use runkv_storage::{MemObjectStore, ObjectStoreRef, S3ObjectStore}; 25 | use service::{Exhauster, ExhausterOptions}; 26 | use tonic::transport::Server; 27 | use tracing::info; 28 | use worker::heartbeater::{Heartbeater, HeartbeaterOptions}; 29 | 30 | pub async fn bootstrap_exhauster( 31 | config: &ExhausterConfig, 32 | exhauster: Exhauster, 33 | workers: Vec, 34 | ) -> Result<()> { 35 | let addr_str = format!("{}:{}", config.host, config.port); 36 | 37 | for mut worker in workers.into_iter() { 38 | tokio::spawn(async move { worker.run().await }); 39 | } 40 | 41 | Server::builder() 42 | .add_service(ExhausterServiceServer::new(exhauster)) 43 | .serve(addr_str.parse().map_err(config_err)?) 44 | .await 45 | .map_err(err) 46 | } 47 | 48 | pub async fn build_exhauster(config: &ExhausterConfig) -> Result<(Exhauster, Vec)> { 49 | let object_store = build_object_store(config).await; 50 | build_exhauster_with_object_store(config, object_store).await 51 | } 52 | 53 | pub async fn build_exhauster_with_object_store( 54 | config: &ExhausterConfig, 55 | object_store: ObjectStoreRef, 56 | ) -> Result<(Exhauster, Vec)> { 57 | let lsm_tree_metrics = Arc::new(LsmTreeMetrics::new(config.id)); 58 | 59 | let sstable_store = build_sstable_store(config, object_store, lsm_tree_metrics)?; 60 | 61 | let options = ExhausterOptions { 62 | node: config.id, 63 | sstable_store, 64 | // TODO: Restore from persistent store. 65 | sstable_sequential_id: 1, 66 | }; 67 | 68 | let channel_pool = build_channel_pool(config); 69 | 70 | let heartbeater_options = HeartbeaterOptions { 71 | node_id: config.id, 72 | endpoint: PbEndpoint { 73 | host: config.host.clone(), 74 | port: config.port as u32, 75 | }, 76 | channel_pool, 77 | rudder_node_id: config.rudder.id, 78 | heartbeat_interval: config 79 | .heartbeat_interval 80 | .parse::()? 
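// `heartbeat_interval` is a human-readable duration string such as "1 s" (see etc/exhauster.toml).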
81 | .into(), 82 | }; 83 | let heartbeater = Box::new(Heartbeater::new(heartbeater_options)); 84 | 85 | let exhauster = Exhauster::new(options); 86 | 87 | Ok((exhauster, vec![heartbeater])) 88 | } 89 | 90 | async fn build_object_store(config: &ExhausterConfig) -> ObjectStoreRef { 91 | if let Some(c) = &config.s3 { 92 | info!("s3 config found, create s3 object store"); 93 | Arc::new(S3ObjectStore::new(c.bucket.clone()).await) 94 | } else if let Some(c) = &config.minio { 95 | info!("minio config found, create minio object store"); 96 | Arc::new(S3ObjectStore::new_with_minio(&c.url).await) 97 | } else { 98 | info!("no object store config found, create default memory object store"); 99 | Arc::new(MemObjectStore::default()) 100 | } 101 | } 102 | 103 | fn build_sstable_store( 104 | config: &ExhausterConfig, 105 | object_store: ObjectStoreRef, 106 | metrics: LsmTreeMetricsRef, 107 | ) -> Result { 108 | let block_cache = BlockCache::new(0, metrics); 109 | let sstable_store_options = SstableStoreOptions { 110 | path: config.data_path.clone(), 111 | object_store, 112 | block_cache, 113 | meta_cache_capacity: config 114 | .cache 115 | .meta_cache_capacity 116 | .parse::() 117 | .map_err(config_err)? 118 | .0 as usize, 119 | tiered_cache: TieredCache::none(), 120 | }; 121 | let sstable_store = SstableStore::new(sstable_store_options); 122 | Ok(Arc::new(sstable_store)) 123 | } 124 | 125 | fn build_channel_pool(config: &ExhausterConfig) -> ChannelPool { 126 | ChannelPool::with_nodes(vec![config.rudder.clone()]) 127 | } 128 | -------------------------------------------------------------------------------- /exhauster/src/main.rs: -------------------------------------------------------------------------------- 1 | #[cfg(not(target_env = "msvc"))] 2 | use tikv_jemallocator::Jemalloc; 3 | 4 | #[cfg(not(target_env = "msvc"))] 5 | #[global_allocator] 6 | static GLOBAL: Jemalloc = Jemalloc; 7 | 8 | use std::fs::read_to_string; 9 | 10 | use clap::Parser; 11 | use runkv_exhauster::config::ExhausterConfig; 12 | use runkv_exhauster::error::{config_err, Result}; 13 | use runkv_exhauster::{bootstrap_exhauster, build_exhauster}; 14 | use tracing::info; 15 | use tracing_subscriber::FmtSubscriber; 16 | 17 | #[derive(Parser, Debug)] 18 | struct Args { 19 | #[clap(short, long, default_value = "etc/exhauster.toml")] 20 | config_file_path: String, 21 | } 22 | 23 | #[tokio::main] 24 | async fn main() -> Result<()> { 25 | let subscriber = FmtSubscriber::new(); 26 | tracing::subscriber::set_global_default(subscriber)?; 27 | 28 | let args = Args::parse(); 29 | info!("args: {:?}", args); 30 | 31 | let config: ExhausterConfig = 32 | toml::from_str(&read_to_string(&args.config_file_path)?).map_err(config_err)?; 33 | info!("config: {:?}", config); 34 | 35 | let (exhauster, workers) = build_exhauster(&config).await?; 36 | bootstrap_exhauster(&config, exhauster, workers).await 37 | } 38 | -------------------------------------------------------------------------------- /exhauster/src/partitioner.rs: -------------------------------------------------------------------------------- 1 | use bytes::Bytes; 2 | 3 | pub trait Partitioner: Send + Sync + 'static { 4 | /// Finish building current sstable if returns true. 
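/// (For example, `DefaultPartitioner` below returns `true` the first time a key reaches each pre-sorted partition point.)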
5 | fn partition(&mut self, key: &[u8], value: Option<&[u8]>, sequence: u64) -> bool; 6 | } 7 | 8 | pub type BoxedPartitioner = Box; 9 | 10 | pub struct DefaultPartitioner { 11 | partition_points: Vec, 12 | offset: usize, 13 | } 14 | 15 | impl DefaultPartitioner { 16 | pub fn new(mut partition_points: Vec) -> Self { 17 | partition_points.sort(); 18 | Self { 19 | partition_points, 20 | offset: 0, 21 | } 22 | } 23 | } 24 | 25 | impl Partitioner for DefaultPartitioner { 26 | fn partition(&mut self, key: &[u8], _value: Option<&[u8]>, _sequence: u64) -> bool { 27 | if self.offset >= self.partition_points.len() { 28 | return false; 29 | } 30 | if key >= self.partition_points[self.offset] { 31 | self.offset += 1; 32 | return true; 33 | } 34 | false 35 | } 36 | } 37 | 38 | #[derive(Default)] 39 | pub struct NoPartitioner; 40 | 41 | impl Partitioner for NoPartitioner { 42 | fn partition(&mut self, _key: &[u8], _value: Option<&[u8]>, _sequence: u64) -> bool { 43 | false 44 | } 45 | } 46 | -------------------------------------------------------------------------------- /exhauster/src/worker/heartbeater.rs: -------------------------------------------------------------------------------- 1 | use std::time::Duration; 2 | 3 | use async_trait::async_trait; 4 | use runkv_common::channel_pool::ChannelPool; 5 | use runkv_common::Worker; 6 | use runkv_proto::common::Endpoint as PbEndpoint; 7 | use runkv_proto::rudder::rudder_service_client::RudderServiceClient; 8 | use runkv_proto::rudder::{heartbeat_request, ExhausterHeartbeatRequest, HeartbeatRequest}; 9 | use tonic::Request; 10 | use tracing::warn; 11 | 12 | use crate::error::{err, Result}; 13 | 14 | pub struct HeartbeaterOptions { 15 | pub node_id: u64, 16 | pub endpoint: PbEndpoint, 17 | pub channel_pool: ChannelPool, 18 | pub rudder_node_id: u64, 19 | pub heartbeat_interval: Duration, 20 | } 21 | 22 | pub struct Heartbeater { 23 | node_id: u64, 24 | endpoint: PbEndpoint, 25 | channel_pool: ChannelPool, 26 | rudder_node_id: u64, 27 | heartbeat_interval: Duration, 28 | } 29 | 30 | impl Heartbeater { 31 | pub fn new(options: HeartbeaterOptions) -> Self { 32 | Self { 33 | node_id: options.node_id, 34 | endpoint: options.endpoint, 35 | channel_pool: options.channel_pool, 36 | rudder_node_id: options.rudder_node_id, 37 | heartbeat_interval: options.heartbeat_interval, 38 | } 39 | } 40 | 41 | async fn run_inner(&mut self) -> Result<()> { 42 | tokio::time::sleep(self.heartbeat_interval).await; 43 | let req = HeartbeatRequest { 44 | node_id: self.node_id, 45 | endpoint: Some(self.endpoint.clone()), 46 | heartbeat_message: Some(heartbeat_request::HeartbeatMessage::ExhausterHeartbeat( 47 | ExhausterHeartbeatRequest {}, 48 | )), 49 | }; 50 | let request = Request::new(req); 51 | let mut client = RudderServiceClient::new( 52 | self.channel_pool 53 | .get(self.rudder_node_id) 54 | .await 55 | .map_err(err)?, 56 | ); 57 | let _rsp = client.heartbeat(request).await?.into_inner(); 58 | Ok(()) 59 | } 60 | } 61 | 62 | #[async_trait] 63 | impl Worker for Heartbeater { 64 | async fn run(&mut self) -> anyhow::Result<()> { 65 | // TODO: Gracefully kill. 
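// One possible shape (a sketch only, not the current implementation):
//
//     tokio::select! {
//         _ = &mut shutdown_rx => break,
//         result = self.run_inner() => { /* log errors as below */ }
//     }
//
// where `shutdown_rx` would be a hypothetical shutdown receiver passed in via `HeartbeaterOptions`.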
66 | loop { 67 | match self.run_inner().await { 68 | Ok(_) => {} 69 | Err(e) => { 70 | warn!("error occur when heartbeater running: {}", e); 71 | } 72 | } 73 | } 74 | } 75 | } 76 | -------------------------------------------------------------------------------- /exhauster/src/worker/mod.rs: -------------------------------------------------------------------------------- 1 | pub mod heartbeater; 2 | -------------------------------------------------------------------------------- /make/common.toml: -------------------------------------------------------------------------------- 1 | [env] 2 | OS = { source = "${CARGO_MAKE_RUST_TARGET_OS}", mapping = { linux = "linux", macos = "darwin" } } 3 | ARCH = { source = "${CARGO_MAKE_RUST_TARGET_ARCH}", mapping = { x86_64 = "amd64", aarch64 = "arm64" } } 4 | SYSTEM = "${OS}-${ARCH}" 5 | SYSTEM_AMD64 = "${OS}-amd64" # some components do not support darwin-arm64 for now, use amd64 for fallback 6 | PREFIX = "${PWD}/.run" 7 | PREFIX_USR_BIN = "${PWD}/.bin" 8 | PREFIX_BIN = "${PREFIX}/bin" 9 | PREFIX_CONFIG = "${PREFIX}/config" 10 | PREFIX_DATA = "${PREFIX}/data" 11 | PREFIX_LOG = "${PREFIX}/log" 12 | PREFIX_TMP = "${PREFIX}/tmp" 13 | 14 | [tasks.prepare] 15 | private = true 16 | category = "Misc" 17 | description = "Create .run folder for temporyary files and data." 18 | script = ''' 19 | #!@duckscript 20 | echo "Using ${PREFIX} as base folder." 21 | mkdir "${PREFIX}" 22 | mkdir "${PREFIX_BIN}" 23 | mkdir "${PREFIX_TMP}" 24 | mkdir "${PREFIX_DATA}" 25 | mkdir "${PREFIX_LOG}" 26 | ''' 27 | -------------------------------------------------------------------------------- /make/grafana.toml: -------------------------------------------------------------------------------- 1 | extend = "common.toml" 2 | 3 | [env] 4 | GRAFANA_SYSTEM = "${SYSTEM_AMD64}" 5 | GRAFANA_DOWNLOAD_PATH = "${PREFIX_TMP}/grafana.tar.gz" 6 | GRAFANA_VERSION = "8.5.1" 7 | GRAFANA_RELEASE = "grafana-${GRAFANA_VERSION}" 8 | GRAFANA_DOWNLOAD_TAR_GZ = "https://dl.grafana.com/oss/release/${GRAFANA_RELEASE}.${GRAFANA_SYSTEM}.tar.gz" 9 | 10 | [tasks.download-grafana] 11 | category = "Grafana" 12 | dependencies = ["prepare"] 13 | description = "Download and extract Grafana" 14 | script = ''' 15 | #!/bin/bash 16 | set -e 17 | if [ -d "${PREFIX_BIN}/grafana" ]; then 18 | exit 0 19 | fi 20 | echo "Grafana Server not found, downloading" 21 | curl -fL -o "${GRAFANA_DOWNLOAD_PATH}" "${GRAFANA_DOWNLOAD_TAR_GZ}" 22 | tar -xf "${GRAFANA_DOWNLOAD_PATH}" -C "${PREFIX_TMP}" 23 | mv "${PREFIX_TMP}/${GRAFANA_RELEASE}" "${PREFIX_BIN}/grafana" 24 | echo "grafana download success" 25 | ''' 26 | -------------------------------------------------------------------------------- /make/jaeger.toml: -------------------------------------------------------------------------------- 1 | extend = "common.toml" 2 | 3 | [env] 4 | JAEGER_SYSTEM = "${SYSTEM}" 5 | JAEGER_DOWNLOAD_TAR = "https://github.com/jaegertracing/jaeger/releases/download/v1.33.0/jaeger-1.33.0-${JAEGER_SYSTEM}.tar.gz" 6 | 7 | [tasks.download-jaeger] 8 | category = "Tracing" 9 | dependencies = ["prepare"] 10 | description = "Download and extract Jaeger." 
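# Like the MinIO and Prometheus tasks, this is a no-op if the binary already exists under ${PREFIX_BIN}; otherwise it downloads into ${PREFIX_TMP} and moves the binary into place.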
11 | script = ''' 12 | #!/bin/bash 13 | set -e 14 | if [ -f "${PREFIX_BIN}/jaeger" ]; then 15 | exit 0 16 | fi 17 | echo "Jaeger not found, downloading" 18 | curl -fL -o "${PREFIX_TMP}/jaeger.tar.gz" "${JAEGER_DOWNLOAD_TAR}" 19 | tar -C "${PREFIX_TMP}/" -xzf "${PREFIX_TMP}/jaeger.tar.gz" 20 | chmod +x "${PREFIX_TMP}/jaeger-1.33.0-${JAEGER_SYSTEM}/jaeger-all-in-one" 21 | mv "${PREFIX_TMP}/jaeger-1.33.0-${JAEGER_SYSTEM}/jaeger-all-in-one" "${PREFIX_BIN}/jaeger" 22 | 23 | "${PREFIX_BIN}/jaeger" version 24 | ''' 25 | -------------------------------------------------------------------------------- /make/minio.toml: -------------------------------------------------------------------------------- 1 | extend = "common.toml" 2 | 3 | [env] 4 | MINIO_SYSTEM = "${SYSTEM}" 5 | MCLI_DOWNLOAD_BIN = "https://dl.min.io/client/mc/release/${MINIO_SYSTEM}/mc" 6 | MINIO_DOWNLOAD_BIN = "https://dl.min.io/server/minio/release/${MINIO_SYSTEM}/minio" 7 | 8 | [tasks.download-minio] 9 | category = "MinIO" 10 | dependencies = ["prepare"] 11 | description = "Download and extract MinIO." 12 | script = ''' 13 | #!/bin/bash 14 | set -e 15 | if [ -f "${PREFIX_BIN}/minio" ]; then 16 | exit 0 17 | fi 18 | echo "MinIO Server not found, downloading" 19 | curl -fL -o "${PREFIX_TMP}/minio" "${MINIO_DOWNLOAD_BIN}" 20 | chmod +x "${PREFIX_TMP}/minio" 21 | mv "${PREFIX_TMP}/minio" "${PREFIX_BIN}/minio" 22 | 23 | "${PREFIX_BIN}/minio" --version 24 | ''' 25 | 26 | [tasks.download-mcli] 27 | category = "MinIO" 28 | dependencies = ["prepare"] 29 | description = "Download and extract MinIO Client." 30 | script = ''' 31 | #!/bin/bash 32 | set -e 33 | if [ -f "${PREFIX_BIN}/mcli" ]; then 34 | exit 0 35 | fi 36 | echo "MinIO Client not found, downloading" 37 | curl -fL -o "${PREFIX_TMP}/mcli" "${MCLI_DOWNLOAD_BIN}" 38 | chmod +x "${PREFIX_TMP}/mcli" 39 | mv "${PREFIX_TMP}/mcli" "${PREFIX_BIN}/mcli" 40 | 41 | "${PREFIX_BIN}/mcli" --version 42 | ''' 43 | -------------------------------------------------------------------------------- /make/prometheus.toml: -------------------------------------------------------------------------------- 1 | extend = "common.toml" 2 | 3 | [env] 4 | PROMETHEUS_SYSTEM = "${SYSTEM}" 5 | PROMETHEUS_DOWNLOAD_PATH = "${PREFIX_TMP}/prometheus.tar.gz" 6 | PROMETHEUS_VERSION = "2.32.1" 7 | PROMETHEUS_RELEASE = "prometheus-${PROMETHEUS_VERSION}.${PROMETHEUS_SYSTEM}" 8 | PROMETHEUS_DOWNLOAD_TAR_GZ = "https://github.com/prometheus/prometheus/releases/download/v${PROMETHEUS_VERSION}/${PROMETHEUS_RELEASE}.tar.gz" 9 | 10 | [tasks.download-prometheus] 11 | category = "Metrics" 12 | dependencies = ["prepare"] 13 | description = "Download and extract Prometheus" 14 | script = ''' 15 | #!/bin/bash 16 | set -e 17 | if [ -d "${PREFIX_BIN}/prometheus" ]; then 18 | exit 0 19 | fi 20 | echo "Prometheus not found, downloading" 21 | curl -fL -o "${PROMETHEUS_DOWNLOAD_PATH}" "${PROMETHEUS_DOWNLOAD_TAR_GZ}" 22 | tar -xf "${PROMETHEUS_DOWNLOAD_PATH}" -C "${PREFIX_TMP}" 23 | mv "${PREFIX_TMP}/${PROMETHEUS_RELEASE}" "${PREFIX_BIN}/prometheus" 24 | ''' 25 | -------------------------------------------------------------------------------- /proto/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "runkv-proto" 3 | version = "0.1.0" 4 | edition = "2021" 5 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html 6 | 7 | [dependencies] 8 | anyhow = "1.0" 9 | bytes = "1" 10 | prost = "0.9" 11 | runkv-common = { path = "../common" } 12 | 
serde = "1.0" 13 | serde_derive = "1.0" 14 | tonic = "0.6.2" 15 | 16 | [build-dependencies] 17 | prost-build = "0.9" 18 | tonic-build = "0.6.2" 19 | -------------------------------------------------------------------------------- /proto/build.rs: -------------------------------------------------------------------------------- 1 | fn main() { 2 | tonic_build::configure() 3 | .type_attribute(".", "#[derive(serde::Serialize, serde::Deserialize)]") 4 | .compile( 5 | &[ 6 | "src/proto/common.proto", 7 | "src/proto/manifest.proto", 8 | "src/proto/meta.proto", 9 | "src/proto/rudder.proto", 10 | "src/proto/wheel.proto", 11 | "src/proto/exhauster.proto", 12 | "src/proto/kv.proto", 13 | ], 14 | &["src/proto"], 15 | ) 16 | .unwrap() 17 | } 18 | -------------------------------------------------------------------------------- /proto/src/lib.rs: -------------------------------------------------------------------------------- 1 | pub mod common { 2 | #![allow(clippy::all)] 3 | tonic::include_proto!("common"); 4 | } 5 | 6 | pub mod manifest { 7 | #![allow(clippy::all)] 8 | tonic::include_proto!("manifest"); 9 | } 10 | 11 | pub mod meta { 12 | #![allow(clippy::all)] 13 | tonic::include_proto!("meta"); 14 | 15 | impl Eq for KeyRange {} 16 | 17 | impl PartialOrd for KeyRange { 18 | fn partial_cmp(&self, other: &Self) -> Option { 19 | Some(self.cmp(&other)) 20 | } 21 | } 22 | 23 | impl Ord for KeyRange { 24 | fn cmp(&self, other: &Self) -> std::cmp::Ordering { 25 | self.start_key.cmp(&other.start_key) 26 | } 27 | } 28 | } 29 | 30 | pub mod rudder { 31 | #![allow(clippy::all)] 32 | tonic::include_proto!("rudder"); 33 | } 34 | 35 | pub mod wheel { 36 | #![allow(clippy::all)] 37 | tonic::include_proto!("wheel"); 38 | } 39 | 40 | pub mod exhauster { 41 | #![allow(clippy::all)] 42 | tonic::include_proto!("exhauster"); 43 | } 44 | 45 | pub mod kv { 46 | #![allow(clippy::all)] 47 | tonic::include_proto!("kv"); 48 | 49 | use runkv_common::coding::BytesSerde; 50 | 51 | impl<'de> BytesSerde<'de> for KvRequest {} 52 | impl<'de> BytesSerde<'de> for KvResponse {} 53 | 54 | impl KvRequest { 55 | pub fn r#type(&self) -> Type { 56 | let mut r#type = Type::TNone; 57 | for op in self.ops.iter() { 58 | match op.r#type() { 59 | OpType::None => {} 60 | OpType::Get => match r#type { 61 | Type::TNone => r#type = Type::TGet, 62 | Type::TGet | Type::TTxn => {} 63 | _ => r#type = Type::TTxn, 64 | }, 65 | OpType::Put => match r#type { 66 | Type::TNone => r#type = Type::TPut, 67 | Type::TPut | Type::TTxn => {} 68 | _ => r#type = Type::TTxn, 69 | }, 70 | OpType::Delete => match r#type { 71 | Type::TNone => r#type = Type::TDelete, 72 | Type::TDelete | Type::TTxn => {} 73 | _ => r#type = Type::TTxn, 74 | }, 75 | OpType::Snapshot => match r#type { 76 | Type::TNone => r#type = Type::TSnapshot, 77 | Type::TSnapshot | Type::TTxn => {} 78 | _ => r#type = Type::TTxn, 79 | }, 80 | } 81 | } 82 | r#type 83 | } 84 | 85 | pub fn is_read_only(&self) -> bool { 86 | for op in self.ops.iter() { 87 | match op.r#type() { 88 | OpType::None | OpType::Get | OpType::Snapshot => {} 89 | OpType::Put | OpType::Delete => return false, 90 | } 91 | } 92 | true 93 | } 94 | } 95 | } 96 | -------------------------------------------------------------------------------- /proto/src/proto/buf.yaml: -------------------------------------------------------------------------------- 1 | version: v1 2 | lint: 3 | use: 4 | - DEFAULT 5 | except: 6 | - ENUM_VALUE_PREFIX 7 | - ENUM_ZERO_VALUE_SUFFIX 8 | - PACKAGE_VERSION_SUFFIX 9 | 10 | # We guarantee that every file is one package. 
So this check isn't necessary. 11 | - DIRECTORY_SAME_PACKAGE 12 | - PACKAGE_DIRECTORY_MATCH 13 | -------------------------------------------------------------------------------- /proto/src/proto/common.proto: -------------------------------------------------------------------------------- 1 | syntax = "proto3"; 2 | 3 | package common; 4 | 5 | message Endpoint { 6 | string host = 1; 7 | uint32 port = 2; 8 | } 9 | -------------------------------------------------------------------------------- /proto/src/proto/exhauster.proto: -------------------------------------------------------------------------------- 1 | syntax = "proto3"; 2 | 3 | package exhauster; 4 | 5 | import "manifest.proto"; 6 | 7 | message CompactionRequest { 8 | repeated uint64 sst_ids = 1; 9 | uint64 watermark = 2; 10 | uint64 sstable_capacity = 3; 11 | uint64 block_capacity = 4; 12 | uint64 restart_interval = 5; 13 | double bloom_false_positive = 6; 14 | uint64 compression_algorithm = 7; 15 | bool remove_tombstone = 8; 16 | repeated bytes partition_points = 9; 17 | } 18 | 19 | message CompactionResponse { 20 | repeated manifest.SstableInfo old_sst_infos = 1; 21 | repeated manifest.SstableInfo new_sst_infos = 2; 22 | } 23 | 24 | service ExhausterService { 25 | rpc Compaction(CompactionRequest) returns (CompactionResponse); 26 | } 27 | -------------------------------------------------------------------------------- /proto/src/proto/kv.proto: -------------------------------------------------------------------------------- 1 | syntax = "proto3"; 2 | 3 | package kv; 4 | 5 | enum ErrCode { 6 | OK = 0; 7 | REDIRECT = 1; 8 | } 9 | 10 | enum OpType { 11 | NONE = 0; 12 | GET = 1; // (key[, sequence]) -> (value) 13 | PUT = 2; // (key, value) -> () 14 | DELETE = 3; // (key) -> () 15 | SNAPSHOT = 4; // () -> (sequence) 16 | } 17 | 18 | enum Type { 19 | T_NONE = 0; 20 | T_GET = 1; 21 | T_PUT = 2; 22 | T_DELETE = 3; 23 | T_SNAPSHOT = 4; 24 | T_TXN = 5; 25 | } 26 | 27 | message Op { 28 | OpType type = 1; 29 | bytes key = 2; 30 | bytes value = 3; 31 | uint64 sequence = 4; 32 | } 33 | 34 | message KvRequest { 35 | repeated Op ops = 1; 36 | // target raft node id 37 | uint64 target = 2; 38 | } 39 | 40 | message KvResponse { 41 | repeated Op ops = 1; 42 | ErrCode err = 2; 43 | } 44 | 45 | service KvService { 46 | rpc Kv(KvRequest) returns (KvResponse); 47 | } 48 | -------------------------------------------------------------------------------- /proto/src/proto/manifest.proto: -------------------------------------------------------------------------------- 1 | syntax = "proto3"; 2 | 3 | package manifest; 4 | 5 | enum SstableOp { 6 | INSERT = 0; 7 | DELETE = 1; 8 | } 9 | 10 | message SstableDiff { 11 | uint64 id = 1; 12 | uint64 level = 2; 13 | SstableOp op = 3; 14 | uint64 data_size = 4; 15 | } 16 | 17 | message VersionDiff { 18 | uint64 id = 1; 19 | repeated SstableDiff sstable_diffs = 2; 20 | } 21 | 22 | message SstableInfo { 23 | uint64 id = 1; 24 | uint64 data_size = 2; 25 | } 26 | -------------------------------------------------------------------------------- /proto/src/proto/meta.proto: -------------------------------------------------------------------------------- 1 | syntax = "proto3"; 2 | 3 | package meta; 4 | 5 | import "common.proto"; 6 | 7 | // Assume [`KeyRange`] does not overlaps. 
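// That is, ranges are expected to be disjoint; rudder rejects overlapping ranges (see `ControlError::KeyRangeOverlaps`).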
8 | message KeyRange { 9 | bytes start_key = 1; 10 | bytes end_key = 2; 11 | } 12 | 13 | message KeyRangeInfo { 14 | // raft group id 15 | uint64 group = 1; 16 | // key range 17 | meta.KeyRange key_range = 2; 18 | // { raft node id -> node id } 19 | map raft_nodes = 3; 20 | // leader raft node id 21 | // Used by query router info. 22 | uint64 leader = 4; 23 | } 24 | 25 | message WheelMeta { 26 | uint64 id = 1; 27 | KeyRange key_range = 2; 28 | common.Endpoint endpoint = 3; 29 | } 30 | -------------------------------------------------------------------------------- /proto/src/proto/prototool.yaml: -------------------------------------------------------------------------------- 1 | protoc: 2 | version: 3.17.3 3 | lint: 4 | group: google -------------------------------------------------------------------------------- /proto/src/proto/rudder.proto: -------------------------------------------------------------------------------- 1 | syntax = "proto3"; 2 | 3 | package rudder; 4 | 5 | import "common.proto"; 6 | import "manifest.proto"; 7 | import "meta.proto"; 8 | 9 | message RaftState { 10 | bool is_leader = 1; 11 | } 12 | 13 | message WheelHeartbeatRequest { 14 | uint64 watermark = 1; 15 | uint64 next_version_id = 2; 16 | map raft_states = 3; 17 | } 18 | 19 | message WheelHeartbeatResponse { 20 | repeated manifest.VersionDiff version_diffs = 1; 21 | } 22 | 23 | message ExhausterHeartbeatRequest {} 24 | 25 | message ExhausterHeartbeatResponse {} 26 | 27 | // TODO: Add status report. 28 | message HeartbeatRequest { 29 | uint64 node_id = 1; 30 | common.Endpoint endpoint = 2; 31 | oneof heartbeat_message { 32 | WheelHeartbeatRequest wheel_heartbeat = 3; 33 | ExhausterHeartbeatRequest exhauster_heartbeat = 4; 34 | } 35 | } 36 | 37 | message HeartbeatResponse { 38 | oneof heartbeat_message { 39 | WheelHeartbeatResponse wheel_heartbeat = 1; 40 | ExhausterHeartbeatResponse exhauster_heartbeat = 2; 41 | } 42 | } 43 | 44 | message InsertL0Request { 45 | uint64 node_id = 1; 46 | repeated manifest.SstableInfo sst_infos = 2; 47 | uint64 next_version_id = 3; 48 | } 49 | 50 | message InsertL0Response { 51 | repeated manifest.VersionDiff version_diffs = 1; 52 | } 53 | 54 | message TsoRequest {} 55 | 56 | message TsoResponse { 57 | uint32 timestamp = 1; 58 | } 59 | 60 | service RudderService { 61 | // Called by `wheel` and `exhauster`. 62 | rpc Heartbeat(HeartbeatRequest) returns (HeartbeatResponse); 63 | // Called by `wheel` when sstable uploader finish upload new L0 sstable to S3. 64 | rpc InsertL0(InsertL0Request) returns (InsertL0Response); 65 | // TODO: Implement transaction. 
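// The timestamp oracle below is presumably the building block for that; cf. `timestamp`/`timestamp_fetch_add` on rudder's `MetaStore` trait.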
66 | rpc Tso(TsoRequest) returns (TsoResponse); 67 | } 68 | 69 | // ***** Control Service ***** 70 | 71 | message AddWheelsRequest { 72 | // { node id -> endpoint } 73 | map wheels = 1; 74 | } 75 | 76 | message AddWheelsResponse {} 77 | 78 | message AddKeyRangesRequest { 79 | repeated meta.KeyRangeInfo key_ranges = 1; 80 | } 81 | 82 | message AddKeyRangesResponse {} 83 | 84 | message RouterRequest {} 85 | 86 | message RouterResponse { 87 | repeated meta.KeyRangeInfo key_ranges = 1; 88 | map wheels = 2; 89 | } 90 | 91 | service ControlService { 92 | rpc AddWheels(AddWheelsRequest) returns (AddWheelsResponse); 93 | rpc AddKeyRanges(AddKeyRangesRequest) returns (AddKeyRangesResponse); 94 | rpc Router(RouterRequest) returns (RouterResponse); 95 | } 96 | -------------------------------------------------------------------------------- /proto/src/proto/wheel.proto: -------------------------------------------------------------------------------- 1 | syntax = "proto3"; 2 | 3 | package wheel; 4 | 5 | import "common.proto"; 6 | import "meta.proto"; 7 | 8 | // ***** Inner Service ***** 9 | 10 | message AddWheelsRequest { 11 | // { node id -> endpoint } 12 | map wheels = 1; 13 | } 14 | 15 | message AddWheelsResponse {} 16 | 17 | message AddKeyRangesRequest { 18 | repeated meta.KeyRangeInfo key_ranges = 1; 19 | } 20 | 21 | message AddKeyRangesResponse {} 22 | 23 | service WheelService { 24 | rpc AddWheels(AddWheelsRequest) returns (AddWheelsResponse); 25 | rpc AddKeyRanges(AddKeyRangesRequest) returns (AddKeyRangesResponse); 26 | // TODO: Implement them. 27 | // rpc SyncEndpoints(SyncEndpointsRequest) returns (SyncEndpointsResponse); 28 | // rpc SyncKeyRanges(SyncKeyRangesRequest) returns (SyncKeyRangesResponse); 29 | 30 | } 31 | 32 | // ***** Raft Service ***** 33 | 34 | message RaftRequest { 35 | bytes data = 1; 36 | } 37 | 38 | message RaftResponse {} 39 | 40 | service RaftService { 41 | rpc Raft(RaftRequest) returns (RaftResponse); 42 | } 43 | -------------------------------------------------------------------------------- /rudder/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "runkv-rudder" 3 | version = "0.1.0" 4 | edition = "2021" 5 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html 6 | 7 | [dependencies] 8 | anyhow = "1.0" 9 | async-trait = "0.1" 10 | base64 = "0.13" 11 | bytes = "1" 12 | bytesize = { version = "1.1.0", features = ["serde"] } 13 | clap = { version = "3.1.6", features = ["derive"] } 14 | futures = "0.3" 15 | humantime = "2.1.0" 16 | humantime-serde = "1.1.1" 17 | itertools = "0.10.3" 18 | parking_lot = "0.12" 19 | prost = "0.9" 20 | rand = "0.8.5" 21 | runkv-common = { path = "../common" } 22 | runkv-proto = { path = "../proto" } 23 | runkv-storage = { path = "../storage" } 24 | serde = "1.0" 25 | serde_derive = "1.0" 26 | thiserror = "1.0" 27 | tokio = { version = "1", features = [ 28 | "rt-multi-thread", 29 | "sync", 30 | "macros", 31 | "time", 32 | ] } 33 | toml = "0.4.2" 34 | tonic = "0.6.2" 35 | tracing = "0.1" 36 | tracing-subscriber = "0.3" 37 | 38 | [target.'cfg(not(target_env = "msvc"))'.dependencies] 39 | tikv-jemallocator = "0.4.3" 40 | 41 | [dev-dependencies] 42 | env_logger = "*" 43 | test-log = "0.2.10" 44 | 45 | [features] 46 | verbose-release-log = ["tracing/release_max_level_trace"] 47 | -------------------------------------------------------------------------------- /rudder/src/config.rs: 
-------------------------------------------------------------------------------- 1 | use runkv_common::config::{CacheConfig, LsmTreeConfig, MinioConfig, S3Config}; 2 | use serde::Deserialize; 3 | 4 | #[derive(Deserialize, Clone, Debug)] 5 | pub struct RudderConfig { 6 | pub id: u64, 7 | pub host: String, 8 | pub port: u16, 9 | pub data_path: String, 10 | pub meta_path: String, 11 | pub health_timeout: String, 12 | pub s3: Option, 13 | pub minio: Option, 14 | pub cache: CacheConfig, 15 | pub lsm_tree: LsmTreeConfig, 16 | } 17 | -------------------------------------------------------------------------------- /rudder/src/error.rs: -------------------------------------------------------------------------------- 1 | use runkv_proto::common::Endpoint; 2 | use runkv_proto::meta::KeyRange; 3 | use tonic::Status; 4 | 5 | #[derive(thiserror::Error, Debug)] 6 | pub enum Error { 7 | #[error("storage error: {0}")] 8 | StorageError(#[from] runkv_storage::Error), 9 | #[error("invalid watermark: [current: {0}] [new: {1}]")] 10 | InvalidWatermark(u64, u64), 11 | #[error("transport error: {0}")] 12 | TransportError(#[from] tonic::transport::Error), 13 | #[error("rpc status error: {0}")] 14 | RpcStatus(#[from] Status), 15 | #[error("config error: {0}")] 16 | ConfigError(String), 17 | #[error("control error: {0}")] 18 | ControlError(#[from] ControlError), 19 | #[error("other: {0}")] 20 | Other(String), 21 | } 22 | 23 | impl Error { 24 | pub fn err(e: impl Into>) -> Error { 25 | Error::Other(e.into().to_string()) 26 | } 27 | 28 | pub fn config_err(e: impl Into>) -> Error { 29 | Error::ConfigError(e.into().to_string()) 30 | } 31 | } 32 | 33 | #[derive(thiserror::Error, Debug)] 34 | pub enum ControlError { 35 | #[error("node already exists: [node: {node}] [origin endpoint: {origin:?}] [given endpoint: {given:?}]")] 36 | NodeAlreadyExists { 37 | node: u64, 38 | origin: Endpoint, 39 | given: Endpoint, 40 | }, 41 | #[error("node not exists: {0}")] 42 | NodeNotExists(u64), 43 | #[error("group already exists: {0}")] 44 | GroupAlreadyExists(u64), 45 | #[error("group not exists: {0}")] 46 | GroupNotExists(u64), 47 | #[error("raft node already exists: {0}")] 48 | RaftNodeAlreadyExists(u64), 49 | #[error("raft node not exists: {0}")] 50 | RaftNodeNotExists(u64), 51 | #[error("key range overlaps: [{0:?}] [{1:?}]")] 52 | KeyRangeOverlaps(KeyRange, KeyRange), 53 | } 54 | 55 | pub type Result = std::result::Result; 56 | -------------------------------------------------------------------------------- /rudder/src/main.rs: -------------------------------------------------------------------------------- 1 | #[cfg(not(target_env = "msvc"))] 2 | use tikv_jemallocator::Jemalloc; 3 | 4 | #[cfg(not(target_env = "msvc"))] 5 | #[global_allocator] 6 | static GLOBAL: Jemalloc = Jemalloc; 7 | 8 | use std::fs::read_to_string; 9 | 10 | use clap::Parser; 11 | use runkv_rudder::config::RudderConfig; 12 | use runkv_rudder::error::{Error, Result}; 13 | use runkv_rudder::{bootstrap_rudder, build_rudder}; 14 | use tracing::info; 15 | use tracing_subscriber::FmtSubscriber; 16 | 17 | #[derive(Parser, Debug)] 18 | struct Args { 19 | #[clap(short, long, default_value = "etc/rudder.toml")] 20 | config_file_path: String, 21 | } 22 | 23 | #[tokio::main] 24 | async fn main() -> Result<()> { 25 | let subscriber = FmtSubscriber::new(); 26 | tracing::subscriber::set_global_default(subscriber).map_err(Error::err)?; 27 | 28 | let args = Args::parse(); 29 | info!("args: {:?}", args); 30 | 31 | let config: RudderConfig = 32 | 
toml::from_str(&read_to_string(&args.config_file_path).map_err(Error::err)?) 33 | .map_err(Error::config_err)?; 34 | info!("config: {:?}", config); 35 | 36 | let (rudder, workers) = build_rudder(&config).await?; 37 | bootstrap_rudder(&config, rudder, workers).await 38 | } 39 | -------------------------------------------------------------------------------- /rudder/src/meta/mod.rs: -------------------------------------------------------------------------------- 1 | use std::collections::{BTreeMap, HashMap}; 2 | use std::sync::Arc; 3 | use std::time::{Duration, SystemTime}; 4 | 5 | use async_trait::async_trait; 6 | use runkv_proto::common::Endpoint; 7 | use runkv_proto::meta::{KeyRange, KeyRangeInfo}; 8 | use runkv_proto::rudder::RaftState; 9 | 10 | use crate::error::Result; 11 | 12 | pub mod mem; 13 | #[allow(dead_code)] 14 | pub mod object; 15 | 16 | #[async_trait] 17 | pub trait MetaStore: Send + Sync + 'static { 18 | /// Add new wheels. 19 | async fn add_wheels(&self, wheels: HashMap<u64, Endpoint>) -> Result<()>; 20 | 21 | /// Get all wheel ids. 22 | async fn wheels(&self) -> Result<Vec<u64>>; 23 | 24 | /// Add new key ranges. 25 | async fn add_key_ranges(&self, key_ranges: Vec<KeyRangeInfo>) -> Result<()>; 26 | 27 | /// Get all key range infos. 28 | async fn all_key_range_infos(&self) -> Result<Vec<KeyRangeInfo>>; 29 | 30 | /// Update raft states. 31 | async fn update_raft_states(&self, raft_states: HashMap<u64, RaftState>) -> Result<()>; 32 | 33 | /// Update exhauster meta. 34 | async fn update_exhauster(&self, node_id: u64, endpoint: Endpoint) -> Result<()>; 35 | 36 | /// Randomly pick an available exhauster. 37 | async fn pick_exhauster(&self, live: Duration) -> Result<Option<Endpoint>>; 38 | 39 | /// Get all responsible key ranges, grouped by raft group. 40 | async fn all_group_key_ranges(&self) -> Result<BTreeMap<u64, Vec<KeyRange>>>; 41 | 42 | /// Get all responsible key ranges. 43 | async fn all_key_ranges(&self) -> Result<Vec<KeyRange>>; 44 | 45 | /// Pin sstables to prevent them from being compacted. 46 | /// 47 | /// Returns `true` if there are no conflicts and the given sstables are pinned. 48 | async fn pin_sstables(&self, sst_ids: &[u64], time: SystemTime) -> Result<bool>; 49 | 50 | /// Unpin sstables regardless of whether they were pinned before. 51 | async fn unpin_sstables(&self, sst_ids: &[u64]) -> Result<()>; 52 | 53 | /// Check if sstables are pinned. Returns a vector of pinned statuses. 54 | async fn is_sstables_pinned(&self, sst_ids: &[u64], time: SystemTime) -> Result<Vec<bool>>; 55 | 56 | /// Get the current timestamp. 57 | async fn timestamp(&self) -> Result<u32>; 58 | 59 | /// Fetch the current timestamp and advance it by `add`. 60 | async fn timestamp_fetch_add(&self, add: u32) -> Result<u32>; 61 | } 62 | 63 | pub type MetaStoreRef = Arc<dyn MetaStore>; 64 | 65 | fn is_overlap(r1: &KeyRange, r2: &KeyRange) -> bool { 66 | !(r1.start_key > r2.end_key || r1.end_key < r2.start_key) 67 | } 68 | 69 | fn _in_range(key: &[u8], range: &KeyRange) -> bool { 70 | key >= &range.start_key[..] && key < &range.end_key[..] 71 | } 72 | -------------------------------------------------------------------------------- /rudder/src/meta/object.rs: -------------------------------------------------------------------------------- 1 | use runkv_storage::ObjectStoreRef; 2 | 3 | use crate::error::{Error, Result}; 4 | 5 | pub struct ObjectMetaStore { 6 | object_store: ObjectStoreRef, 7 | path: String, 8 | } 9 | 10 | // TODO: Impl me.
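// A possible layout (sketch only): serialize each piece of meta state (e.g. with `BytesSerde`)
// and store it under `{path}/{base64(key)}` via the `put`/`get`/`remove` helpers below, e.g.
//
//     store.put(b"wheels", encoded_wheels).await?;
//
// where `encoded_wheels` is a hypothetical serialized map of wheel endpoints.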
11 | // #[async_trait] 12 | // impl MetaStore for ObjectMetaStore {} 13 | 14 | impl ObjectMetaStore { 15 | pub fn new(object_store: ObjectStoreRef, path: String) -> Self { 16 | Self { object_store, path } 17 | } 18 | 19 | async fn put(&self, key: &[u8], value: Vec) -> Result<()> { 20 | self.object_store 21 | .put(&self.key(key), value) 22 | .await 23 | .map_err(Error::StorageError) 24 | } 25 | 26 | async fn get(&self, key: &[u8]) -> Result>> { 27 | self.object_store 28 | .get(&self.key(key)) 29 | .await 30 | .map_err(Error::StorageError) 31 | } 32 | 33 | async fn remove(&self, key: &[u8]) -> Result<()> { 34 | self.object_store 35 | .remove(&self.key(key)) 36 | .await 37 | .map_err(Error::StorageError) 38 | } 39 | 40 | fn key(&self, key: &[u8]) -> String { 41 | format!("{}/{}", self.path, base64::encode(key)) 42 | } 43 | } 44 | 45 | #[cfg(test)] 46 | mod tests { 47 | 48 | use std::sync::Arc; 49 | 50 | use runkv_storage::MemObjectStore; 51 | use test_log::test; 52 | 53 | use super::*; 54 | 55 | #[test(tokio::test)] 56 | async fn test_crud() { 57 | let object_store = Arc::new(MemObjectStore::default()); 58 | let store = ObjectMetaStore::new(object_store, "meta-test".to_string()); 59 | let key = b"test-key".to_vec(); 60 | let value = b"test-value".to_vec(); 61 | store.put(&key, value.clone()).await.unwrap(); 62 | let fetched_value = store.get(&key).await.unwrap().unwrap(); 63 | assert_eq!(fetched_value, value); 64 | store.remove(&key).await.unwrap(); 65 | assert!(store.get(&key).await.unwrap().is_none()); 66 | } 67 | } 68 | -------------------------------------------------------------------------------- /rudder/src/worker/mod.rs: -------------------------------------------------------------------------------- 1 | pub mod compaction_detector; 2 | -------------------------------------------------------------------------------- /run: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [ -z "$(which tmux)" ]; then 4 | echo "tmux is required, please install manually." 5 | exit 0 6 | fi 7 | 8 | if [ -z "$(which cargo-make)" ]; then 9 | echo "Installing cargo-make..." 
10 | cargo install cargo-make --version "^0.35" 11 | fi 12 | 13 | if [ $# -eq 0 ] || [ "$1" == "-h" ] || [ "$1" == "--help" ]; then 14 | makers --list-all-steps 15 | exit 0 16 | fi 17 | 18 | makers --no-workspace "$@" -------------------------------------------------------------------------------- /rust-toolchain: -------------------------------------------------------------------------------- 1 | nightly-2022-10-16 2 | -------------------------------------------------------------------------------- /rustfmt.toml: -------------------------------------------------------------------------------- 1 | comment_width = 120 2 | format_code_in_doc_comments = true 3 | format_macro_bodies = true 4 | format_macro_matchers = true 5 | normalize_comments = true 6 | normalize_doc_attributes = true 7 | imports_granularity = "Module" 8 | group_imports = "StdExternalCrate" 9 | reorder_imports = true 10 | tab_spaces = 4 11 | wrap_comments = true 12 | -------------------------------------------------------------------------------- /storage/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "runkv-storage" 3 | version = "0.1.0" 4 | edition = "2021" 5 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html 6 | 7 | [dependencies] 8 | anyhow = "1.0" 9 | async-recursion = "1.0.0" 10 | async-stream = "0.3.3" 11 | async-trait = "0.1" 12 | aws-config = "0.8" 13 | aws-endpoint = "0.8" 14 | aws-sdk-s3 = "0.8" 15 | aws-smithy-http = "0.38" 16 | aws-smithy-types = "0.38" 17 | aws-types = { version = "0.8", features = ["hardcoded-credentials"] } 18 | bcc = { version = "0.0.33", optional = true } 19 | bitvec = "1" 20 | bytes = "1" 21 | bytesize = "1.1.0" 22 | clap = { version = "3.1.6", features = ["derive"] } 23 | crc32fast = "1.3.2" 24 | farmhash = "1.1.5" 25 | futures = "0.3" 26 | futures-async-stream = "0.2" 27 | hdrhistogram = "7" 28 | isahc = { version = "1", default-features = false, optional = true } 29 | itertools = "0.10.3" 30 | lazy_static = "1.4.0" 31 | libc = "0.2" 32 | lz4 = "1.23.1" 33 | moka = { version = "0.7", features = ["future"] } 34 | nix = { version = "0.24.1", features = ["fs"] } 35 | opentelemetry = { version = "0.17", optional = true, features = ["rt-tokio"] } 36 | opentelemetry-jaeger = { version = "0.16", optional = true, features = [ 37 | "rt-tokio", 38 | "collector_client", 39 | "isahc", 40 | "isahc_collector_client", 41 | ] } 42 | parking_lot = "0.12" 43 | prometheus = "0.13.0" 44 | rand = "0.8.5" 45 | rand_chacha = "0.3.1" 46 | rangemap = "1.0.2" 47 | runkv-common = { path = "../common" } 48 | runkv-proto = { path = "../proto" } 49 | serde = "1.0" 50 | serde_derive = "1.0" 51 | tempfile = "3" 52 | thiserror = "1.0" 53 | tokio = { version = "1", features = [ 54 | "rt-multi-thread", 55 | "sync", 56 | "macros", 57 | "time", 58 | "fs", 59 | "signal", 60 | ] } 61 | tracing = "0.1" 62 | tracing-opentelemetry = { version = "0.17", optional = true } 63 | tracing-subscriber = { version = "0.3.16", features = [ 64 | "fmt", 65 | "parking_lot", 66 | "std", 67 | "time", 68 | ], optional = true } 69 | 70 | [dev-dependencies] 71 | criterion = { version = "0.3", features = ["async", "async_tokio"] } 72 | env_logger = "*" 73 | test-log = "0.2.10" 74 | 75 | [target.'cfg(target_os = "linux")'.dev-dependencies] 76 | fiemap = "0.1.1" 77 | 78 | [features] 79 | deadlock = [] 80 | bpf = ["bcc"] 81 | trace = [ 82 | "isahc", 83 | "opentelemetry", 84 | "opentelemetry-jaeger", 85 | "tracing-opentelemetry", 86 | 
"tracing-subscriber", 87 | "tracing/release_max_level_trace", 88 | ] 89 | verbose-release-log = ["tracing/release_max_level_trace"] 90 | 91 | [[bench]] 92 | name = "bench_block_iter" 93 | harness = false 94 | 95 | [[bench]] 96 | name = "bench_compression" 97 | harness = false 98 | 99 | [[bin]] 100 | name = "bench_raft_log_store" 101 | path = "bench/bench_raft_log_store/main.rs" 102 | 103 | [[bin]] 104 | name = "file-cache-bench" 105 | path = "bench/file_cache_bench/main.rs" 106 | -------------------------------------------------------------------------------- /storage/bench/file_cache_bench/README.md: -------------------------------------------------------------------------------- 1 | ## Usage 2 | 3 | ```bash 4 | sudo docker stop `sudo docker ps | grep jaeger | awk '{print $1}'` || true && \ 5 | sudo docker run --rm -d -p6831:6831/udp -p16686:16686 -p14268:14268 --name jaeger jaegertracing/all-in-one:latest && \ 6 | sudo rm -rf /data/filecache && \ 7 | cargo build --bin file-cache-bench --features "bpf trace" --release && \ 8 | sudo ./target/release/file-cache-bench -p /data/filecache --capacity 10240 --total-buffer-capacity 1024 --w-rate 100 --r-rate 100 --concurrency 8 --time 60 --slow 5 9 | ``` 10 | 11 | ## Output Examples 12 | 13 | ```plain 14 | Event { 15 | magic: 16045690984833335023, 16 | sid: 56298568754921497, 17 | vfs_read_enter_ts: 255073825749145, 18 | vfs_read_leave_ts: 255073841493162, 19 | ext4_file_read_iter_enter_ts: 255073825750206, 20 | ext4_file_read_iter_leave_ts: 255073841492771, 21 | iomap_dio_rw_enter_ts: 255073836674011, 22 | iomap_dio_rw_leave_ts: 255073841492271, 23 | filemap_write_and_wait_range_enter_ts: 255073836674320, 24 | filemap_write_and_wait_range_leave_ts: 255073836674622, 25 | } 26 | vfs_read | 15.744ms | ================================================== 27 | ext4_file_read_iter | 15.743ms | ================================================= 28 | iomap_dio_rw | 4.818ms | =============== 29 | filemap_write_and_wait_range | 302.000ns | = 30 | ``` 31 | 32 | ```plain 33 | Total: 34 | disk total iops: 10835.1 35 | disk total throughput: 1.3 GiB/s 36 | disk read iops: 4379.6 37 | disk read throughput: 543.1 MiB/s 38 | disk write iops: 6455.4 39 | disk write throughput: 780.3 MiB/s 40 | insert iops: 788.6/s 41 | insert throughput: 788.6 MiB/s 42 | insert lat p50: 2us 43 | insert lat p90: 5us 44 | insert lat p99: 12us 45 | get iops: 656.6/s 46 | get miss: 6.06% 47 | get hit lat p50: 9087us 48 | get hit lat p90: 23551us 49 | get hit lat p99: 31487us 50 | get miss lat p50: 16us 51 | get miss lat p90: 36us 52 | get miss lat p99: 563us 53 | flush iops: 253.0/s 54 | flush throughput: 770.6 MiB/s 55 | ``` -------------------------------------------------------------------------------- /storage/bench/file_cache_bench/rate.rs: -------------------------------------------------------------------------------- 1 | // Copyright 2022 Singularity Data 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | 15 | use std::time::{Duration, Instant}; 16 | 17 | pub struct RateLimiter { 18 | capacity: f64, 19 | quota: f64, 20 | 21 | last: Instant, 22 | } 23 | 24 | impl RateLimiter { 25 | pub fn new(capacity: f64) -> Self { 26 | Self { 27 | capacity, 28 | quota: 0.0, 29 | last: Instant::now(), 30 | } 31 | } 32 | 33 | pub fn consume(&mut self, weight: f64) -> Option { 34 | let now = Instant::now(); 35 | let refill = now.duration_since(self.last).as_secs_f64() * self.capacity; 36 | self.last = now; 37 | self.quota = f64::min(self.quota + refill, self.capacity); 38 | self.quota -= weight; 39 | if self.quota >= 0.0 { 40 | return None; 41 | } 42 | let wait = Duration::from_secs_f64((-self.quota) / self.capacity); 43 | Some(wait) 44 | } 45 | } 46 | -------------------------------------------------------------------------------- /storage/benches/bench_block_iter.rs: -------------------------------------------------------------------------------- 1 | use bytes::BufMut; 2 | use criterion::{criterion_group, criterion_main, BenchmarkId, Criterion}; 3 | use runkv_common::coding::CompressionAlgorithm; 4 | use runkv_storage::components::{Block, BlockBuilder, BlockBuilderOptions, BlockHolder}; 5 | use runkv_storage::iterator::{BlockIterator, Seek}; 6 | 7 | const TABLES_PER_SSTABLE: u32 = 10; 8 | const KEYS_PER_TABLE: u64 = 100; 9 | const RESTART_INTERVAL: usize = 16; 10 | const BLOCK_CAPACITY: usize = TABLES_PER_SSTABLE as usize * KEYS_PER_TABLE as usize * 64; 11 | 12 | fn block_iter_next(block: Box) { 13 | let mut iter = BlockIterator::new(BlockHolder::from_owned_block(block)); 14 | iter.seek(Seek::First).unwrap(); 15 | while iter.is_valid() { 16 | iter.next().unwrap(); 17 | } 18 | } 19 | 20 | fn block_iter_prev(block: Box) { 21 | let mut iter = BlockIterator::new(BlockHolder::from_owned_block(block)); 22 | iter.seek(Seek::Last).unwrap(); 23 | while iter.is_valid() { 24 | iter.prev().unwrap(); 25 | } 26 | } 27 | 28 | fn bench_block_iter(c: &mut Criterion) { 29 | let block = Box::new(build_block(TABLES_PER_SSTABLE, KEYS_PER_TABLE)); 30 | 31 | println!("block size: {}", block.len()); 32 | 33 | c.bench_with_input( 34 | BenchmarkId::new( 35 | format!( 36 | "block - iter next - {} tables * {} keys", 37 | TABLES_PER_SSTABLE, KEYS_PER_TABLE 38 | ), 39 | "", 40 | ), 41 | &block, 42 | |b, block| { 43 | b.iter(|| block_iter_next(block.clone())); 44 | }, 45 | ); 46 | 47 | c.bench_with_input( 48 | BenchmarkId::new( 49 | format!( 50 | "block - iter prev - {} tables * {} keys", 51 | TABLES_PER_SSTABLE, KEYS_PER_TABLE 52 | ), 53 | "", 54 | ), 55 | &block, 56 | |b, block| { 57 | b.iter(|| block_iter_prev(block.clone())); 58 | }, 59 | ); 60 | 61 | let mut iter = BlockIterator::new(BlockHolder::from_owned_block(block)); 62 | iter.seek(Seek::First).unwrap(); 63 | for t in 1..=TABLES_PER_SSTABLE { 64 | for i in 1..=KEYS_PER_TABLE { 65 | assert_eq!(iter.key(), key(t, i).to_vec()); 66 | assert_eq!(iter.value(), value(i).to_vec()); 67 | iter.next().unwrap(); 68 | } 69 | } 70 | assert!(!iter.is_valid()); 71 | } 72 | 73 | criterion_group!(benches, bench_block_iter); 74 | criterion_main!(benches); 75 | 76 | fn build_block(t: u32, i: u64) -> Block { 77 | let options = BlockBuilderOptions { 78 | capacity: BLOCK_CAPACITY, 79 | compression_algorithm: CompressionAlgorithm::None, 80 | restart_interval: RESTART_INTERVAL, 81 | }; 82 | let mut builder = BlockBuilder::new(options); 83 | for tt in 1..=t { 84 | for ii in 1..=i { 85 | builder.add(&key(tt, ii), &value(ii)); 86 | } 87 | } 88 | let data = builder.build(); 89 | 
Block::decode(&data[..]).unwrap() 90 | } 91 | 92 | fn key(t: u32, i: u64) -> Vec { 93 | let mut buf = Vec::new(); 94 | buf.put_u8(b't'); 95 | buf.put_u32(t); 96 | buf.put_u64(i); 97 | buf 98 | } 99 | 100 | fn value(i: u64) -> Vec { 101 | let mut buf = Vec::new(); 102 | buf.put_u64(i); 103 | buf 104 | } 105 | -------------------------------------------------------------------------------- /storage/benches/bench_compression.rs: -------------------------------------------------------------------------------- 1 | use std::io::Write; 2 | 3 | use bytes::BufMut; 4 | use criterion::{criterion_group, criterion_main, BenchmarkId, Criterion}; 5 | use rand::prelude::StdRng; 6 | use rand::{Rng, SeedableRng}; 7 | 8 | const TABLES_PER_SSTABLE: u32 = 10; 9 | const KEYS_PER_TABLE: u64 = 100; 10 | 11 | fn gen_dataset(vsize: usize) -> Vec> { 12 | let mut dataset = vec![]; 13 | let mut rng = StdRng::seed_from_u64(0); 14 | for t in 1..=TABLES_PER_SSTABLE { 15 | for i in 1..=KEYS_PER_TABLE { 16 | let mut v = vec![0; vsize]; 17 | rng.fill(&mut v[..]); 18 | let mut buf = vec![]; 19 | buf.put_u32(t); 20 | buf.put_u64(i); 21 | buf.put_slice(&v); 22 | dataset.push(buf) 23 | } 24 | } 25 | dataset 26 | } 27 | 28 | fn gen_data(dataset: &[Vec]) -> Vec { 29 | let mut data = vec![]; 30 | for entry in dataset.iter() { 31 | data.put_slice(entry); 32 | } 33 | data 34 | } 35 | 36 | fn block_compression(data: Vec) -> Vec { 37 | let mut encoder = lz4::EncoderBuilder::new().level(4).build(vec![]).unwrap(); 38 | encoder.write_all(&data).unwrap(); 39 | let (buf, result) = encoder.finish(); 40 | result.unwrap(); 41 | buf 42 | } 43 | 44 | fn stream_compression(dataset: Vec>) -> Vec { 45 | let buf = vec![]; 46 | let mut encoder = lz4::EncoderBuilder::new().level(4).build(buf).unwrap(); 47 | for entry in dataset { 48 | encoder.write_all(&entry).unwrap(); 49 | } 50 | let (buf, result) = encoder.finish(); 51 | result.unwrap(); 52 | buf 53 | } 54 | 55 | fn bench_compression(c: &mut Criterion) { 56 | for vsize in [8, 16, 32, 64] { 57 | let dataset = gen_dataset(vsize); 58 | let data = gen_data(&dataset); 59 | 60 | c.bench_with_input( 61 | BenchmarkId::new(format!("buffer - vsize: {}B", vsize), ""), 62 | &dataset, 63 | |b, dataset| b.iter(|| gen_data(dataset)), 64 | ); 65 | 66 | c.bench_with_input( 67 | BenchmarkId::new(format!("block compression - vsize: {}B", vsize), ""), 68 | &data, 69 | |b, data| b.iter(|| block_compression(data.clone())), 70 | ); 71 | 72 | c.bench_with_input( 73 | BenchmarkId::new(format!("stream compression - vsize: {}B", vsize), ""), 74 | &dataset, 75 | |b, dataset| b.iter(|| stream_compression(dataset.clone())), 76 | ); 77 | 78 | let uncompressed = data.len(); 79 | let block_compressed = block_compression(data).len(); 80 | let stream_compressed = stream_compression(dataset).len(); 81 | 82 | println!("uncompressed size: {}", uncompressed); 83 | println!( 84 | "block compressed size: {}, rate: {:.3}", 85 | block_compressed, 86 | block_compressed as f64 / uncompressed as f64 87 | ); 88 | println!( 89 | "stream compressed size: {}, rate: {:.3}", 90 | stream_compressed, 91 | stream_compressed as f64 / uncompressed as f64 92 | ); 93 | } 94 | } 95 | 96 | criterion_group!(benches, bench_compression); 97 | criterion_main!(benches); 98 | -------------------------------------------------------------------------------- /storage/src/error.rs: -------------------------------------------------------------------------------- 1 | use crate::manifest::ManifestError; 2 | use crate::object_store::ObjectStoreError; 3 | use 
crate::raft_log_store::error::RaftLogStoreError; 4 | use crate::tiered_cache::TieredCacheError; 5 | 6 | #[derive(thiserror::Error, Debug)] 7 | pub enum Error { 8 | #[error("encode error: {0}")] 9 | EncodeError(String), 10 | #[error("decode error: {0}")] 11 | DecodeError(String), 12 | #[error("object store error: {0}")] 13 | ObjectStoreError(#[from] ObjectStoreError), 14 | #[error("manifest error: {0}")] 15 | ManifestError(#[from] ManifestError), 16 | #[error("io error: {0}")] 17 | IoError(#[from] std::io::Error), 18 | #[error("raft log store error: {0}")] 19 | RaftLogStoreError(#[from] RaftLogStoreError), 20 | #[error("tiered cache error: {0}")] 21 | TieredCacheError(#[from] TieredCacheError), 22 | #[error("other: {0}")] 23 | Other(String), 24 | } 25 | 26 | impl Error { 27 | pub fn err(e: impl Into>) -> Self { 28 | Self::Other(e.into().to_string()) 29 | } 30 | 31 | pub fn encode_error(e: impl Into>) -> Self { 32 | Self::EncodeError(e.into().to_string()) 33 | } 34 | 35 | pub fn decode_error(e: impl Into>) -> Self { 36 | Self::DecodeError(e.into().to_string()) 37 | } 38 | } 39 | 40 | pub type Result = std::result::Result; 41 | -------------------------------------------------------------------------------- /storage/src/lib.rs: -------------------------------------------------------------------------------- 1 | #![feature(drain_filter)] 2 | #![feature(assert_matches)] 3 | #![feature(generators, generator_trait)] 4 | #![feature(stmt_expr_attributes)] 5 | #![feature(proc_macro_hygiene)] 6 | #![feature(trait_alias)] 7 | #![feature(let_chains)] 8 | #![feature(allocator_api)] 9 | #![feature(lint_reasons)] 10 | #![feature(build_hasher_simple_hash_one)] 11 | #![feature(strict_provenance)] 12 | 13 | mod error; 14 | mod lsm_tree; 15 | mod object_store; 16 | pub mod raft_log_store; 17 | pub mod tiered_cache; 18 | pub mod utils; 19 | 20 | pub use error::*; 21 | pub use lsm_tree::*; 22 | pub use object_store::*; 23 | -------------------------------------------------------------------------------- /storage/src/lsm_tree/components/metrics.rs: -------------------------------------------------------------------------------- 1 | use std::sync::Arc; 2 | 3 | use lazy_static::lazy_static; 4 | 5 | lazy_static! 
{ 6 | static ref INTERNAL_OPS_COUNTER_VEC: prometheus::CounterVec = 7 | prometheus::register_counter_vec!( 8 | "lsm_tree_internal_ops_counter_vec", 9 | "lsm_tree_internal_ops_counter_vec", 10 | &["op", "node"], 11 | ) 12 | .unwrap(); 13 | static ref INTERNAL_GAUGE_VEC: prometheus::GaugeVec = prometheus::register_gauge_vec!( 14 | "lsm_tree_internal_gauge_vec", 15 | "lsm_tree_internal_gauge_vec", 16 | &["type", "node"], 17 | ) 18 | .unwrap(); 19 | static ref BLOCK_CACHE_LATENCY_HISTOGRAM_VEC: prometheus::HistogramVec = 20 | prometheus::register_histogram_vec!( 21 | "lsm_tree_block_cache_latency_histogram_vec", 22 | "lsm tree block cache latency histogram vec", 23 | &["op", "node"], 24 | vec![0.00001, 0.0001, 0.0002, 0.0005, 0.001, 0.002, 0.005, 0.01, 0.02, 0.05, 0.1] 25 | ) 26 | .unwrap(); 27 | } 28 | 29 | pub struct LsmTreeMetrics { 30 | pub rotate_memtable_counter: prometheus::Counter, 31 | pub flush_memtable_counter: prometheus::Counter, 32 | 33 | pub active_memtable_size_gauge: prometheus::Gauge, 34 | 35 | pub block_cache_get_latency_histogram: prometheus::Histogram, 36 | pub block_cache_insert_latency_histogram: prometheus::Histogram, 37 | pub block_cache_fill_latency_histogram: prometheus::Histogram, 38 | } 39 | 40 | pub type LsmTreeMetricsRef = Arc; 41 | 42 | impl LsmTreeMetrics { 43 | pub fn new(node: u64) -> Self { 44 | Self { 45 | rotate_memtable_counter: INTERNAL_OPS_COUNTER_VEC 46 | .get_metric_with_label_values(&["rotate_memtable", &node.to_string()]) 47 | .unwrap(), 48 | 49 | flush_memtable_counter: INTERNAL_OPS_COUNTER_VEC 50 | .get_metric_with_label_values(&["flush_memtable", &node.to_string()]) 51 | .unwrap(), 52 | 53 | active_memtable_size_gauge: INTERNAL_GAUGE_VEC 54 | .get_metric_with_label_values(&["active_memtable_size", &node.to_string()]) 55 | .unwrap(), 56 | 57 | block_cache_get_latency_histogram: BLOCK_CACHE_LATENCY_HISTOGRAM_VEC 58 | .get_metric_with_label_values(&["block_cache_get", &node.to_string()]) 59 | .unwrap(), 60 | block_cache_insert_latency_histogram: BLOCK_CACHE_LATENCY_HISTOGRAM_VEC 61 | .get_metric_with_label_values(&["block_cache_insert", &node.to_string()]) 62 | .unwrap(), 63 | block_cache_fill_latency_histogram: BLOCK_CACHE_LATENCY_HISTOGRAM_VEC 64 | .get_metric_with_label_values(&["block_cache_fill", &node.to_string()]) 65 | .unwrap(), 66 | } 67 | } 68 | } 69 | -------------------------------------------------------------------------------- /storage/src/lsm_tree/components/mod.rs: -------------------------------------------------------------------------------- 1 | mod block; 2 | pub use block::*; 3 | mod block_cache; 4 | pub use block_cache::*; 5 | mod memtable; 6 | pub use memtable::*; 7 | mod sstable; 8 | pub use sstable::*; 9 | mod sstable_store; 10 | pub use sstable_store::*; 11 | mod skiplist; 12 | pub use skiplist::*; 13 | mod metrics; 14 | pub use metrics::*; 15 | -------------------------------------------------------------------------------- /storage/src/lsm_tree/components/skiplist/arena.rs: -------------------------------------------------------------------------------- 1 | // Ported from [AgateDB](https://github.com/tikv/agatedb) with [license](https://github.com/tikv/agatedb/blob/master/LICENSE). 
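// Worked example of the alignment math in `alloc()` below (illustrative
// numbers): with `align = 8` the mask is `0b111`; requesting `size = 13` at
// `offset = 21` reserves `13 + 7 = 20` bytes and returns
// `(21 + 7) & !7 = 24`, the first 8-byte-aligned offset at or after 21.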
2 | 3 | use std::sync::atomic::{AtomicU32, Ordering}; 4 | use std::sync::Arc; 5 | use std::{mem, ptr}; 6 | 7 | struct ArenaCore { 8 | len: AtomicU32, 9 | cap: usize, 10 | ptr: *mut u8, 11 | } 12 | 13 | impl Drop for ArenaCore { 14 | fn drop(&mut self) { 15 | unsafe { 16 | let ptr = self.ptr as *mut u64; 17 | let cap = self.cap / 8; 18 | Vec::from_raw_parts(ptr, 0, cap); 19 | } 20 | } 21 | } 22 | 23 | pub struct Arena { 24 | core: Arc, 25 | } 26 | 27 | impl Arena { 28 | pub fn with_capacity(cap: u32) -> Arena { 29 | let mut buf: Vec = Vec::with_capacity(cap as usize / 8); 30 | let ptr = buf.as_mut_ptr() as *mut u8; 31 | let cap = buf.capacity() * 8; 32 | mem::forget(buf); 33 | Arena { 34 | core: Arc::new(ArenaCore { 35 | len: AtomicU32::new(1), 36 | cap, 37 | ptr, 38 | }), 39 | } 40 | } 41 | 42 | pub fn len(&self) -> u32 { 43 | self.core.len.load(Ordering::SeqCst) 44 | } 45 | 46 | pub fn alloc(&self, align: usize, size: usize) -> u32 { 47 | let align_mask = align - 1; 48 | // Leave enough padding for align. 49 | let size = size + align_mask; 50 | let offset = self.core.len.fetch_add(size as u32, Ordering::SeqCst); 51 | // Calculate the correct align point, it equals to 52 | // (offset + align_mask) / align * align. 53 | let ptr_offset = (offset as usize + align_mask) & !align_mask; 54 | assert!(offset as usize + size <= self.core.cap); 55 | ptr_offset as u32 56 | } 57 | 58 | pub unsafe fn get_mut(&self, offset: u32) -> *mut N { 59 | if offset == 0 { 60 | return ptr::null_mut(); 61 | } 62 | self.core.ptr.add(offset as usize) as _ 63 | } 64 | 65 | pub fn offset(&self, ptr: *const N) -> u32 { 66 | let ptr_addr = ptr as usize; 67 | let self_addr = self.core.ptr as usize; 68 | if ptr_addr > self_addr && ptr_addr < self_addr + self.core.cap { 69 | (ptr_addr - self_addr) as u32 70 | } else { 71 | 0 72 | } 73 | } 74 | } 75 | -------------------------------------------------------------------------------- /storage/src/lsm_tree/components/skiplist/key.rs: -------------------------------------------------------------------------------- 1 | // Ported from [AgateDB](https://github.com/tikv/agatedb) with [license](https://github.com/tikv/agatedb/blob/master/LICENSE). 
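// Hedged example of the comparator defined below: with
// `FixedLengthSuffixComparator::new(8)`, full keys are ordered by their
// user-key prefix first and by the trailing 8-byte suffix (e.g. a timestamp)
// only when the prefixes are equal, while `same_key` ignores the suffix:
//
// let c = FixedLengthSuffixComparator::new(8);
// assert!(c.same_key(b"k1\x00\x00\x00\x00\x00\x00\x00\x01",
//                    b"k1\x00\x00\x00\x00\x00\x00\x00\x02"));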
2 | 3 | use std::cmp::Ordering; 4 | 5 | use bytes::Bytes; 6 | 7 | pub trait KeyComparator: Clone { 8 | fn compare_key(&self, lhs: &[u8], rhs: &[u8]) -> Ordering; 9 | fn same_key(&self, lhs: &[u8], rhs: &[u8]) -> bool; 10 | } 11 | 12 | #[derive(Default, Debug, Clone, Copy)] 13 | pub struct FixedLengthSuffixComparator { 14 | len: usize, 15 | } 16 | 17 | impl FixedLengthSuffixComparator { 18 | pub const fn new(len: usize) -> FixedLengthSuffixComparator { 19 | FixedLengthSuffixComparator { len } 20 | } 21 | } 22 | 23 | impl KeyComparator for FixedLengthSuffixComparator { 24 | #[inline] 25 | fn compare_key(&self, lhs: &[u8], rhs: &[u8]) -> Ordering { 26 | if lhs.len() < self.len { 27 | panic!( 28 | "cannot compare with suffix {}: {:?}", 29 | self.len, 30 | Bytes::copy_from_slice(lhs) 31 | ); 32 | } 33 | if rhs.len() < self.len { 34 | panic!( 35 | "cannot compare with suffix {}: {:?}", 36 | self.len, 37 | Bytes::copy_from_slice(rhs) 38 | ); 39 | } 40 | let (l_p, l_s) = lhs.split_at(lhs.len() - self.len); 41 | let (r_p, r_s) = rhs.split_at(rhs.len() - self.len); 42 | let res = l_p.cmp(r_p); 43 | match res { 44 | Ordering::Greater | Ordering::Less => res, 45 | Ordering::Equal => l_s.cmp(r_s), 46 | } 47 | } 48 | 49 | #[inline] 50 | fn same_key(&self, lhs: &[u8], rhs: &[u8]) -> bool { 51 | let (l_p, _) = lhs.split_at(lhs.len() - self.len); 52 | let (r_p, _) = rhs.split_at(rhs.len() - self.len); 53 | l_p == r_p 54 | } 55 | } 56 | -------------------------------------------------------------------------------- /storage/src/lsm_tree/components/skiplist/mod.rs: -------------------------------------------------------------------------------- 1 | // Ported from [AgateDB](https://github.com/tikv/agatedb) with [license](https://github.com/tikv/agatedb/blob/master/LICENSE). 2 | 3 | mod arena; 4 | mod key; 5 | mod list; 6 | 7 | pub const SKIPLIST_NODE_TOWER_MAX_HEIGHT: usize = 20; 8 | 9 | pub use key::{FixedLengthSuffixComparator, KeyComparator}; 10 | pub use list::{IterRef, Skiplist}; 11 | -------------------------------------------------------------------------------- /storage/src/lsm_tree/iterator/mod.rs: -------------------------------------------------------------------------------- 1 | mod block_iterator; 2 | mod concat_iterator; 3 | mod memtable_iterator; 4 | mod merge_iterator; 5 | mod sstable_iterator; 6 | mod user_key_iterator; 7 | 8 | use async_trait::async_trait; 9 | pub use block_iterator::*; 10 | pub use concat_iterator::*; 11 | pub use memtable_iterator::*; 12 | pub use merge_iterator::*; 13 | pub use sstable_iterator::*; 14 | pub use user_key_iterator::*; 15 | 16 | use crate::utils::compare_full_key; 17 | use crate::Result; 18 | 19 | pub enum Seek<'s> { 20 | /// Seek to the first valid position in order if exists. 21 | First, 22 | /// Seek to the last valid position in order if exists. 23 | Last, 24 | /// Seek forward for the first key euqals the given key or the frist key bigger than it. 25 | RandomForward(&'s [u8]), 26 | /// Seek backward for the first key equals the given key or the first key smaller than it. 27 | RandomBackward(&'s [u8]), 28 | } 29 | 30 | /// [`Iterator`] defines shared behaviours for all iterators. 31 | /// 32 | /// NOTE: 33 | /// 34 | /// [`Iterator`] must be initialized with `seek` before use. 35 | #[async_trait] 36 | pub trait Iterator: Send + Sync { 37 | /// Move a valid iterator to the next key. 38 | /// 39 | /// Note: 40 | /// 41 | /// - Before calling this function, make sure the iterator `is_valid`. 
42 | /// - After calling this function, you may first check whether the iterator `is_valid` again, 43 | /// then get the new data by calling `key` and `value`. 44 | /// - If the position after calling this is invalid, this function WON'T return an `Err`. You 45 | /// should check `is_valid` before continuing the iteration. 46 | /// 47 | /// # Panics 48 | /// 49 | /// This function will panic if the iterator is invalid. 50 | async fn next(&mut self) -> Result<()>; 51 | 52 | /// Move a valid iterator to the next key. 53 | /// 54 | /// Note: 55 | /// 56 | /// - Before calling this function, make sure the iterator `is_valid`. 57 | /// - After calling this function, you may first check whether the iterator `is_valid` again, 58 | /// then get the new data by calling `key` and `value`. 59 | /// - If the position after calling this is invalid, this function WON'T return an `Err`. You 60 | /// should check `is_valid` before continuing the iteration. 61 | /// 62 | /// # Panics 63 | /// 64 | /// This function will panic if the iterator is invalid. 65 | async fn prev(&mut self) -> Result<()>; 66 | 67 | /// Retrieve the current key. 68 | /// 69 | /// Note: 70 | /// 71 | /// - Before calling this function, make sure the iterator `is_valid`. 72 | /// - This function should be straightforward and return immediately. 73 | /// 74 | /// # Panics 75 | /// 76 | /// This function will panic if the iterator is invalid. 77 | fn key(&self) -> &[u8]; 78 | 79 | /// Retrieve the current value. 80 | /// 81 | /// Note: 82 | /// 83 | /// - Before calling this function, make sure the iterator `is_valid`. 84 | /// - This function should be straightforward and return immediately. 85 | /// 86 | /// # Panics 87 | /// 88 | /// This function will panic if the iterator is invalid. 89 | fn value(&self) -> &[u8]; 90 | 91 | /// Indicate whether the iterator can be used. 92 | /// 93 | /// Note: 94 | /// 95 | /// - ONLY call `key`, `value`, and `next` if `is_valid` returns `true`. 96 | /// - This function should be straightforward and return immediately. 97 | fn is_valid(&self) -> bool; 98 | 99 | /// Initialize or reset iterator with the given seek mode. For more details, refer to [`Seek`]. 100 | /// 101 | /// `seek` returns a bool which means a visible version of the given seek condition is found in 102 | /// this iterator (but can be existing or be deleted). 103 | /// 104 | /// Note: 105 | /// 106 | /// - Do not decide whether the position is valid or not by checking the returned error of this 107 | /// function. This function WON'T return an `Err` if invalid. You should check `is_valid` 108 | /// before starting iteration. 
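///
/// A hedged usage sketch (the concrete iterator and key below are illustrative):
///
/// ```ignore
/// let _found = iter.seek(Seek::RandomForward(b"some-user-key")).await?;
/// while iter.is_valid() {
///     // consume iter.key() / iter.value() here ...
///     iter.next().await?;
/// }
/// ```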
109 | async fn seek<'s>(&mut self, seek: Seek<'s>) -> Result; 110 | } 111 | 112 | pub type BoxedIterator = Box; 113 | 114 | impl PartialEq for BoxedIterator { 115 | fn eq(&self, other: &Self) -> bool { 116 | self.key() == other.key() 117 | } 118 | } 119 | 120 | impl Eq for BoxedIterator {} 121 | 122 | impl PartialOrd for BoxedIterator { 123 | fn partial_cmp(&self, other: &Self) -> Option { 124 | Some(self.cmp(other)) 125 | } 126 | } 127 | 128 | impl Ord for BoxedIterator { 129 | fn cmp(&self, other: &Self) -> std::cmp::Ordering { 130 | // Should not be used on `UserKeyIterator` 131 | compare_full_key(self.key(), other.key()) 132 | } 133 | } 134 | -------------------------------------------------------------------------------- /storage/src/lsm_tree/manifest/error.rs: -------------------------------------------------------------------------------- 1 | #[derive(thiserror::Error, Debug)] 2 | pub enum ManifestError { 3 | #[error("version diff id does not match: [current: {0}] [new: {1}]")] 4 | VersionDiffIdNotMatch(u64, u64), 5 | #[error("invalid version diff: {0}")] 6 | InvalidVersionDiff(String), 7 | #[error("verion diff expired: [id: {0}]")] 8 | VersionDiffExpired(u64), 9 | #[error("level not exists: [idx: {0}] [total: {1}]")] 10 | LevelNotExists(u64, u64), 11 | #[error("invalid watermark: [current: {0}] [given: {1}]")] 12 | InvalidWatermark(u64, u64), 13 | #[error("other: {0}")] 14 | Other(String), 15 | } 16 | -------------------------------------------------------------------------------- /storage/src/lsm_tree/manifest/mod.rs: -------------------------------------------------------------------------------- 1 | mod error; 2 | mod version; 3 | 4 | pub use error::*; 5 | pub use version::*; 6 | -------------------------------------------------------------------------------- /storage/src/lsm_tree/mod.rs: -------------------------------------------------------------------------------- 1 | pub mod components; 2 | pub mod iterator; 3 | pub mod manifest; 4 | 5 | pub const DEFAULT_SSTABLE_SIZE: usize = 4 * 1024 * 1024; // 4 MiB 6 | pub const DEFAULT_BLOCK_SIZE: usize = 64 * 1024; // 64 KiB 7 | pub const DEFAULT_RESTART_INTERVAL: usize = 16; 8 | pub const TEST_DEFAULT_RESTART_INTERVAL: usize = 2; 9 | pub const DEFAULT_ENTRY_SIZE: usize = 1024; // 1 KiB 10 | pub const DEFAULT_BLOOM_FALSE_POSITIVE: f64 = 0.1; 11 | pub const DEFAULT_SSTABLE_META_SIZE: usize = 4 * 1024; // 4 KiB 12 | pub const DEFAULT_MEMTABLE_SIZE: usize = 4 * 1024 * 1024; // 4 MiB 13 | -------------------------------------------------------------------------------- /storage/src/object_store/mem.rs: -------------------------------------------------------------------------------- 1 | use std::collections::BTreeMap; 2 | use std::ops::Range; 3 | 4 | use async_trait::async_trait; 5 | use parking_lot::RwLock; 6 | 7 | use super::ObjectStore; 8 | use crate::{ObjectStoreError, Result}; 9 | 10 | #[derive(Default)] 11 | pub struct MemObjectStore { 12 | objects: RwLock>>, 13 | } 14 | 15 | #[async_trait] 16 | impl ObjectStore for MemObjectStore { 17 | async fn put(&self, path: &str, obj: Vec) -> Result<()> { 18 | let mut objects = self.objects.write(); 19 | objects.insert(path.to_string(), obj); 20 | Ok(()) 21 | } 22 | 23 | async fn get(&self, path: &str) -> Result>> { 24 | let objects = self.objects.read(); 25 | let obj = objects.get(path).cloned(); 26 | Ok(obj) 27 | } 28 | 29 | async fn get_range(&self, path: &str, range: Range) -> Result>> { 30 | let objects = self.objects.read(); 31 | let obj = objects.get(path).map(|obj| obj[range].to_vec()); 
32 | Ok(obj) 33 | } 34 | 35 | async fn remove(&self, path: &str) -> Result<()> { 36 | let mut objects = self.objects.write(); 37 | objects 38 | .remove(path) 39 | .ok_or_else(|| ObjectStoreError::ObjectNotFound(path.to_string()))?; 40 | Ok(()) 41 | } 42 | } 43 | -------------------------------------------------------------------------------- /storage/src/object_store/mod.rs: -------------------------------------------------------------------------------- 1 | mod mem; 2 | pub use mem::*; 3 | mod s3; 4 | use std::ops::Range; 5 | use std::sync::Arc; 6 | 7 | use async_trait::async_trait; 8 | pub use s3::*; 9 | 10 | use crate::Result; 11 | 12 | #[derive(thiserror::Error, Debug)] 13 | pub enum ObjectStoreError { 14 | #[error("object not found: {0}")] 15 | ObjectNotFound(String), 16 | #[error("invalid range: {0}")] 17 | InvalidRange(String), 18 | #[error("S3 error: {0}")] 19 | S3(String), 20 | #[error("other: {0}")] 21 | Other(String), 22 | } 23 | 24 | #[async_trait] 25 | pub trait ObjectStore: Send + Sync { 26 | async fn put(&self, path: &str, obj: Vec) -> Result<()>; 27 | 28 | async fn get(&self, path: &str) -> Result>>; 29 | 30 | async fn get_range(&self, path: &str, range: Range) -> Result>>; 31 | 32 | async fn remove(&self, path: &str) -> Result<()>; 33 | } 34 | 35 | pub type ObjectStoreRef = Arc; 36 | -------------------------------------------------------------------------------- /storage/src/object_store/s3.rs: -------------------------------------------------------------------------------- 1 | use std::ops::Range; 2 | 3 | use async_trait::async_trait; 4 | use aws_sdk_s3::error::{GetObjectError, GetObjectErrorKind}; 5 | use aws_sdk_s3::types::SdkError; 6 | use aws_sdk_s3::{Client, Endpoint, Region}; 7 | use aws_smithy_http::body::SdkBody; 8 | 9 | use super::ObjectStore; 10 | use crate::{ObjectStoreError, Result}; 11 | 12 | pub struct S3ObjectStore { 13 | client: Client, 14 | bucket: String, 15 | } 16 | 17 | impl S3ObjectStore { 18 | pub async fn new(bucket: String) -> Self { 19 | let config = aws_config::load_from_env().await; 20 | let client = Client::new(&config); 21 | Self { client, bucket } 22 | } 23 | 24 | /// Create a minio client. The server should be like `minio://key:secret@address:port/bucket`. 
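/// For example (illustrative values only): `minio://minioadmin:minioadmin@127.0.0.1:9000/runkv`.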
25 | pub async fn new_with_minio(server: &str) -> Self { 26 | let server = server.strip_prefix("minio://").unwrap(); 27 | let (access_key_id, rest) = server.split_once(':').unwrap(); 28 | let (secret_access_key, rest) = rest.split_once('@').unwrap(); 29 | let (address, bucket) = rest.split_once('/').unwrap(); 30 | 31 | let loader = aws_config::ConfigLoader::default(); 32 | let builder = aws_sdk_s3::config::Builder::from(&loader.load().await); 33 | let builder = builder.region(Region::new("custom")); 34 | let builder = builder.endpoint_resolver(Endpoint::immutable( 35 | format!("http://{}", address).try_into().unwrap(), 36 | )); 37 | let builder = builder.credentials_provider(aws_sdk_s3::Credentials::from_keys( 38 | access_key_id, 39 | secret_access_key, 40 | None, 41 | )); 42 | let config = builder.build(); 43 | let client = Client::from_conf(config); 44 | Self { 45 | client, 46 | bucket: bucket.to_string(), 47 | } 48 | } 49 | } 50 | 51 | fn err(err: impl Into>) -> ObjectStoreError { 52 | ObjectStoreError::S3(err.into().to_string()) 53 | } 54 | 55 | #[async_trait] 56 | impl ObjectStore for S3ObjectStore { 57 | async fn put(&self, path: &str, obj: Vec) -> Result<()> { 58 | self.client 59 | .put_object() 60 | .bucket(&self.bucket) 61 | .body(SdkBody::from(obj).into()) 62 | .key(path) 63 | .send() 64 | .await 65 | .map_err(err)?; 66 | Ok(()) 67 | } 68 | 69 | async fn get(&self, path: &str) -> Result>> { 70 | let req = self.client.get_object().bucket(&self.bucket).key(path); 71 | let rsp = match req.send().await { 72 | Ok(rsp) => rsp, 73 | Err(SdkError::ServiceError { 74 | err: 75 | GetObjectError { 76 | kind: GetObjectErrorKind::NoSuchKey(..), 77 | .. 78 | }, 79 | .. 80 | }) => return Ok(None), 81 | Err(e) => return Err(err(e).into()), 82 | }; 83 | let data = rsp.body.collect().await.map_err(err)?.into_bytes().to_vec(); 84 | Ok(Some(data)) 85 | } 86 | 87 | async fn get_range(&self, path: &str, range: Range) -> Result>> { 88 | let req = self 89 | .client 90 | .get_object() 91 | .bucket(&self.bucket) 92 | .key(path) 93 | .range(format!("bytes={}-{}", range.start, range.end - 1)); 94 | let rsp = match req.send().await { 95 | Ok(rsp) => rsp, 96 | Err(SdkError::ServiceError { 97 | err: 98 | GetObjectError { 99 | kind: GetObjectErrorKind::NoSuchKey(..), 100 | .. 101 | }, 102 | .. 
103 | }) => return Ok(None), 104 | Err(e) => return Err(err(e).into()), 105 | }; 106 | let data = rsp.body.collect().await.map_err(err)?.into_bytes().to_vec(); 107 | Ok(Some(data)) 108 | } 109 | 110 | async fn remove(&self, path: &str) -> Result<()> { 111 | self.client 112 | .delete_object() 113 | .bucket(&self.bucket) 114 | .key(path) 115 | .send() 116 | .await 117 | .map_err(err)?; 118 | Ok(()) 119 | } 120 | } 121 | -------------------------------------------------------------------------------- /storage/src/raft_log_store/block_cache.rs: -------------------------------------------------------------------------------- 1 | use std::sync::Arc; 2 | use std::time::Instant; 3 | 4 | use futures::Future; 5 | use moka::future::Cache; 6 | 7 | use super::error::RaftLogStoreError; 8 | use super::metrics::RaftLogStoreMetricsRef; 9 | use super::DEFAULT_LOG_BATCH_SIZE; 10 | use crate::error::Result; 11 | 12 | #[derive(PartialEq, Eq, Hash, Clone, Copy, Debug)] 13 | struct BlockIndex { 14 | file_id: u64, 15 | offset: usize, 16 | } 17 | 18 | pub struct BlockCache { 19 | inner: Cache>>, 20 | metrics: RaftLogStoreMetricsRef, 21 | } 22 | 23 | impl BlockCache { 24 | pub fn new(capacity: usize, metrics: RaftLogStoreMetricsRef) -> Self { 25 | let cache: Cache>> = Cache::builder() 26 | .weigher(|_k, v: &Arc>| v.len() as u32) 27 | .initial_capacity(capacity / DEFAULT_LOG_BATCH_SIZE) 28 | .max_capacity(capacity as u64) 29 | .build(); 30 | Self { 31 | inner: cache, 32 | metrics, 33 | } 34 | } 35 | 36 | #[tracing::instrument(level = "trace", skip(self))] 37 | pub fn get(&self, file_id: u64, offset: usize) -> Option>> { 38 | let start = Instant::now(); 39 | 40 | let result = self.inner.get(&BlockIndex { file_id, offset }); 41 | 42 | self.metrics 43 | .block_cache_get_latency_histogram 44 | .observe(start.elapsed().as_secs_f64()); 45 | 46 | result 47 | } 48 | 49 | #[tracing::instrument(level = "trace", skip(self, block))] 50 | pub async fn insert(&self, file_id: u64, offset: usize, block: Arc>) { 51 | let start = Instant::now(); 52 | 53 | self.inner 54 | .insert(BlockIndex { file_id, offset }, block) 55 | .await; 56 | 57 | self.metrics 58 | .block_cache_insert_latency_histogram 59 | .observe(start.elapsed().as_secs_f64()); 60 | } 61 | 62 | #[tracing::instrument(level = "trace", skip(self, f))] 63 | pub async fn get_or_insert_with( 64 | &self, 65 | file_id: u64, 66 | offset: usize, 67 | f: F, 68 | ) -> Result>> 69 | where 70 | F: Future>>>, 71 | { 72 | let future = async move { 73 | let start_fill = Instant::now(); 74 | 75 | let r = f.await; 76 | 77 | self.metrics 78 | .block_cache_fill_latency_histogram 79 | .observe(start_fill.elapsed().as_secs_f64()); 80 | 81 | r 82 | }; 83 | 84 | let start = Instant::now(); 85 | 86 | let result = match self 87 | .inner 88 | .get_or_try_insert_with(BlockIndex { file_id, offset }, future) 89 | .await 90 | { 91 | Ok(block) => block, 92 | Err(arc_error) => return Err(RaftLogStoreError::Other(arc_error.to_string()).into()), 93 | }; 94 | 95 | self.metrics 96 | .block_cache_get_latency_histogram 97 | .observe(start.elapsed().as_secs_f64()); 98 | 99 | Ok(result) 100 | } 101 | } 102 | -------------------------------------------------------------------------------- /storage/src/raft_log_store/error.rs: -------------------------------------------------------------------------------- 1 | #[derive(thiserror::Error, Debug)] 2 | pub enum RaftLogStoreError { 3 | #[error("group {0} not exists")] 4 | GroupNotExists(u64), 5 | #[error("group {0} already exists")] 6 | GroupAlreadyExists(u64), 7 | 
#[error("encode error: {0}")] 8 | EncodeError(String), 9 | #[error("decode error: {0}")] 10 | DecodeError(String), 11 | #[error("checksum mismatch: [expected: {expected}] [get: {get}]")] 12 | ChecksumMismatch { expected: u32, get: u32 }, 13 | #[error("io error: {0}")] 14 | IoError(#[from] std::io::Error), 15 | #[error("raft log gap exists: [{start}, {end})")] 16 | RaftLogGap { start: u64, end: u64 }, 17 | #[error("raft log file gap: [{start}, {end})")] 18 | RaftLogFileGap { start: u64, end: u64 }, 19 | #[error("raft log file not found: {0}")] 20 | RaftLogFileNotFound(u64), 21 | #[error("other: {0}")] 22 | Other(String), 23 | } 24 | 25 | impl RaftLogStoreError { 26 | pub fn encode_error(e: impl Into>) -> Self { 27 | Self::EncodeError(e.into().to_string()) 28 | } 29 | 30 | pub fn decode_error(e: impl Into>) -> Self { 31 | Self::DecodeError(e.into().to_string()) 32 | } 33 | } 34 | -------------------------------------------------------------------------------- /storage/src/raft_log_store/metrics.rs: -------------------------------------------------------------------------------- 1 | use std::sync::Arc; 2 | 3 | use lazy_static::lazy_static; 4 | 5 | lazy_static! { 6 | static ref RAFT_LOG_STORE_LATENCY_HISTOGRAM_VEC: prometheus::HistogramVec = 7 | prometheus::register_histogram_vec!( 8 | "raft_log_store_latency_histogram_vec", 9 | "raft log store latency histogram vec", 10 | &["op", "node"], 11 | vec![0.0001, 0.001, 0.005, 0.01, 0.02, 0.05, 0.1, 0.2, 0.5] 12 | ) 13 | .unwrap(); 14 | static ref RAFT_LOG_STORE_BLOCK_CACHE_LATENCY_HISTOGRAM_VEC: prometheus::HistogramVec = 15 | prometheus::register_histogram_vec!( 16 | "raft_log_store_block_cache_latency_histogram_vec", 17 | "raft log store block cache latency histogram vec", 18 | &["op", "node"], 19 | vec![0.00001, 0.0001, 0.0002, 0.0005, 0.001, 0.002, 0.005, 0.01, 0.02, 0.05, 0.1] 20 | ) 21 | .unwrap(); 22 | static ref RAFT_LOG_STORE_THROUGHPUT_GAUGE_VEC: prometheus::GaugeVec = 23 | prometheus::register_gauge_vec!( 24 | "raft_log_store_throughput_gauge_vec", 25 | "raft log store throughput guage vec", 26 | &["op", "node"] 27 | ) 28 | .unwrap(); 29 | static ref RAFT_LOG_STORE_OP_COUNTER_VEC: prometheus::CounterVec = 30 | prometheus::register_counter_vec!( 31 | "raft_log_store_op_counter_vec", 32 | "raft log store op counter vec", 33 | &["op", "node"] 34 | ) 35 | .unwrap(); 36 | static ref RAFT_LOG_STORE_BATCH_WRITERS_HISTOGRAM_VEC: prometheus::HistogramVec = 37 | prometheus::register_histogram_vec!( 38 | "raft_log_store_batch_writers_histogram_vec", 39 | "raft log store batch writers histogram vec", 40 | &["node"], 41 | vec![1.0, 5.0, 10.0, 20.0, 50.0, 100.0, 200.0] 42 | ) 43 | .unwrap(); 44 | static ref RAFT_LOG_STORE_SYNC_SIZE_HISTOGRAM_VEC: prometheus::HistogramVec = 45 | prometheus::register_histogram_vec!( 46 | "raft_log_store_sync_size_histogram_vec", 47 | "raft log store sync size histogram vec", 48 | &["node"], 49 | vec![256.0, 1024.0, 4096.0, 8192.0, 16384.0, 65536.0], 50 | ) 51 | .unwrap(); 52 | } 53 | 54 | pub struct RaftLogStoreMetrics { 55 | pub sync_latency_histogram: prometheus::Histogram, 56 | pub sync_size_histogram: prometheus::Histogram, 57 | 58 | pub append_latency_histogram: prometheus::Histogram, 59 | 60 | pub append_log_latency_histogram: prometheus::Histogram, 61 | pub append_log_throughput_guage: prometheus::Gauge, 62 | 63 | pub batch_writers_histogram: prometheus::Histogram, 64 | 65 | pub block_cache_get_latency_histogram: prometheus::Histogram, 66 | pub block_cache_insert_latency_histogram: prometheus::Histogram, 67 | pub 
block_cache_fill_latency_histogram: prometheus::Histogram, 68 | } 69 | 70 | pub type RaftLogStoreMetricsRef = Arc; 71 | 72 | impl RaftLogStoreMetrics { 73 | pub fn new(node: u64) -> Self { 74 | Self { 75 | sync_latency_histogram: RAFT_LOG_STORE_LATENCY_HISTOGRAM_VEC 76 | .get_metric_with_label_values(&["sync", &node.to_string()]) 77 | .unwrap(), 78 | sync_size_histogram: RAFT_LOG_STORE_SYNC_SIZE_HISTOGRAM_VEC 79 | .get_metric_with_label_values(&[&node.to_string()]) 80 | .unwrap(), 81 | 82 | append_latency_histogram: RAFT_LOG_STORE_LATENCY_HISTOGRAM_VEC 83 | .get_metric_with_label_values(&["append", &node.to_string()]) 84 | .unwrap(), 85 | 86 | append_log_latency_histogram: RAFT_LOG_STORE_LATENCY_HISTOGRAM_VEC 87 | .get_metric_with_label_values(&["append_log", &node.to_string()]) 88 | .unwrap(), 89 | append_log_throughput_guage: RAFT_LOG_STORE_THROUGHPUT_GAUGE_VEC 90 | .get_metric_with_label_values(&["append_log", &node.to_string()]) 91 | .unwrap(), 92 | 93 | block_cache_get_latency_histogram: RAFT_LOG_STORE_BLOCK_CACHE_LATENCY_HISTOGRAM_VEC 94 | .get_metric_with_label_values(&["block_cache_get", &node.to_string()]) 95 | .unwrap(), 96 | block_cache_insert_latency_histogram: RAFT_LOG_STORE_BLOCK_CACHE_LATENCY_HISTOGRAM_VEC 97 | .get_metric_with_label_values(&["block_cache_insert", &node.to_string()]) 98 | .unwrap(), 99 | block_cache_fill_latency_histogram: RAFT_LOG_STORE_BLOCK_CACHE_LATENCY_HISTOGRAM_VEC 100 | .get_metric_with_label_values(&["block_cache_fill", &node.to_string()]) 101 | .unwrap(), 102 | 103 | batch_writers_histogram: RAFT_LOG_STORE_BATCH_WRITERS_HISTOGRAM_VEC 104 | .get_metric_with_label_values(&[&node.to_string()]) 105 | .unwrap(), 106 | } 107 | } 108 | } 109 | -------------------------------------------------------------------------------- /storage/src/raft_log_store/mod.rs: -------------------------------------------------------------------------------- 1 | pub mod block_cache; 2 | pub mod entry; 3 | pub mod error; 4 | pub mod file; 5 | pub mod log; 6 | pub mod mem; 7 | pub mod metrics; 8 | pub mod queue; 9 | pub mod store; 10 | 11 | const DEFAULT_LOG_BATCH_SIZE: usize = 8 << 10; 12 | 13 | pub use store::RaftLogStore; 14 | -------------------------------------------------------------------------------- /storage/src/raft_log_store/queue.rs: -------------------------------------------------------------------------------- 1 | use std::collections::VecDeque; 2 | use std::sync::Arc; 3 | 4 | use parking_lot::RwLock; 5 | 6 | use super::file::{ActiveFile, FrozenFile}; 7 | use crate::error::Result; 8 | use crate::raft_log_store::error::RaftLogStoreError; 9 | 10 | #[derive(Debug)] 11 | pub enum LogFile { 12 | Active(ActiveFile), 13 | Frozen(FrozenFile), 14 | } 15 | 16 | struct LogQueueCore { 17 | active: ActiveFile, 18 | frozens: VecDeque, 19 | } 20 | 21 | #[derive(Clone)] 22 | pub struct LogQueue { 23 | node: u64, 24 | core: Arc>, 25 | } 26 | 27 | impl std::fmt::Debug for LogQueue { 28 | fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { 29 | f.debug_struct("LogQueue") 30 | .field("node", &self.node) 31 | .finish() 32 | } 33 | } 34 | 35 | impl LogQueue { 36 | #[tracing::instrument(level = "trace", err)] 37 | pub fn init(node: u64, active: ActiveFile, mut frozens: Vec) -> Result { 38 | frozens.sort_by_key(|frozen| frozen.id()); 39 | if !frozens.is_empty() { 40 | let mut id = frozens.first().unwrap().id(); 41 | for frozen in frozens.iter() { 42 | if frozen.id() != id { 43 | return Err(RaftLogStoreError::RaftLogFileGap { 44 | start: id, 45 | end: frozen.id(), 46 | } 
47 | .into()); 48 | } 49 | id += 1; 50 | } 51 | if active.id() != id { 52 | return Err(RaftLogStoreError::RaftLogFileGap { 53 | start: id, 54 | end: active.id(), 55 | } 56 | .into()); 57 | } 58 | } 59 | Ok(Self { 60 | node, 61 | core: Arc::new(RwLock::new(LogQueueCore { 62 | active, 63 | frozens: VecDeque::from_iter(frozens), 64 | })), 65 | }) 66 | } 67 | 68 | #[tracing::instrument(level = "trace")] 69 | pub async fn rotate(&self, active: ActiveFile, frozen: FrozenFile) { 70 | let mut core = self.core.write(); 71 | 72 | core.active = active; 73 | core.frozens.push_back(frozen); 74 | } 75 | 76 | #[tracing::instrument(level = "trace", ret)] 77 | pub fn active(&self) -> ActiveFile { 78 | self.core.read().active.clone() 79 | } 80 | 81 | #[tracing::instrument(level = "trace", ret, err)] 82 | pub fn file(&self, id: u64) -> Result { 83 | let core = self.core.read(); 84 | if id == core.active.id() { 85 | return Ok(LogFile::Active(core.active.clone())); 86 | } 87 | if core.frozens.is_empty() { 88 | return Err(RaftLogStoreError::RaftLogFileNotFound(id).into()); 89 | } 90 | let first = core.frozens[0].id(); 91 | if id < first || (id - first) as usize >= core.frozens.len() { 92 | return Err(RaftLogStoreError::RaftLogFileNotFound(id).into()); 93 | } 94 | Ok(LogFile::Frozen(core.frozens[(id - first) as usize].clone())) 95 | } 96 | 97 | #[tracing::instrument(level = "trace", ret)] 98 | pub fn frozen_file_count(&self) -> usize { 99 | self.core.read().frozens.len() 100 | } 101 | 102 | #[tracing::instrument(level = "trace", ret)] 103 | pub fn frozens(&self) -> Vec { 104 | let frozens = self.core.read().frozens.clone(); 105 | Vec::from_iter(frozens) 106 | } 107 | } 108 | 109 | #[cfg(test)] 110 | mod tests { 111 | use super::*; 112 | 113 | fn is_send_sync_clone() {} 114 | 115 | #[test] 116 | fn ensure_send_sync_clone() { 117 | is_send_sync_clone::(); 118 | } 119 | } 120 | -------------------------------------------------------------------------------- /storage/src/tiered_cache/file_cache/alloc.rs: -------------------------------------------------------------------------------- 1 | // Copyright 2022 Singularity Data 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
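// A brief, hedged note on why this allocator exists: buffers handed to direct
// I/O (`O_DIRECT`) must be aligned to the device's logical block size or the
// reads and writes fail with `EINVAL`; the allocator below over-aligns every
// allocation to `ALIGN` so that `DioBuffer`s satisfy that requirement.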
14 | 15 | pub struct AlignedAllocator; 16 | 17 | use std::alloc::{Allocator, Global}; 18 | 19 | use super::utils; 20 | 21 | unsafe impl Allocator for AlignedAllocator { 22 | fn allocate( 23 | &self, 24 | layout: std::alloc::Layout, 25 | ) -> Result, std::alloc::AllocError> { 26 | let layout = std::alloc::Layout::from_size_align( 27 | layout.size(), 28 | utils::align_up(ALIGN, layout.align()), 29 | ) 30 | .unwrap(); 31 | Global.allocate(layout) 32 | } 33 | 34 | unsafe fn deallocate(&self, ptr: std::ptr::NonNull, layout: std::alloc::Layout) { 35 | let layout = std::alloc::Layout::from_size_align( 36 | layout.size(), 37 | utils::align_up(ALIGN, layout.align()), 38 | ) 39 | .unwrap(); 40 | Global.deallocate(ptr, layout) 41 | } 42 | } 43 | 44 | #[cfg(test)] 45 | mod tests { 46 | use super::*; 47 | 48 | #[test] 49 | fn test_aligned_buffer() { 50 | const ALIGN: usize = 512; 51 | let allocator = AlignedAllocator::; 52 | 53 | let mut buf: Vec = Vec::with_capacity_in(ALIGN * 8, &allocator); 54 | utils::assert_aligned(ALIGN, buf.as_ptr().addr()); 55 | 56 | buf.extend_from_slice(&[b'x'; ALIGN * 8]); 57 | utils::assert_aligned(ALIGN, buf.as_ptr().addr()); 58 | assert_eq!(buf, [b'x'; ALIGN * 8]); 59 | 60 | buf.extend_from_slice(&[b'x'; ALIGN * 8]); 61 | utils::assert_aligned(ALIGN, buf.as_ptr().addr()); 62 | assert_eq!(buf, [b'x'; ALIGN * 16]) 63 | } 64 | } 65 | -------------------------------------------------------------------------------- /storage/src/tiered_cache/file_cache/buffer.rs: -------------------------------------------------------------------------------- 1 | // Copyright 2022 Singularity Data 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | use std::sync::Arc; 16 | 17 | use parking_lot::RwLock; 18 | 19 | use super::LRU_SHARD_BITS; 20 | use crate::tiered_cache::{TieredCacheEntryHolder, TieredCacheKey, TieredCacheValue}; 21 | use crate::utils::lru_cache::LruCache; 22 | 23 | pub type Buffer = Arc>; 24 | 25 | struct TwoLevelBufferCore 26 | where 27 | K: TieredCacheKey, 28 | V: TieredCacheValue, 29 | { 30 | active_buffer: Buffer, 31 | frozen_buffer: Buffer, 32 | } 33 | 34 | impl TwoLevelBufferCore 35 | where 36 | K: TieredCacheKey, 37 | V: TieredCacheValue, 38 | { 39 | fn swap(&mut self) { 40 | // Swap fields of `&mut self` to avoid the borrow checker complaining. 
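// Note: this only exchanges the two `Arc` handles in place; the underlying
// LRU shards are untouched, so the swap itself is O(1) under the outer
// `RwLock` write guard held by the caller.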
41 | std::mem::swap(&mut self.active_buffer, &mut self.frozen_buffer); 42 | } 43 | } 44 | 45 | pub struct TwoLevelBuffer 46 | where 47 | K: TieredCacheKey, 48 | V: TieredCacheValue, 49 | { 50 | capacity: usize, 51 | core: Arc>>, 52 | } 53 | 54 | impl Clone for TwoLevelBuffer 55 | where 56 | K: TieredCacheKey, 57 | V: TieredCacheValue, 58 | { 59 | fn clone(&self) -> Self { 60 | Self { 61 | capacity: self.capacity, 62 | core: Arc::clone(&self.core), 63 | } 64 | } 65 | } 66 | 67 | impl TwoLevelBuffer 68 | where 69 | K: TieredCacheKey, 70 | V: TieredCacheValue, 71 | { 72 | pub fn new(capacity: usize) -> Self { 73 | Self { 74 | capacity, 75 | core: Arc::new(RwLock::new(TwoLevelBufferCore { 76 | active_buffer: Arc::new(LruCache::new(LRU_SHARD_BITS, capacity)), 77 | frozen_buffer: Arc::new(LruCache::new(LRU_SHARD_BITS, capacity)), 78 | })), 79 | } 80 | } 81 | 82 | pub fn insert(&self, hash: u64, key: K, charge: usize, value: V) { 83 | let core = self.core.read(); 84 | core.active_buffer.insert(key, hash, charge, value); 85 | } 86 | 87 | pub fn get(&self, hash: u64, key: &K) -> Option> { 88 | let core = self.core.read(); 89 | if let Some(entry) = core.active_buffer.lookup(hash, key) { 90 | return Some(TieredCacheEntryHolder::from_cached_value(entry)); 91 | } 92 | if let Some(entry) = core.frozen_buffer.lookup(hash, key) { 93 | return Some(TieredCacheEntryHolder::from_cached_value(entry)); 94 | } 95 | None 96 | } 97 | 98 | pub fn erase(&self, hash: u64, key: &K) { 99 | let core = self.core.read(); 100 | core.active_buffer.erase(hash, key); 101 | core.frozen_buffer.erase(hash, key); 102 | } 103 | 104 | pub fn active(&self) -> Buffer { 105 | self.core.read().active_buffer.clone() 106 | } 107 | 108 | pub fn frozen(&self) -> Buffer { 109 | self.core.read().frozen_buffer.clone() 110 | } 111 | 112 | pub fn swap(&self) { 113 | self.core.write().swap(); 114 | } 115 | 116 | pub fn rotate(&self) -> Buffer { 117 | let mut buffer = Arc::new(LruCache::new(LRU_SHARD_BITS, self.capacity)); 118 | let mut core = self.core.write(); 119 | std::mem::swap(&mut buffer, &mut core.active_buffer); 120 | std::mem::swap(&mut buffer, &mut core.frozen_buffer); 121 | buffer 122 | } 123 | } 124 | -------------------------------------------------------------------------------- /storage/src/tiered_cache/file_cache/error.rs: -------------------------------------------------------------------------------- 1 | // Copyright 2022 Singularity Data 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | 15 | #[derive(thiserror::Error, Debug)] 16 | pub enum Error { 17 | #[error("io error: {0}")] 18 | Io(#[from] std::io::Error), 19 | #[error("nix error: {0}")] 20 | Nix(#[from] nix::errno::Errno), 21 | #[error("unsupported file system, super block magic: {0}")] 22 | UnsupportedFilesystem(i64), 23 | #[error("invalid slot: {0}")] 24 | InvalidSlot(usize), 25 | #[error("other error: {0}")] 26 | Other(String), 27 | } 28 | 29 | pub type Result<T> = core::result::Result<T, Error>; 30 | -------------------------------------------------------------------------------- /storage/src/tiered_cache/file_cache/mod.rs: -------------------------------------------------------------------------------- 1 | // Copyright 2022 Singularity Data 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | pub mod alloc; 16 | pub mod buffer; 17 | pub mod cache; 18 | pub mod error; 19 | pub mod file; 20 | pub mod meta; 21 | pub mod metrics; 22 | pub mod store; 23 | pub mod utils; 24 | 25 | #[cfg(test)] 26 | pub mod test_utils; 27 | 28 | async fn asyncify<F, T>(f: F) -> error::Result<T> 29 | where 30 | F: FnOnce() -> error::Result<T> + Send + 'static, 31 | T: Send + 'static, 32 | { 33 | match tokio::task::spawn_blocking(f).await { 34 | Ok(res) => res, 35 | Err(_) => Err(error::Error::Other("background task failed".to_string())), 36 | } 37 | } 38 | 39 | /// The logical block size of the underlying storage (typically 512 bytes). 40 | /// 41 | /// Can be determined using the `ioctl(2)` `BLKSSZGET` operation or from the shell using the command: 42 | /// 43 | /// ```bash 44 | /// blockdev --getss 45 | /// ``` 46 | /// 47 | /// For more details, see the man open(2) NOTES section. 48 | const LOGICAL_BLOCK_SIZE: usize = 512; 49 | /// Size of `st_blocks` with `fstat(2)`. 50 | const ST_BLOCK_SIZE: usize = 512; 51 | 52 | const LRU_SHARD_BITS: usize = 5; 53 | 54 | type DioBuffer = Vec<u8, &'static alloc::AlignedAllocator<LOGICAL_BLOCK_SIZE>>; 55 | 56 | static DIO_BUFFER_ALLOCATOR: alloc::AlignedAllocator<LOGICAL_BLOCK_SIZE> = 57 | alloc::AlignedAllocator::<LOGICAL_BLOCK_SIZE>; 58 | -------------------------------------------------------------------------------- /storage/src/tiered_cache/file_cache/test_utils.rs: -------------------------------------------------------------------------------- 1 | // Copyright 2022 Singularity Data 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License.
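// Test-only helpers for the file cache: a fixed-size test key/value pair, a channel-driven flush hook, and a `datasize` helper that measures the real on-disk size of a sparse file.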
14 | 15 | use std::path::Path; 16 | use std::sync::Arc; 17 | 18 | use async_trait::async_trait; 19 | use bytes::{Buf, BufMut}; 20 | use tokio::sync::{mpsc, Mutex}; 21 | 22 | use super::cache::FlushBufferHook; 23 | use super::error::Result; 24 | use crate::tiered_cache::{TieredCacheKey, TieredCacheValue}; 25 | 26 | #[derive(Clone, Hash, Debug, PartialEq, Eq)] 27 | pub struct TestCacheKey(pub u64); 28 | 29 | impl TieredCacheKey for TestCacheKey { 30 | fn encoded_len() -> usize { 31 | 8 32 | } 33 | 34 | fn encode(&self, mut buf: &mut [u8]) { 35 | buf.put_u64(self.0); 36 | } 37 | 38 | fn decode(mut buf: &[u8]) -> Self { 39 | Self(buf.get_u64()) 40 | } 41 | } 42 | 43 | pub type TestCacheValue = Vec<u8>; 44 | 45 | impl TieredCacheValue for Vec<u8> { 46 | fn len(&self) -> usize { 47 | Vec::len(self) 48 | } 49 | 50 | fn encoded_len(&self) -> usize { 51 | self.len() 52 | } 53 | 54 | fn encode(&self, mut buf: &mut [u8]) { 55 | buf.put_slice(self) 56 | } 57 | 58 | fn decode(buf: Vec<u8>) -> Self { 59 | buf.to_vec() 60 | } 61 | } 62 | 63 | pub fn key(v: u64) -> TestCacheKey { 64 | TestCacheKey(v) 65 | } 66 | 67 | #[derive(Clone)] 68 | pub struct FlushHolder { 69 | pre_sender: mpsc::UnboundedSender<()>, 70 | pre_receiver: Arc<Mutex<mpsc::UnboundedReceiver<()>>>, 71 | 72 | post_sender: mpsc::UnboundedSender<()>, 73 | post_receiver: Arc<Mutex<mpsc::UnboundedReceiver<()>>>, 74 | } 75 | 76 | impl Default for FlushHolder { 77 | fn default() -> Self { 78 | let (tx0, rx0) = mpsc::unbounded_channel(); 79 | let (tx1, rx1) = mpsc::unbounded_channel(); 80 | Self { 81 | pre_sender: tx0, 82 | pre_receiver: Arc::new(Mutex::new(rx0)), 83 | 84 | post_sender: tx1, 85 | post_receiver: Arc::new(Mutex::new(rx1)), 86 | } 87 | } 88 | } 89 | 90 | impl FlushHolder { 91 | pub fn trigger(&self) { 92 | self.pre_sender.send(()).unwrap(); 93 | } 94 | 95 | pub async fn wait(&self) { 96 | self.post_receiver.lock().await.recv().await.unwrap(); 97 | } 98 | } 99 | 100 | #[async_trait] 101 | impl FlushBufferHook for FlushHolder { 102 | async fn pre_flush(&self) -> Result<()> { 103 | self.pre_receiver.lock().await.recv().await.unwrap(); 104 | Ok(()) 105 | } 106 | 107 | async fn post_flush(&self, _bytes: usize) -> Result<()> { 108 | self.post_sender.send(()).unwrap(); 109 | Ok(()) 110 | } 111 | } 112 | 113 | /// `datasize()` returns the actual data size of a file. 114 | /// 115 | /// File systems like ext4 take metadata blocks into account in `stat.st_blocks` of `fstat(2)`. 116 | /// So it's not accurate if you really want to know the data size of a sparse file with `fstat`. 117 | /// 118 | /// `datasize` is implemented by iterating the `fiemap` of the file. 119 | pub fn datasize(path: impl AsRef<Path>) -> Result<usize> { 120 | let mut size = 0; 121 | 122 | let fm = fiemap::fiemap(path)?; 123 | for fe in fm { 124 | let fe = fe.unwrap(); 125 | size += fe.fe_length as usize; 126 | } 127 | 128 | Ok(size) 129 | } 130 | -------------------------------------------------------------------------------- /storage/src/utils/bloom.rs: -------------------------------------------------------------------------------- 1 | // Ported from [AgateDB](https://github.com/tikv/agatedb) with [license](https://github.com/tikv/agatedb/blob/master/LICENSE). 2 | 3 | // TODO: Refactor this in rusty style.
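// A classic Bloom filter: each key contributes k probe positions derived from a single 32-bit hash via double hashing (a rotated delta added repeatedly), so lookups may return false positives but never false negatives.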
4 | 5 | use bytes::BufMut; 6 | 7 | pub trait BitSlice { 8 | fn get_bit(&self, idx: usize) -> bool; 9 | fn bit_len(&self) -> usize; 10 | } 11 | 12 | pub trait BitSliceMut { 13 | fn set_bit(&mut self, idx: usize, val: bool); 14 | } 15 | 16 | impl<T: AsRef<[u8]>> BitSlice for T { 17 | fn get_bit(&self, idx: usize) -> bool { 18 | let pos = idx / 8; 19 | let offset = idx % 8; 20 | (self.as_ref()[pos] & (1 << offset)) != 0 21 | } 22 | 23 | fn bit_len(&self) -> usize { 24 | self.as_ref().len() * 8 25 | } 26 | } 27 | 28 | impl<T: AsMut<[u8]>> BitSliceMut for T { 29 | fn set_bit(&mut self, idx: usize, val: bool) { 30 | let pos = idx / 8; 31 | let offset = idx % 8; 32 | if val { 33 | self.as_mut()[pos] |= 1 << offset; 34 | } else { 35 | self.as_mut()[pos] &= !(1 << offset); 36 | } 37 | } 38 | } 39 | 40 | /// Bloom implements bloom filter functionalities over 41 | /// a bit-slice of data. 42 | pub struct Bloom<'a> { 43 | /// data of filter in bits 44 | filter: &'a [u8], 45 | /// number of hash functions 46 | k: u8, 47 | } 48 | 49 | impl<'a> Bloom<'a> { 50 | /// Create a bloom filter from a byte slice 51 | pub fn new(buf: &'a [u8]) -> Self { 52 | let filter = &buf[..buf.len() - 1]; 53 | let k = buf[buf.len() - 1]; 54 | Self { filter, k } 55 | } 56 | 57 | /// Get bloom filter bits per key from entries count and FPR 58 | pub fn bloom_bits_per_key(entries: usize, false_positive_rate: f64) -> usize { 59 | let size = 60 | -1.0 * (entries as f64) * false_positive_rate.ln() / std::f64::consts::LN_2.powi(2); 61 | let locs = (std::f64::consts::LN_2 * size / (entries as f64)).ceil(); 62 | locs as usize 63 | } 64 | 65 | /// Build bloom filter from key hashes 66 | pub fn build_from_key_hashes(keys: &[u32], bits_per_key: usize) -> Vec<u8> { 67 | // 0.69 is approximately ln(2) 68 | let k = ((bits_per_key as f64) * 0.69) as u32; 69 | // limit k in [1, 30] 70 | let k = k.clamp(1, 30); 71 | // For small len(keys), we set a minimum bloom filter length to avoid high FPR 72 | let nbits = (keys.len() * bits_per_key).max(64); 73 | let nbytes = (nbits + 7) / 8; 74 | // nbits is always a multiple of 8 75 | let nbits = nbytes * 8; 76 | let mut filter = Vec::with_capacity(nbytes + 1); 77 | filter.resize(nbytes, 0); 78 | for h in keys { 79 | let mut h = *h; 80 | let delta = (h >> 17) | (h << 15); 81 | for _ in 0..k { 82 | let bit_pos = (h as usize) % nbits; 83 | filter.set_bit(bit_pos, true); 84 | h = h.wrapping_add(delta); 85 | } 86 | } 87 | filter.put_u8(k as u8); 88 | filter 89 | } 90 | 91 | /// Check if a bloom filter may contain some data 92 | pub fn may_contain(&self, mut h: u32) -> bool { 93 | if self.k > 30 { 94 | // potential new encoding for short bloom filters 95 | true 96 | } else { 97 | let nbits = self.filter.bit_len(); 98 | let delta = (h >> 17) | (h << 15); 99 | for _ in 0..self.k { 100 | let bit_pos = h % (nbits as u32); 101 | if !self.filter.get_bit(bit_pos as usize) { 102 | return false; 103 | } 104 | h = h.wrapping_add(delta); 105 | } 106 | true 107 | } 108 | } 109 | } 110 | 111 | #[cfg(test)] 112 | mod tests { 113 | use test_log::test; 114 | 115 | use super::*; 116 | 117 | #[test] 118 | fn test_small_bloom_filter() { 119 | let hash: Vec<u32> = vec![b"hello".to_vec(), b"world".to_vec()] 120 | .into_iter() 121 | .map(|x| farmhash::fingerprint32(&x)) 122 | .collect(); 123 | let buf = Bloom::build_from_key_hashes(&hash, 10); 124 | 125 | let check_hash: Vec<u32> = vec![ 126 | b"hello".to_vec(), 127 | b"world".to_vec(), 128 | b"x".to_vec(), 129 | b"fool".to_vec(), 130 | ] 131 | .into_iter() 132 | .map(|x| farmhash::fingerprint32(&x)) 133 | .collect();
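// The two inserted keys must be reported as (possibly) present; the two keys that were never added should be rejected, which is expected with high probability at 10 bits per key.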
134 | 135 | let f = Bloom::new(&buf); 136 | assert_eq!(f.k, 6); 137 | 138 | assert!(f.may_contain(check_hash[0])); 139 | assert!(f.may_contain(check_hash[1])); 140 | assert!(!f.may_contain(check_hash[2])); 141 | assert!(!f.may_contain(check_hash[3])); 142 | } 143 | } 144 | -------------------------------------------------------------------------------- /storage/src/utils/mod.rs: -------------------------------------------------------------------------------- 1 | mod coding; 2 | pub use coding::*; 3 | mod bloom; 4 | pub use bloom::*; 5 | pub mod lru_cache; 6 | -------------------------------------------------------------------------------- /tests/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "runkv-tests" 3 | version = "0.1.0" 4 | edition = "2021" 5 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html 6 | 7 | [[test]] 8 | name = "integrations" 9 | path = "integrations/lib.rs" 10 | 11 | [dependencies] 12 | anyhow = "1.0" 13 | bytes = "1" 14 | bytesize = "1.1.0" 15 | clap = { version = "3.1.6", features = ["derive"] } 16 | env_logger = "*" 17 | futures = "0.3" 18 | itertools = "0.10.3" 19 | lazy_static = "1.4.0" 20 | parking_lot = "0.12" 21 | rand = "0.8.5" 22 | runkv-client = { path = "../client" } 23 | runkv-common = { path = "../common" } 24 | runkv-exhauster = { path = "../exhauster" } 25 | runkv-proto = { path = "../proto" } 26 | runkv-rudder = { path = "../rudder" } 27 | runkv-storage = { path = "../storage" } 28 | runkv-wheel = { path = "../wheel" } 29 | tempfile = "3" 30 | test-log = "0.2.10" 31 | tokio = { version = "1", features = [ 32 | "rt-multi-thread", 33 | "sync", 34 | "macros", 35 | "time", 36 | ] } 37 | toml = "0.4.2" 38 | tonic = "0.6.2" 39 | tracing = "0.1" 40 | 41 | [features] 42 | tracing = ["runkv-wheel/tracing"] 43 | deadlock = ["parking_lot/deadlock_detection"] 44 | verbose-release-log = ["tracing/release_max_level_trace"] 45 | -------------------------------------------------------------------------------- /tests/etc/exhauster.toml: -------------------------------------------------------------------------------- 1 | id = 201 2 | host = "127.0.0.1" 3 | port = 0 4 | data_path = "data" 5 | meta_path = "meta" 6 | heartbeat_interval = "1 s" 7 | 8 | [rudder] 9 | id = 1 10 | host = "127.0.0.1" 11 | port = 0 12 | 13 | [minio] 14 | url = 'minio://minioadmin:minioadmin@127.0.0.1:9000/runkv' 15 | 16 | [s3] 17 | bucket = "runkv" 18 | 19 | [buffer] 20 | write_buffer_capacity = "64 MiB" 21 | 22 | [cache] 23 | block_cache_capacity = "512 MiB" 24 | meta_cache_capacity = "256 MiB" 25 | -------------------------------------------------------------------------------- /tests/etc/lsm_tree.toml: -------------------------------------------------------------------------------- 1 | [lsm_tree] 2 | l1_capacity = "20 MiB" 3 | level_multiplier = 2 4 | 5 | trigger_l0_compaction_ssts = 4 6 | trigger_l0_compaction_interval = "500 ms" 7 | trigger_lmax_compaction_interval = "5 s" 8 | trigger_compaction_interval = "2 s" 9 | 10 | sstable_capacity = "4 MiB" 11 | block_capacity = "1 MiB" 12 | restart_interval = 2 13 | bloom_false_positive = 0.1 14 | 15 | compaction_pin_ttl = "15 s" 16 | 17 | [[lsm_tree.levels_options]] 18 | compaction_strategy = "Overlap" 19 | compression_algorithm = "None" 20 | 21 | [[lsm_tree.levels_options]] 22 | compaction_strategy = "NonOverlap" 23 | compression_algorithm = "None" 24 | 25 | [[lsm_tree.levels_options]] 26 | compaction_strategy = "NonOverlap" 27 | 
compression_algorithm = "None" 28 | 29 | [[lsm_tree.levels_options]] 30 | compaction_strategy = "NonOverlap" 31 | compression_algorithm = "Lz4" 32 | -------------------------------------------------------------------------------- /tests/etc/port.toml: -------------------------------------------------------------------------------- 1 | test_concurrent_put_get = 12300 2 | test_multi_raft_group_concurrent_put_get = 12310 -------------------------------------------------------------------------------- /tests/etc/rudder.toml: -------------------------------------------------------------------------------- 1 | id = 1 2 | host = "127.0.0.1" 3 | port = 0 4 | data_path = "data" 5 | meta_path = "meta" 6 | health_timeout = "10 s" 7 | 8 | # [minio] 9 | # url = 'minio://minioadmin:minioadmin@127.0.0.1:9000/runkv' 10 | 11 | # [s3] 12 | # bucket = "runkv" 13 | 14 | [cache] 15 | block_cache_capacity = "0 B" 16 | meta_cache_capacity = "64 kiB" 17 | -------------------------------------------------------------------------------- /tests/etc/wheel.toml: -------------------------------------------------------------------------------- 1 | id = 101 2 | host = "127.0.0.1" 3 | port = 0 4 | log = ".run/log/" 5 | data_path = "data" 6 | meta_path = "meta" 7 | poll_interval = "100ms" 8 | heartbeat_interval = "100ms" 9 | 10 | [rudder] 11 | id = 1 12 | host = "127.0.0.1" 13 | port = 0 14 | 15 | # [minio] 16 | # url = 'minio://minioadmin:minioadmin@127.0.0.1:9000/runkv' 17 | 18 | # [s3] 19 | # bucket = "runkv" 20 | 21 | [buffer] 22 | write_buffer_capacity = "1 MiB" 23 | 24 | [cache] 25 | block_cache_capacity = "16 MiB" 26 | meta_cache_capacity = "4 MiB" 27 | 28 | [raft_log_store] 29 | log_dir_path = "/path/to/log/dir" 30 | log_file_capacity = "64 MiB" 31 | block_cache_capacity = "256 MiB" 32 | persist = "sync" 33 | 34 | [prometheus] 35 | host = "127.0.0.1" 36 | port = 9898 37 | 38 | [tiered_cache] 39 | type = "FileCache" 40 | [tiered_cache.args] 41 | dir = "/path/to/file/cache/dir" 42 | capacity = "256 MiB" 43 | total_buffer_capacity = "64 MiB" 44 | cache_file_fallocate_unit = "64 MiB" 45 | cache_meta_fallocate_unit = "16 MiB" 46 | cache_file_max_write_size = "2 MiB" -------------------------------------------------------------------------------- /tests/integrations/lib.rs: -------------------------------------------------------------------------------- 1 | use std::fs::read_to_string; 2 | 3 | mod test_concurrent_put_get; 4 | mod test_multi_raft_group_concurrent_put_get; 5 | 6 | const PORT_CONFIG_PATH: &str = "etc/port.toml"; 7 | 8 | const RUDDER_CONFIG_PATH: &str = "etc/rudder.toml"; 9 | const WHEEL_CONFIG_PATH: &str = "etc/wheel.toml"; 10 | const EXHAUSTER_CONFIG_PATH: &str = "etc/exhauster.toml"; 11 | const LSM_TREE_CONFIG_PATH: &str = "etc/lsm_tree.toml"; 12 | 13 | fn port(name: &str) -> u16 { 14 | let table = read_to_string(PORT_CONFIG_PATH) 15 | .unwrap() 16 | .parse::() 17 | .unwrap(); 18 | let value = match table { 19 | toml::Value::Table(ports) => ports[name].clone(), 20 | _ => unreachable!(), 21 | }; 22 | match value { 23 | toml::Value::Integer(port) => port as u16, 24 | _ => unreachable!(), 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /tests/integrations/test_concurrent_put_get.rs: -------------------------------------------------------------------------------- 1 | use std::path::Path; 2 | 3 | use runkv_tests::{run, Args, Options}; 4 | use test_log::test; 5 | 6 | use crate::*; 7 | 8 | #[test(tokio::test)] 9 | async fn test_concurrent_put_get() { 10 | let 
port = crate::port("test_concurrent_put_get"); 11 | 12 | let options = Options { 13 | log: false, 14 | rudder_config_path: RUDDER_CONFIG_PATH.to_string(), 15 | wheel_config_path: WHEEL_CONFIG_PATH.to_string(), 16 | exhauster_config_path: EXHAUSTER_CONFIG_PATH.to_string(), 17 | lsm_tree_config_path: LSM_TREE_CONFIG_PATH.to_string(), 18 | rudder_node_id: 10000, 19 | wheel_node_id_base: 0, 20 | exhauster_node_id_base: 100, 21 | rudder_port: port, 22 | exhauster_port_base: port, 23 | wheel_port_base: port + 1, 24 | wheel_prometheus_port_base: 0, 25 | }; 26 | 27 | let tempdir = tempfile::tempdir().unwrap(); 28 | let raft_log_store_data_dir = Path::new(tempdir.path()) 29 | .join("raft") 30 | .to_str() 31 | .unwrap() 32 | .to_string(); 33 | let log_dir = Path::new(tempdir.path()) 34 | .join("log") 35 | .to_str() 36 | .unwrap() 37 | .to_string(); 38 | let file_cache_dir = Path::new(tempdir.path()) 39 | .join("filecache") 40 | .to_str() 41 | .unwrap() 42 | .to_string(); 43 | 44 | let args = Args { 45 | wheels: 1, 46 | exhausters: 1, 47 | groups: 1, 48 | key_size: 64, 49 | value_size: 64, 50 | concurrency: 1000, 51 | r#loop: 3, 52 | raft_log_store_data_dir, 53 | persist: "none".to_string(), 54 | log_dir, 55 | file_cache_dir, 56 | s3_uri: "memory://".to_string(), 57 | }; 58 | 59 | run(args, options).await; 60 | } 61 | -------------------------------------------------------------------------------- /tests/integrations/test_multi_raft_group_concurrent_put_get.rs: -------------------------------------------------------------------------------- 1 | use std::path::Path; 2 | 3 | use runkv_tests::{run, Args, Options}; 4 | use test_log::test; 5 | 6 | use crate::*; 7 | 8 | #[test(tokio::test)] 9 | async fn test_multi_raft_group_concurrent_put_get() { 10 | let port = crate::port("test_multi_raft_group_concurrent_put_get"); 11 | 12 | let options = Options { 13 | log: false, 14 | rudder_config_path: RUDDER_CONFIG_PATH.to_string(), 15 | wheel_config_path: WHEEL_CONFIG_PATH.to_string(), 16 | exhauster_config_path: EXHAUSTER_CONFIG_PATH.to_string(), 17 | lsm_tree_config_path: LSM_TREE_CONFIG_PATH.to_string(), 18 | rudder_node_id: 10000, 19 | wheel_node_id_base: 0, 20 | exhauster_node_id_base: 100, 21 | rudder_port: port, 22 | exhauster_port_base: port, 23 | wheel_port_base: port + 1, 24 | wheel_prometheus_port_base: 0, 25 | }; 26 | 27 | let tempdir = tempfile::tempdir().unwrap(); 28 | let raft_log_store_data_dir = Path::new(tempdir.path()) 29 | .join("raft") 30 | .to_str() 31 | .unwrap() 32 | .to_string(); 33 | let log_dir = Path::new(tempdir.path()) 34 | .join("log") 35 | .to_str() 36 | .unwrap() 37 | .to_string(); 38 | let file_cache_dir = Path::new(tempdir.path()) 39 | .join("filecache") 40 | .to_str() 41 | .unwrap() 42 | .to_string(); 43 | 44 | let args = Args { 45 | wheels: 1, 46 | exhausters: 1, 47 | groups: 10, 48 | key_size: 64, 49 | value_size: 64, 50 | concurrency: 100, 51 | r#loop: 3, 52 | raft_log_store_data_dir, 53 | persist: "none".to_string(), 54 | log_dir, 55 | file_cache_dir, 56 | s3_uri: "memory://".to_string(), 57 | }; 58 | 59 | run(args, options).await; 60 | } 61 | -------------------------------------------------------------------------------- /wheel/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "runkv-wheel" 3 | version = "0.1.0" 4 | edition = "2021" 5 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html 6 | 7 | [dependencies] 8 | anyhow = "1.0" 9 | async-trait = "0.1" 10 | 
bincode = "1.3.3" 11 | bytes = "1" 12 | bytesize = { version = "1.1.0", features = ["serde"] } 13 | chrono = { version = "0.4", features = ["serde"] } 14 | clap = { version = "3.1.6", features = ["derive"] } 15 | futures = "0.3" 16 | futures-util = "0.3" 17 | http = "0.2.6" 18 | humantime = "2.1.0" 19 | humantime-serde = "1.1.1" 20 | hyper = { version = "^0.14", features = ["server", "http1", "tcp"] } 21 | itertools = "0.10.3" 22 | lazy_static = "1.4.0" 23 | moka = { version = "0.7", features = ["future"] } 24 | parking_lot = "0.12" 25 | prometheus = "0.13.0" 26 | prost = "0.9" 27 | raft = { git = "https://github.com/mrcroxx/raft-rs", rev = "710b3a9cf2342cdcc1d7b43e945490945024ecd2" } 28 | # Uncomment this line if you want to debug raft-rs locally. 29 | # raft = { path = "../../raft-rs" } 30 | rand = "0.8.5" 31 | runkv-common = { path = "../common" } 32 | runkv-proto = { path = "../proto" } 33 | runkv-storage = { path = "../storage" } 34 | serde = "1.0" 35 | serde_derive = "1.0" 36 | slog = "2.7" # Required by "raft". 37 | tempfile = "3" 38 | thiserror = "1.0" 39 | tokio = { version = "1", features = [ 40 | "rt-multi-thread", 41 | "sync", 42 | "macros", 43 | "time", 44 | ] } 45 | toml = "0.4.2" 46 | tonic = "0.6.2" 47 | tracing = "0.1" 48 | tracing-subscriber = "0.3" 49 | 50 | [target.'cfg(not(target_env = "msvc"))'.dependencies] 51 | tikv-jemallocator = "0.4.3" 52 | 53 | [dev-dependencies] 54 | assert_matches = "1.5.0" 55 | env_logger = "*" 56 | test-log = "0.2.10" 57 | 58 | [features] 59 | tracing = ["runkv-common/tracing"] 60 | deadlock = [] 61 | verbose-release-log = ["tracing/release_max_level_trace"] 62 | -------------------------------------------------------------------------------- /wheel/src/components/command.rs: -------------------------------------------------------------------------------- 1 | use std::collections::hash_map::DefaultHasher; 2 | use std::hash::{Hash, Hasher}; 3 | 4 | use runkv_common::coding::BytesSerde; 5 | use runkv_proto::kv::KvRequest; 6 | use serde::{Deserialize, Serialize}; 7 | 8 | #[derive(Serialize, Deserialize, Clone, Debug)] 9 | pub enum Command { 10 | KvRequest { 11 | request_id: u64, 12 | sequence: u64, 13 | request: KvRequest, 14 | }, 15 | CompactRaftLog { 16 | index: u64, 17 | sequence: u64, 18 | }, 19 | } 20 | 21 | impl<'de> BytesSerde<'de> for Command {} 22 | 23 | impl Command { 24 | pub fn id(&self) -> u64 { 25 | match self { 26 | Self::KvRequest { request_id, .. } => *request_id, 27 | Self::CompactRaftLog { index, sequence } => { 28 | let mut hasher = DefaultHasher::default(); 29 | index.hash(&mut hasher); 30 | sequence.hash(&mut hasher); 31 | hasher.finish() 32 | } 33 | } 34 | } 35 | 36 | pub fn is_read_only(&self) -> bool { 37 | match self { 38 | Self::KvRequest { request, .. } => request.is_read_only(), 39 | Self::CompactRaftLog { .. 
} => false, 40 | } 41 | } 42 | } 43 | -------------------------------------------------------------------------------- /wheel/src/components/mod.rs: -------------------------------------------------------------------------------- 1 | pub mod command; 2 | pub mod fsm; 3 | pub mod lsm_tree; 4 | pub mod raft_log_store; 5 | pub mod raft_manager; 6 | pub mod raft_network; 7 | pub mod read_only_cmd_pool; 8 | -------------------------------------------------------------------------------- /wheel/src/components/read_only_cmd_pool.rs: -------------------------------------------------------------------------------- 1 | use std::collections::HashMap; 2 | use std::sync::Arc; 3 | 4 | use itertools::Itertools; 5 | use parking_lot::Mutex; 6 | use tracing::trace; 7 | 8 | use super::command::Command; 9 | 10 | struct ReadyItem { 11 | index: u64, 12 | cmds: Vec<Command>, 13 | } 14 | 15 | struct ReadOnlyCmdPoolCore { 16 | /// { id -> [cmd] } 17 | pending: Mutex<HashMap<u64, Vec<Command>>>, 18 | ready: Mutex<Vec<ReadyItem>>, 19 | } 20 | 21 | #[derive(Clone)] 22 | pub struct ReadOnlyCmdPool { 23 | core: Arc<ReadOnlyCmdPoolCore>, 24 | } 25 | 26 | impl Default for ReadOnlyCmdPool { 27 | fn default() -> Self { 28 | Self { 29 | core: Arc::new(ReadOnlyCmdPoolCore { 30 | pending: Mutex::new(HashMap::new()), 31 | ready: Mutex::new(vec![]), 32 | }), 33 | } 34 | } 35 | } 36 | 37 | impl ReadOnlyCmdPool { 38 | pub fn append(&self, id: u64, cmds: Vec<Command>) { 39 | assert!(self.core.pending.lock().insert(id, cmds).is_none()); 40 | } 41 | 42 | pub fn ready(&self, id: u64, index: u64) { 43 | let cmds = match self.core.pending.lock().remove(&id) { 44 | None => { 45 | trace!("no read-only cmds found at: {}", index); 46 | return; 47 | } 48 | Some(cmds) => cmds, 49 | }; 50 | let item = ReadyItem { index, cmds }; 51 | let mut ready = self.core.ready.lock(); 52 | if let Some(last) = ready.last() { 53 | assert!(last.index <= index); 54 | } 55 | ready.push(item); 56 | } 57 | 58 | pub fn split(&self, index: u64) -> Vec<Command> { 59 | let mut ready = self.core.ready.lock(); 60 | let p = ready.partition_point(|item| item.index <= index); 61 | let cmds = ready.drain(..p).flat_map(|item| item.cmds).collect_vec(); 62 | cmds 63 | } 64 | } 65 | 66 | #[cfg(test)] 67 | mod tests { 68 | use super::*; 69 | 70 | fn is_send_sync_clone<T: Send + Sync + Clone>() {} 71 | 72 | #[test] 73 | fn ensure_send_sync_clone() { 74 | is_send_sync_clone::<ReadOnlyCmdPool>(); 75 | } 76 | } 77 | -------------------------------------------------------------------------------- /wheel/src/config.rs: -------------------------------------------------------------------------------- 1 | use runkv_common::config::{ 2 | CacheConfig, LsmTreeConfig, MinioConfig, Node, PrometheusConfig, S3Config, 3 | }; 4 | use serde::Deserialize; 5 | 6 | #[derive(Deserialize, Clone, Debug)] 7 | pub struct WheelConfig { 8 | pub id: u64, 9 | pub host: String, 10 | pub port: u16, 11 | pub log: String, 12 | pub data_path: String, 13 | pub meta_path: String, 14 | pub poll_interval: String, 15 | pub heartbeat_interval: String, 16 | pub rudder: Node, 17 | pub s3: Option<S3Config>, 18 | pub minio: Option<MinioConfig>, 19 | pub buffer: BufferConfig, 20 | pub cache: CacheConfig, 21 | pub lsm_tree: LsmTreeConfig, 22 | pub raft_log_store: RaftLogStoreConfig, 23 | pub tiered_cache: TieredCacheConfig, 24 | pub prometheus: PrometheusConfig, 25 | } 26 | 27 | #[derive(Deserialize, Clone, Debug)] 28 | pub struct BufferConfig { 29 | pub write_buffer_capacity: String, 30 | } 31 | 32 | #[derive(Deserialize, Clone, Debug)] 33 | pub struct RaftLogStoreConfig { 34 | pub log_dir_path: String, 35 | pub log_file_capacity: String, 36 | pub block_cache_capacity:
String, 37 | pub persist: String, 38 | } 39 | 40 | #[derive(Deserialize, Clone, Debug, PartialEq, Eq)] 41 | #[serde(tag = "type", content = "args")] 42 | pub enum TieredCacheConfig { 43 | None, 44 | FileCache(FileCacheConfig), 45 | } 46 | 47 | #[derive(Deserialize, Clone, Debug, PartialEq, Eq)] 48 | pub struct FileCacheConfig { 49 | pub dir: String, 50 | pub capacity: String, 51 | pub total_buffer_capacity: String, 52 | pub cache_file_fallocate_unit: String, 53 | pub cache_meta_fallocate_unit: String, 54 | pub cache_file_max_write_size: String, 55 | } 56 | 57 | #[cfg(test)] 58 | mod tests { 59 | use super::{FileCacheConfig, TieredCacheConfig}; 60 | 61 | #[test] 62 | fn test_tiered_cache_config_parse() { 63 | let text = r#"type = "None""#; 64 | let config: TieredCacheConfig = toml::from_str(text).unwrap(); 65 | assert_eq!(TieredCacheConfig::None, config); 66 | 67 | let text = r#" 68 | type = "FileCache" 69 | [args] 70 | dir = "" 71 | capacity = "" 72 | total_buffer_capacity = "" 73 | cache_file_fallocate_unit = "" 74 | cache_meta_fallocate_unit = "" 75 | cache_file_max_write_size = "" 76 | "#; 77 | let config: TieredCacheConfig = toml::from_str(text).unwrap(); 78 | assert_eq!( 79 | TieredCacheConfig::FileCache(FileCacheConfig { 80 | dir: "".to_string(), 81 | capacity: "".to_string(), 82 | total_buffer_capacity: "".to_string(), 83 | cache_file_fallocate_unit: "".to_string(), 84 | cache_meta_fallocate_unit: "".to_string(), 85 | cache_file_max_write_size: "".to_string(), 86 | }), 87 | config 88 | ); 89 | } 90 | } 91 | -------------------------------------------------------------------------------- /wheel/src/error.rs: -------------------------------------------------------------------------------- 1 | use runkv_proto::meta::KeyRange; 2 | use tonic::Status; 3 | 4 | #[derive(thiserror::Error, Debug)] 5 | pub enum Error { 6 | #[error("config error: {0}")] 7 | ConfigError(String), 8 | #[error("storage error: {0}")] 9 | StorageError(#[from] runkv_storage::Error), 10 | #[error("transport error: {0}")] 11 | TransportError(#[from] tonic::transport::Error), 12 | #[error("rpc status error: {0}")] 13 | RpcStatus(#[from] Status), 14 | #[error("serde error: {0}")] 15 | SerdeError(String), 16 | #[error("raft error: {0}")] 17 | RaftError(#[from] raft::Error), 18 | #[error("raft manage error: {0}")] 19 | RaftManagerError(#[from] RaftManageError), 20 | #[error("meta error: {0}")] 21 | MetaError(#[from] MetaError), 22 | #[error("kv error: {0}")] 23 | KvError(#[from] KvError), 24 | #[error("other: {0}")] 25 | Other(String), 26 | } 27 | 28 | impl Error { 29 | pub fn err(e: impl Into>) -> Self { 30 | Self::Other(e.into().to_string()) 31 | } 32 | 33 | pub fn config_err(e: impl Into>) -> Self { 34 | Self::ConfigError(e.into().to_string()) 35 | } 36 | 37 | pub fn serde_err(e: impl Into>) -> Self { 38 | Self::SerdeError(e.into().to_string()) 39 | } 40 | } 41 | 42 | pub type Result = std::result::Result; 43 | 44 | #[derive(thiserror::Error, Debug)] 45 | pub enum RaftManageError { 46 | #[error("raft group already exists")] 47 | RaftGroupAlreadyExists(u64), 48 | #[error("raft group not exists")] 49 | RaftGroupNotExists(u64), 50 | #[error("raft node not exists: [raft node: {raft_node}] [node: {node}]")] 51 | RaftNodeNotExists { raft_node: u64, node: u64 }, 52 | #[error("raft node already exists: [group: {group}] [raft node: {raft_node}] [node: {node}]")] 53 | RaftNodeAlreadyExists { 54 | group: u64, 55 | raft_node: u64, 56 | node: u64, 57 | }, 58 | #[error("other: {0}")] 59 | Other(String), 60 | } 61 | 62 | impl 
RaftManageError { 63 | pub fn err(e: impl Into>) -> Self { 64 | Self::Other(e.into().to_string()) 65 | } 66 | } 67 | 68 | #[derive(thiserror::Error, Debug)] 69 | pub enum MetaError { 70 | #[error("key range overlaps: {r1:?} {r2:?}")] 71 | KeyRangeOverlaps { r1: KeyRange, r2: KeyRange }, 72 | } 73 | 74 | #[derive(thiserror::Error, Debug)] 75 | pub enum KvError { 76 | #[error("ops include invalid shard or ops cross multiple shards: {0}")] 77 | InvalidShard(String), 78 | #[error("no valid leader in raft group {0}")] 79 | NoValidLeader(u64), 80 | } 81 | -------------------------------------------------------------------------------- /wheel/src/main.rs: -------------------------------------------------------------------------------- 1 | #[cfg(not(target_env = "msvc"))] 2 | use tikv_jemallocator::Jemalloc; 3 | 4 | #[cfg(not(target_env = "msvc"))] 5 | #[global_allocator] 6 | static GLOBAL: Jemalloc = Jemalloc; 7 | 8 | use std::fs::read_to_string; 9 | 10 | use clap::Parser; 11 | use runkv_wheel::config::WheelConfig; 12 | use runkv_wheel::error::{Error, Result}; 13 | use runkv_wheel::{bootstrap_wheel, build_wheel}; 14 | use tracing::info; 15 | use tracing_subscriber::FmtSubscriber; 16 | 17 | #[derive(Parser, Debug)] 18 | struct Args { 19 | #[clap(short, long, default_value = "etc/wheel.toml")] 20 | config_file_path: String, 21 | } 22 | 23 | #[tokio::main] 24 | async fn main() -> Result<()> { 25 | let subscriber = FmtSubscriber::new(); 26 | tracing::subscriber::set_global_default(subscriber).map_err(Error::err)?; 27 | 28 | let args = Args::parse(); 29 | info!("args: {:?}", args); 30 | 31 | let config: WheelConfig = 32 | toml::from_str(&read_to_string(&args.config_file_path).map_err(Error::err)?) 33 | .map_err(Error::config_err)?; 34 | info!("config: {:?}", config); 35 | 36 | let (wheel, workers) = build_wheel(&config).await?; 37 | bootstrap_wheel(&config, wheel, workers).await 38 | } 39 | -------------------------------------------------------------------------------- /wheel/src/meta/mem.rs: -------------------------------------------------------------------------------- 1 | use std::collections::{BTreeMap, HashMap}; 2 | use std::sync::Arc; 3 | 4 | use async_trait::async_trait; 5 | use itertools::Itertools; 6 | use parking_lot::RwLock; 7 | use runkv_proto::meta::KeyRange; 8 | 9 | use super::{in_range, is_overlap, MetaStore}; 10 | use crate::error::{MetaError, Result}; 11 | 12 | type RaftStates = Arc>>>; 13 | 14 | #[derive(Default)] 15 | struct MemoryMetaStoreCore { 16 | /// `{ [start key .. 
end key) -> (group, [raft node 1, raft node 2, ..]) }` 17 | key_ranges: BTreeMap<KeyRange, (u64, Vec<u64>)>, 18 | } 19 | 20 | #[derive(Default)] 21 | pub struct MemoryMetaStore { 22 | inner: RwLock<MemoryMetaStoreCore>, 23 | 24 | raft_states: RaftStates, 25 | } 26 | 27 | impl MemoryMetaStore {} 28 | 29 | #[async_trait] 30 | impl MetaStore for MemoryMetaStore { 31 | async fn add_key_range( 32 | &self, 33 | key_range: KeyRange, 34 | group: u64, 35 | raft_nodes: &[u64], 36 | ) -> Result<()> { 37 | let mut guard = self.inner.write(); 38 | for r in guard.key_ranges.keys() { 39 | if is_overlap(r, &key_range) { 40 | return Err(MetaError::KeyRangeOverlaps { 41 | r1: r.to_owned(), 42 | r2: key_range, 43 | } 44 | .into()); 45 | } 46 | } 47 | guard 48 | .key_ranges 49 | .insert(key_range, (group, raft_nodes.to_vec())); 50 | Ok(()) 51 | } 52 | 53 | async fn key_ranges(&self) -> Result<Vec<KeyRange>> { 54 | let guard = self.inner.read(); 55 | Ok(guard.key_ranges.keys().cloned().collect_vec()) 56 | } 57 | 58 | async fn in_range(&self, key: &[u8]) -> Result<Option<(KeyRange, u64, Vec<u64>)>> { 59 | let guard = self.inner.read(); 60 | for (r, (group, raft_nodes)) in guard.key_ranges.iter() { 61 | if in_range(key, r) { 62 | return Ok(Some((r.to_owned(), *group, raft_nodes.to_owned()))); 63 | } 64 | } 65 | Ok(None) 66 | } 67 | 68 | async fn all_in_range(&self, keys: &[&[u8]]) -> Result<Option<(KeyRange, u64, Vec<u64>)>> { 69 | if keys.is_empty() { 70 | return Ok(None); 71 | } 72 | let guard = self.inner.read(); 73 | let mut result = None; 74 | for (r, (group, raft_nodes)) in guard.key_ranges.iter() { 75 | if in_range(keys[0], r) { 76 | result = Some((r.to_owned(), *group, raft_nodes.to_owned())); 77 | break; 78 | } 79 | } 80 | if result.is_none() { 81 | return Ok(None); 82 | } 83 | let (range, group, raft_nodes) = result.unwrap(); 84 | for key in &keys[1..] { 85 | if !in_range(key, &range) { 86 | return Ok(None); 87 | } 88 | } 89 | Ok(Some((range, group, raft_nodes))) 90 | } 91 | 92 | async fn update_raft_state( 93 | &self, 94 | raft_node: u64, 95 | raft_state: Option<raft::SoftState>, 96 | ) -> Result<()> { 97 | let mut raft_states = self.raft_states.write(); 98 | raft_states.insert(raft_node, raft_state); 99 | Ok(()) 100 | } 101 | 102 | async fn all_raft_states(&self) -> Result<HashMap<u64, Option<raft::SoftState>>> { 103 | Ok(self.raft_states.read().clone()) 104 | } 105 | 106 | async fn is_raft_leader(&self, raft_node: u64) -> Result<bool> { 107 | let raft_states = self.raft_states.read(); 108 | let is_leader = match raft_states.get(&raft_node) { 109 | None | Some(None) => false, 110 | Some(Some(ss)) => ss.raft_state == raft::StateRole::Leader, 111 | }; 112 | Ok(is_leader) 113 | } 114 | } 115 | -------------------------------------------------------------------------------- /wheel/src/meta/mod.rs: -------------------------------------------------------------------------------- 1 | use std::collections::HashMap; 2 | use std::sync::Arc; 3 | 4 | use async_trait::async_trait; 5 | use runkv_proto::meta::KeyRange; 6 | 7 | use crate::error::Result; 8 | 9 | pub mod mem; 10 | #[allow(dead_code)] 11 | pub mod object; 12 | 13 | #[async_trait] 14 | pub trait MetaStore: Send + Sync + 'static { 15 | async fn add_key_range( 16 | &self, 17 | key_range: KeyRange, 18 | group: u64, 19 | raft_nodes: &[u64], 20 | ) -> Result<()>; 21 | 22 | async fn key_ranges(&self) -> Result<Vec<KeyRange>>; 23 | 24 | async fn in_range(&self, key: &[u8]) -> Result<Option<(KeyRange, u64, Vec<u64>)>>; 25 | 26 | async fn all_in_range(&self, keys: &[&[u8]]) -> Result<Option<(KeyRange, u64, Vec<u64>)>>; 27 | 28 | async fn update_raft_state( 29 | &self, 30 | raft_node: u64, 31 | raft_state: Option<raft::SoftState>, 32 | ) -> Result<()>; 33 | 34 | async fn all_raft_states(&self) -> Result<HashMap<u64, Option<raft::SoftState>>>; 35 | 36 | async fn
is_raft_leader(&self, raft_node: u64) -> Result<bool>; 37 | } 38 | 39 | pub type MetaStoreRef = Arc<dyn MetaStore>; 40 | 41 | fn is_overlap(r1: &KeyRange, r2: &KeyRange) -> bool { 42 | !(r1.start_key > r2.end_key || r1.end_key < r2.start_key) 43 | } 44 | 45 | fn in_range(key: &[u8], range: &KeyRange) -> bool { 46 | key >= &range.start_key[..] && key < &range.end_key[..] 47 | } 48 | -------------------------------------------------------------------------------- /wheel/src/meta/object.rs: -------------------------------------------------------------------------------- 1 | // TODO: Impl me. 2 | // TODO: Necessary? 3 | -------------------------------------------------------------------------------- /wheel/src/trace.rs: -------------------------------------------------------------------------------- 1 | use lazy_static::lazy_static; 2 | use runkv_common::sharded_hash_map::ShardedHashMap; 3 | 4 | lazy_static! { 5 | pub static ref TRACE_CTX: TraceContext = TraceContext::default(); 6 | pub static ref TRACE_RAFT_LATENCY_HISTOGRAM_VEC: prometheus::HistogramVec = 7 | prometheus::register_histogram_vec!( 8 | "trace_raft_latency_histogram_vec", 9 | "trace raft latency histogram vec", 10 | &["op", "node", "group", "raft_node"], 11 | vec![0.001, 0.005, 0.01, 0.02, 0.05, 0.1, 0.2, 0.5, 1.0] 12 | ) 13 | .unwrap(); 14 | } 15 | 16 | #[derive(Default)] 17 | pub struct TraceContext { 18 | pub propose_ts: ShardedHashMap, 19 | } 20 | -------------------------------------------------------------------------------- /wheel/src/worker/heartbeater.rs: -------------------------------------------------------------------------------- 1 | use std::time::Duration; 2 | 3 | use async_trait::async_trait; 4 | use runkv_common::channel_pool::ChannelPool; 5 | use runkv_common::Worker; 6 | use runkv_proto::common::Endpoint; 7 | use runkv_proto::rudder::rudder_service_client::RudderServiceClient; 8 | use runkv_proto::rudder::{ 9 | heartbeat_request, heartbeat_response, HeartbeatRequest, RaftState, WheelHeartbeatRequest, 10 | }; 11 | use runkv_storage::manifest::{ManifestError, VersionManager}; 12 | use tonic::Request; 13 | use tracing::warn; 14 | 15 | use crate::error::{Error, Result}; 16 | use crate::meta::MetaStoreRef; 17 | 18 | pub struct HeartbeaterOptions { 19 | pub node: u64, 20 | pub rudder_node: u64, 21 | 22 | pub meta_store: MetaStoreRef, 23 | pub version_manager: VersionManager, 24 | pub channel_pool: ChannelPool, 25 | pub heartbeat_interval: Duration, 26 | pub endpoint: Endpoint, 27 | } 28 | 29 | /// [`Heartbeater`] is responsible for syncing the local version manager. 30 | pub struct Heartbeater { 31 | node: u64, 32 | rudder_node: u64, 33 | 34 | endpoint: Endpoint, 35 | heartbeat_interval: Duration, 36 | 37 | meta_store: MetaStoreRef, 38 | version_manager: VersionManager, 39 | channel_pool: ChannelPool, 40 | } 41 | 42 | #[async_trait] 43 | impl Worker for Heartbeater { 44 | async fn run(&mut self) -> anyhow::Result<()> { 45 | // TODO: Gracefully kill.
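// Each iteration of the loop below reports one heartbeat to the rudder node, applies any version diffs returned, then sleeps for `heartbeat_interval`; errors are logged rather than propagated so the worker keeps running.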
46 | loop { 47 | match self.run_inner().await { 48 | Ok(_) => {} 49 | Err(e) => warn!("error occur when heartbeater running: {}", e), 50 | } 51 | } 52 | } 53 | } 54 | 55 | impl Heartbeater { 56 | pub fn new(options: HeartbeaterOptions) -> Self { 57 | Self { 58 | node: options.node, 59 | rudder_node: options.rudder_node, 60 | 61 | endpoint: options.endpoint, 62 | heartbeat_interval: options.heartbeat_interval, 63 | 64 | version_manager: options.version_manager, 65 | meta_store: options.meta_store, 66 | channel_pool: options.channel_pool, 67 | } 68 | } 69 | 70 | async fn run_inner(&mut self) -> Result<()> { 71 | let raft_states = self.meta_store.all_raft_states().await?; 72 | let raft_states = raft_states 73 | .into_iter() 74 | .map(|(raft_node, ss)| { 75 | ( 76 | raft_node, 77 | RaftState { 78 | is_leader: match ss { 79 | Some(ss) => ss.raft_state == raft::StateRole::Leader, 80 | None => false, 81 | }, 82 | }, 83 | ) 84 | }) 85 | .collect(); 86 | 87 | let request = Request::new(HeartbeatRequest { 88 | node_id: self.node, 89 | endpoint: Some(self.endpoint.clone()), 90 | heartbeat_message: Some(heartbeat_request::HeartbeatMessage::WheelHeartbeat( 91 | WheelHeartbeatRequest { 92 | watermark: self.version_manager.watermark().await, 93 | next_version_id: self.version_manager.latest_version_id().await + 1, 94 | raft_states, 95 | }, 96 | )), 97 | }); 98 | 99 | let mut client = RudderServiceClient::new( 100 | self.channel_pool 101 | .get(self.rudder_node) 102 | .await 103 | .map_err(Error::err)?, 104 | ); 105 | let rsp = client.heartbeat(request).await?.into_inner(); 106 | 107 | let hb = match rsp.heartbeat_message.unwrap() { 108 | heartbeat_response::HeartbeatMessage::WheelHeartbeat(hb) => hb, 109 | _ => unreachable!(), 110 | }; 111 | for version_diff in hb.version_diffs { 112 | if let Err(runkv_storage::Error::ManifestError(ManifestError::VersionDiffIdNotMatch( 113 | old, 114 | new, 115 | ))) = self.version_manager.update(version_diff, true).await 116 | { 117 | warn!( 118 | "version diff id not match, skip: [old: {}] [new: {}]", 119 | old, new 120 | ); 121 | } 122 | } 123 | tokio::time::sleep(self.heartbeat_interval).await; 124 | Ok(()) 125 | } 126 | } 127 | -------------------------------------------------------------------------------- /wheel/src/worker/mod.rs: -------------------------------------------------------------------------------- 1 | pub mod heartbeater; 2 | pub mod raft; 3 | pub mod sstable_uploader; 4 | --------------------------------------------------------------------------------